Index: user/alc/PQ_LAUNDRY/contrib/bsnmp/lib/snmp.c
===================================================================
--- user/alc/PQ_LAUNDRY/contrib/bsnmp/lib/snmp.c	(revision 307895)
+++ user/alc/PQ_LAUNDRY/contrib/bsnmp/lib/snmp.c	(revision 307896)
@@ -1,1455 +1,1455 @@
 /*
  * Copyright (c) 2001-2003
  *	Fraunhofer Institute for Open Communication Systems (FhG Fokus).
  *	All rights reserved.
  *
  * Author: Harti Brandt <harti@freebsd.org>
  * 
  * Copyright (c) 2010 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by Shteryana Sotirova Shopova
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $Begemot: bsnmp/lib/snmp.c,v 1.40 2005/10/04 14:32:42 brandt_h Exp $
  *
  * SNMP
  */
 #include <sys/types.h>
 #include <sys/socket.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
 #include <stdarg.h>
 #ifdef HAVE_STDINT_H
 #include <stdint.h>
 #elif defined(HAVE_INTTYPES_H)
 #include <inttypes.h>
 #endif
 #include <string.h>
 #include <ctype.h>
 #include <netdb.h>
 #include <errno.h>
 
 #include "asn1.h"
 #include "snmp.h"
 #include "snmppriv.h"
 
 /*
  * Built-in handlers for the two hooks below; only declared here, their
  * definitions are not part of this section of the file.
  */
 static void snmp_error_func(const char *, ...);
 static void snmp_printf_func(const char *, ...);
 
 /*
  * printf-style hooks through which this file reports errors (snmp_error)
  * and writes dump output (snmp_printf). Both start out pointing at the
  * built-in handlers and, being plain global pointers, can be reassigned.
  */
 void (*snmp_error)(const char *, ...) = snmp_error_func;
 void (*snmp_printf)(const char *, ...) = snmp_printf_func;
 
 /*
  * Get the next variable binding from the list.
  * ASN errors on the sequence or the OID are always fatal.
  *
  * b is positioned at the start of a varbind; on success binding receives
  * the OID (var), the syntax and the decoded value.
  *
  * Returns ASN_ERR_FAILED if the varbind header, the OID or the value header
  * cannot be parsed or if memory for an octet string cannot be allocated.
  * Otherwise the result of the value decoder is returned; an unknown value
  * tag is skipped and reported as ASN_ERR_TAG.
  *
  * For SNMP_SYNTAX_OCTETSTRING the octets are allocated with malloc() and
  * stay attached to the binding unless the decoder reports a stopping error.
  *
  * NOTE: on every early error return b->asn_len is left truncated to the
  * varbind length; it is only restored on the final return path.
  */
 static enum asn_err
 get_var_binding(struct asn_buf *b, struct snmp_value *binding)
 {
 	u_char type;
 	asn_len_t len, trailer;
 	enum asn_err err;
 
 	if (asn_get_sequence(b, &len) != ASN_ERR_OK) {
 		snmp_error("cannot parse varbind header");
 		return (ASN_ERR_FAILED);
 	}
 
 	/* Temporarily truncate the length so that the parser does not
 	 * eat up bytes behind the sequence in case the encoding of the
 	 * inner elements is wrong. */
 	trailer = b->asn_len - len;
 	b->asn_len = len;
 
 	if (asn_get_objid(b, &binding->var) != ASN_ERR_OK) {
 		snmp_error("cannot parse binding objid");
 		return (ASN_ERR_FAILED);
 	}
 	/* From here on len is the length of the value, not of the varbind. */
 	if (asn_get_header(b, &type, &len) != ASN_ERR_OK) {
 		snmp_error("cannot parse binding value header");
 		return (ASN_ERR_FAILED);
 	}
 
 	/* Map the ASN.1 tag to the SNMP syntax and decode the raw value. */
 	switch (type) {
 
 	  case ASN_TYPE_NULL:
 		binding->syntax = SNMP_SYNTAX_NULL;
 		err = asn_get_null_raw(b, len);
 		break;
 
 	  case ASN_TYPE_INTEGER:
 		binding->syntax = SNMP_SYNTAX_INTEGER;
 		err = asn_get_integer_raw(b, len, &binding->v.integer);
 		break;
 
 	  case ASN_TYPE_OCTETSTRING:
 		binding->syntax = SNMP_SYNTAX_OCTETSTRING;
 		/*
 		 * NOTE(review): for len == 0 this is malloc(0), which may
 		 * legitimately return NULL on some platforms and would then
 		 * be treated as an allocation failure.
 		 */
 		binding->v.octetstring.octets = malloc(len);
 		if (binding->v.octetstring.octets == NULL) {
 			snmp_error("%s", strerror(errno));
 			return (ASN_ERR_FAILED);
 		}
 		binding->v.octetstring.len = len;
 		err = asn_get_octetstring_raw(b, len,
 		    binding->v.octetstring.octets,
 		    &binding->v.octetstring.len);
 		if (ASN_ERR_STOPPED(err)) {
 			/* Do not leave a dangling pointer in the binding. */
 			free(binding->v.octetstring.octets);
 			binding->v.octetstring.octets = NULL;
 		}
 		break;
 
 	  case ASN_TYPE_OBJID:
 		binding->syntax = SNMP_SYNTAX_OID;
 		err = asn_get_objid_raw(b, len, &binding->v.oid);
 		break;
 
 	  case ASN_CLASS_APPLICATION|ASN_APP_IPADDRESS:
 		binding->syntax = SNMP_SYNTAX_IPADDRESS;
 		err = asn_get_ipaddress_raw(b, len, binding->v.ipaddress);
 		break;
 
 	  case ASN_CLASS_APPLICATION|ASN_APP_TIMETICKS:
 		binding->syntax = SNMP_SYNTAX_TIMETICKS;
 		err = asn_get_uint32_raw(b, len, &binding->v.uint32);
 		break;
 
 	  case ASN_CLASS_APPLICATION|ASN_APP_COUNTER:
 		binding->syntax = SNMP_SYNTAX_COUNTER;
 		err = asn_get_uint32_raw(b, len, &binding->v.uint32);
 		break;
 
 	  case ASN_CLASS_APPLICATION|ASN_APP_GAUGE:
 		binding->syntax = SNMP_SYNTAX_GAUGE;
 		err = asn_get_uint32_raw(b, len, &binding->v.uint32);
 		break;
 
 	  case ASN_CLASS_APPLICATION|ASN_APP_COUNTER64:
 		binding->syntax = SNMP_SYNTAX_COUNTER64;
 		err = asn_get_counter64_raw(b, len, &binding->v.counter64);
 		break;
 
 	  /* The three exception markers carry no value; parsed like NULL. */
 	  case ASN_CLASS_CONTEXT | ASN_EXCEPT_NOSUCHOBJECT:
 		binding->syntax = SNMP_SYNTAX_NOSUCHOBJECT;
 		err = asn_get_null_raw(b, len);
 		break;
 
 	  case ASN_CLASS_CONTEXT | ASN_EXCEPT_NOSUCHINSTANCE:
 		binding->syntax = SNMP_SYNTAX_NOSUCHINSTANCE;
 		err = asn_get_null_raw(b, len);
 		break;
 
 	  case ASN_CLASS_CONTEXT | ASN_EXCEPT_ENDOFMIBVIEW:
 		binding->syntax = SNMP_SYNTAX_ENDOFMIBVIEW;
 		err = asn_get_null_raw(b, len);
 		break;
 
 	  default:
 		/*
 		 * Unknown tag: step over the value. If skipping works the
 		 * result is ASN_ERR_TAG, otherwise the skip error is kept.
 		 * binding->syntax is not assigned on this path.
 		 */
 		if ((err = asn_skip(b, len)) == ASN_ERR_OK)
 			err = ASN_ERR_TAG;
 		snmp_error("bad binding value type 0x%x", type);
 		break;
 	}
 
 	if (ASN_ERR_STOPPED(err)) {
 		snmp_error("cannot parse binding value");
 		return (err);
 	}
 
 	/* Bytes left inside the varbind are reported but otherwise ignored. */
 	if (b->asn_len != 0)
 		snmp_error("ignoring junk at end of binding");
 
 	/* Undo the truncation: expose what follows this varbind again. */
 	b->asn_len = trailer;
 
 	return (err);
 }
 
 /*
  * Parse the different PDUs contents. Any ASN error in the outer components
  * are fatal. Only errors in variable values may be tolerated. If all
  * components can be parsed it returns either ASN_ERR_OK or the first
  * error that was found.
  */
 enum asn_err
 snmp_parse_pdus_hdr(struct asn_buf *b, struct snmp_pdu *pdu, asn_len_t *lenp)
 {
 	if (pdu->type == SNMP_PDU_TRAP) {
 		if (asn_get_objid(b, &pdu->enterprise) != ASN_ERR_OK) {
 			snmp_error("cannot parse trap enterprise");
 			return (ASN_ERR_FAILED);
 		}
 		if (asn_get_ipaddress(b, pdu->agent_addr) != ASN_ERR_OK) {
 			snmp_error("cannot parse trap agent address");
 			return (ASN_ERR_FAILED);
 		}
 		if (asn_get_integer(b, &pdu->generic_trap) != ASN_ERR_OK) {
 			snmp_error("cannot parse 'generic-trap'");
 			return (ASN_ERR_FAILED);
 		}
 		if (asn_get_integer(b, &pdu->specific_trap) != ASN_ERR_OK) {
 			snmp_error("cannot parse 'specific-trap'");
 			return (ASN_ERR_FAILED);
 		}
 		if (asn_get_timeticks(b, &pdu->time_stamp) != ASN_ERR_OK) {
 			snmp_error("cannot parse trap 'time-stamp'");
 			return (ASN_ERR_FAILED);
 		}
 	} else {
 		if (asn_get_integer(b, &pdu->request_id) != ASN_ERR_OK) {
 			snmp_error("cannot parse 'request-id'");
 			return (ASN_ERR_FAILED);
 		}
 		if (asn_get_integer(b, &pdu->error_status) != ASN_ERR_OK) {
 			snmp_error("cannot parse 'error_status'");
 			return (ASN_ERR_FAILED);
 		}
 		if (asn_get_integer(b, &pdu->error_index) != ASN_ERR_OK) {
 			snmp_error("cannot parse 'error_index'");
 			return (ASN_ERR_FAILED);
 		}
 	}
 
 	if (asn_get_sequence(b, lenp) != ASN_ERR_OK) {
 		snmp_error("cannot get varlist header");
 		return (ASN_ERR_FAILED);
 	}
 
 	return (ASN_ERR_OK);
 }
 
 static enum asn_err
 parse_pdus(struct asn_buf *b, struct snmp_pdu *pdu, int32_t *ip)
 {
 	asn_len_t len, trailer;
 	struct snmp_value *v;
 	enum asn_err err, err1;
 
 	err = snmp_parse_pdus_hdr(b, pdu, &len);
 	if (ASN_ERR_STOPPED(err))
 		return (err);
 
 	trailer = b->asn_len - len;
 
 	v = pdu->bindings;
 	err = ASN_ERR_OK;
 	while (b->asn_len != 0) {
 		if (pdu->nbindings == SNMP_MAX_BINDINGS) {
 			snmp_error("too many bindings (> %u) in PDU",
 			    SNMP_MAX_BINDINGS);
 			return (ASN_ERR_FAILED);
 		}
 		err1 = get_var_binding(b, v);
 		if (ASN_ERR_STOPPED(err1))
 			return (ASN_ERR_FAILED);
 		if (err1 != ASN_ERR_OK && err == ASN_ERR_OK) {
 			err = err1;
 			*ip = pdu->nbindings + 1;
 		}
 		pdu->nbindings++;
 		v++;
 	}
 
 	b->asn_len = trailer;
 
 	return (err);
 }
 
 
 /*
  * Decode the security parameters of a message into pdu.
  *
  * The parameters arrive as one octet string in b. That string is copied
  * into a local buffer and parsed from there as a sequence of: engine id,
  * engine boots, engine time, user name, authentication parameter (digest)
  * and salt. Returns ASN_ERR_OK on success and ASN_ERR_FAILED, after
  * reporting through snmp_error, on the first field that cannot be decoded.
  *
  * pdu->flags must already be set: it decides whether the digest and the
  * salt are required to have their full size.
  */
 static enum asn_err
 parse_secparams(struct asn_buf *b, struct snmp_pdu *pdu)
 {
 	asn_len_t octs_len;
 	u_char buf[256]; /* XXX: calc max possible size here */
 	struct asn_buf tb;
 
 	memset(buf, 0, 256);
 	tb.asn_ptr = buf;
 	tb.asn_len = 256;
 	/* In/out for asn_get_octetstring: must start as the capacity of buf. */
-	u_int len;
+	u_int len = 256;
 
 	if (asn_get_octetstring(b, buf, &len) != ASN_ERR_OK) {
 		snmp_error("cannot parse usm header");
 		return (ASN_ERR_FAILED);
 	}
 	/* Restrict the temporary buffer to the bytes actually received. */
 	tb.asn_len = len;
 
 	if (asn_get_sequence(&tb, &octs_len) != ASN_ERR_OK) {
 		snmp_error("cannot decode usm header");
 		return (ASN_ERR_FAILED);
 	}
 
 	/* octs_len is reused below as the in/out length of each string. */
 	octs_len = SNMP_ENGINE_ID_SIZ;
 	if (asn_get_octetstring(&tb, (u_char *)&pdu->engine.engine_id,
 	    &octs_len) != ASN_ERR_OK) {
 		snmp_error("cannot decode msg engine id");
 		return (ASN_ERR_FAILED);
 	}
 	pdu->engine.engine_len = octs_len;
 
 	if (asn_get_integer(&tb, &pdu->engine.engine_boots) != ASN_ERR_OK) {
 		snmp_error("cannot decode msg engine boots");
 		return (ASN_ERR_FAILED);
 	}
 
 	if (asn_get_integer(&tb, &pdu->engine.engine_time) != ASN_ERR_OK) {
 		snmp_error("cannot decode msg engine time");
 		return (ASN_ERR_FAILED);
 	}
 
 	/* One byte is kept back for the terminating NUL written below. */
 	octs_len = SNMP_ADM_STR32_SIZ - 1;
 	if (asn_get_octetstring(&tb, (u_char *)&pdu->user.sec_name, &octs_len)
 	    != ASN_ERR_OK) {
 		snmp_error("cannot decode msg user name");
 		return (ASN_ERR_FAILED);
 	}
 	pdu->user.sec_name[octs_len] = '\0';
 
 	/* With the AUTH flag set the digest must have exactly full size. */
 	octs_len = sizeof(pdu->msg_digest);
 	if (asn_get_octetstring(&tb, (u_char *)&pdu->msg_digest, &octs_len) !=
 	    ASN_ERR_OK || ((pdu->flags & SNMP_MSG_AUTH_FLAG) != 0 &&
 	    octs_len != sizeof(pdu->msg_digest))) {
 		snmp_error("cannot decode msg authentication param");
 		return (ASN_ERR_FAILED);
 	}
 
 	/*
 	 * With the PRIV flag set the salt must have exactly full size.
 	 * NOTE(review): the message below is the same text as for the
 	 * authentication parameter although this field is msg_salt.
 	 */
 	octs_len = sizeof(pdu->msg_salt);
 	if (asn_get_octetstring(&tb, (u_char *)&pdu->msg_salt, &octs_len) !=
 	    ASN_ERR_OK ||((pdu->flags & SNMP_MSG_PRIV_FLAG) != 0 &&
 	    octs_len != sizeof(pdu->msg_salt))) {
 		snmp_error("cannot decode msg authentication param");
 		return (ASN_ERR_FAILED);
 	}
 
 	/*
 	 * Locate the digest inside the original message by stepping back
 	 * from the current position in b: over the digest itself, then over
 	 * the salt (octs_len still holds its length) plus ASN_MAXLENLEN.
 	 * NOTE(review): this presumes a fixed layout of the trailing
 	 * fields -- confirm against the encoder.
 	 */
 	if ((pdu->flags & SNMP_MSG_AUTH_FLAG) != 0) {
 		pdu->digest_ptr = b->asn_ptr - SNMP_USM_AUTH_SIZE;
 		pdu->digest_ptr -= octs_len + ASN_MAXLENLEN;
 	}
 
 	return (ASN_ERR_OK);
 }
 
 /*
  * Encode the security parameters of pdu into b.
  *
  * The parameters (engine id, boots, time, user name, digest, salt) are
  * first encoded as a sequence into a local buffer, which is then written
  * to b as a single octet string. Along the way pdu->digest_ptr is pointed
  * at the place of the digest inside b and, if the PRIV flag is set, a
  * temporary octet string header is opened whose position is stored in
  * pdu->encrypted_ptr.
  *
  * Returns SNMP_CODE_OK, or SNMP_CODE_FAILED if any encoding step fails.
  */
 static enum snmp_code
 pdu_encode_secparams(struct asn_buf *b, struct snmp_pdu *pdu)
 {
 	u_char buf[256], *sptr;
         struct asn_buf tb;
         size_t auth_off, moved = 0;
 
 	auth_off = 0;
 	memset(buf, 0, 256);
 	tb.asn_ptr = buf;
 	tb.asn_len = 256;
 
 	/* Open the sequence; its real length is filled in by the commit. */
 	if (asn_put_temp_header(&tb, (ASN_TYPE_SEQUENCE|ASN_TYPE_CONSTRUCTED),
 	    &sptr) != ASN_ERR_OK)
 		return (SNMP_CODE_FAILED);
 
 	if (asn_put_octetstring(&tb, (u_char *)pdu->engine.engine_id,
 	    pdu->engine.engine_len) != ASN_ERR_OK)
 		return (SNMP_CODE_FAILED);
 
 	if (asn_put_integer(&tb, pdu->engine.engine_boots) != ASN_ERR_OK)
 		return (SNMP_CODE_FAILED);
 
 	if (asn_put_integer(&tb, pdu->engine.engine_time) != ASN_ERR_OK)
 		return (SNMP_CODE_FAILED);
 
 	if (asn_put_octetstring(&tb, (u_char *)pdu->user.sec_name,
 	    strlen(pdu->user.sec_name)) != ASN_ERR_OK)
 		return (SNMP_CODE_FAILED);
 
 	if ((pdu->flags & SNMP_MSG_AUTH_FLAG) != 0) {
 		/*
 		 * Remember where the digest goes: bytes used in buf so far
 		 * plus ASN_MAXLENLEN.
 		 */
 		auth_off = sizeof(buf) - tb.asn_len + ASN_MAXLENLEN;
 		if (asn_put_octetstring(&tb, (u_char *)pdu->msg_digest,
 		    sizeof(pdu->msg_digest)) != ASN_ERR_OK)
 			return (SNMP_CODE_FAILED);
 	} else {
 		/* No authentication: encode an empty octet string. */
 		if (asn_put_octetstring(&tb, (u_char *)pdu->msg_digest, 0)
 		    != ASN_ERR_OK)
 			return (SNMP_CODE_FAILED);
 	}
 
 	if ((pdu->flags & SNMP_MSG_PRIV_FLAG) != 0) {
 		if (asn_put_octetstring(&tb, (u_char *)pdu->msg_salt,
 		    sizeof(pdu->msg_salt)) != ASN_ERR_OK)
 			return (SNMP_CODE_FAILED);
 	} else {
 		/* No privacy: encode an empty octet string. */
 		if (asn_put_octetstring(&tb, (u_char *)pdu->msg_salt, 0)
 		    != ASN_ERR_OK)
 			return (SNMP_CODE_FAILED);
 	}
 
 	/* Close the sequence; moved receives the shift applied by the commit. */
 	if (asn_commit_header(&tb, sptr, &moved) != ASN_ERR_OK)
 		return (SNMP_CODE_FAILED);
 
 	/* Translate the offset within buf to a position within b. */
 	if ((pdu->flags & SNMP_MSG_AUTH_FLAG) != 0)
 		pdu->digest_ptr = b->asn_ptr + auth_off - moved;
 
 	if (asn_put_octetstring(b, buf, sizeof(buf) - tb.asn_len) != ASN_ERR_OK)
 		return (SNMP_CODE_FAILED);
 	/*
 	 * NOTE(review): this adjustment is applied even when the AUTH flag
 	 * is clear, in which case digest_ptr was not assigned above.
 	 */
 	pdu->digest_ptr += ASN_MAXLENLEN;
 
 	/* With privacy the scoped PDU is wrapped in an octet string. */
 	if ((pdu->flags & SNMP_MSG_PRIV_FLAG) != 0 && asn_put_temp_header(b,
 	    ASN_TYPE_OCTETSTRING, &pdu->encrypted_ptr) != ASN_ERR_OK)
 			return (SNMP_CODE_FAILED);
 
 	return (SNMP_CODE_OK);
 }
 
 /*
  * Decode the PDU except for the variable bindings itself.
  * If decoding fails because of a bad binding, but the rest can be
  * decoded, ip points to the index of the failed variable (errors
  * OORANGE, BADLEN or BADVERS).
  */
 enum snmp_code
 snmp_pdu_decode(struct asn_buf *b, struct snmp_pdu *pdu, int32_t *ip)
 {
 	enum snmp_code code;
 
 	if ((code = snmp_pdu_decode_header(b, pdu)) != SNMP_CODE_OK)
 		return (code);
 
 	if (pdu->version == SNMP_V3) {
 		if (pdu->security_model != SNMP_SECMODEL_USM)
 			return (SNMP_CODE_FAILED);
 		if ((code = snmp_pdu_decode_secmode(b, pdu)) != SNMP_CODE_OK)
 			return (code);
 	}
 
 	code = snmp_pdu_decode_scoped(b, pdu, ip);
 
 	switch (code) {
 	  case SNMP_CODE_FAILED:
 		snmp_pdu_free(pdu);
 		break;
 
 	  case SNMP_CODE_BADENC:
 		if (pdu->version == SNMP_Verr)
 			return (SNMP_CODE_BADVERS);
 
 	  default:
 		break;
 	}
 
 	return (code);
 }
 
 /*
  * Decode the message header from b into pdu.
  *
  * The outer sequence and the version are parsed first. For SNMPv1/v2c the
  * community string follows and is stored NUL-terminated in pdu->community.
  * For SNMPv3 the global data (message identifier, maximum message size,
  * flags, security model) and the security parameters are decoded; only the
  * USM security model is accepted.
  *
  * pdu->outer_ptr and pdu->outer_len record the start and length of the
  * buffer as passed in. Bytes behind the outer sequence are reported and
  * cut off by shrinking b->asn_len.
  *
  * Returns SNMP_CODE_OK on success, SNMP_CODE_BADENC for an unsupported
  * version number (pdu->version is then SNMP_Verr) and SNMP_CODE_FAILED for
  * every other problem. All problems are reported through snmp_error.
  */
 enum snmp_code
 snmp_pdu_decode_header(struct asn_buf *b, struct snmp_pdu *pdu)
 {
 	int32_t version;
 	u_int octs_len;
 	asn_len_t len;
 
 	pdu->outer_ptr = b->asn_ptr;
 	pdu->outer_len = b->asn_len;
 
 	if (asn_get_sequence(b, &len) != ASN_ERR_OK) {
 		snmp_error("cannot decode pdu header");
 		return (SNMP_CODE_FAILED);
 	}
 	if (b->asn_len < len) {
 		snmp_error("outer sequence value too short");
 		return (SNMP_CODE_FAILED);
 	}
 	if (b->asn_len != len) {
 		snmp_error("ignoring trailing junk in message");
 		b->asn_len = len;
 	}
 
 	if (asn_get_integer(b, &version) != ASN_ERR_OK) {
 		snmp_error("cannot decode version");
 		return (SNMP_CODE_FAILED);
 	}
 
 	/* Wire values: 0 is SNMPv1, 1 is SNMPv2c, 3 is SNMPv3. */
 	if (version == 0)
 		pdu->version = SNMP_V1;
 	else if (version == 1)
 		pdu->version = SNMP_V2c;
 	else if (version == 3)
 		pdu->version = SNMP_V3;
 	else {
 		pdu->version = SNMP_Verr;
 		snmp_error("unsupported SNMP version");
 		return (SNMP_CODE_BADENC);
 	}
 
 	if (pdu->version == SNMP_V3) {
 		if (asn_get_sequence(b, &len) != ASN_ERR_OK) {
 			snmp_error("cannot decode pdu global data header");
 			return (SNMP_CODE_FAILED);
 		}
 
 		if (asn_get_integer(b, &pdu->identifier) != ASN_ERR_OK) {
 			snmp_error("cannot decode msg identifier");
 			return (SNMP_CODE_FAILED);
 		}
 
 		if (asn_get_integer(b, &pdu->engine.max_msg_size)
 		    != ASN_ERR_OK) {
 			snmp_error("cannot decode msg size");
 			return (SNMP_CODE_FAILED);
 		}
 
 		/* The flags are transported as an octet string of length 1. */
 		octs_len = 1;
 		if (asn_get_octetstring(b, (u_char *)&pdu->flags,
 		    &octs_len) != ASN_ERR_OK) {
 			snmp_error("cannot decode msg flags");
 			return (SNMP_CODE_FAILED);
 		}
 
 		if (asn_get_integer(b, &pdu->security_model) != ASN_ERR_OK) {
 			snmp_error("cannot decode msg security model");
 			return (SNMP_CODE_FAILED);
 		}
 
 		if (pdu->security_model != SNMP_SECMODEL_USM)
 			return (SNMP_CODE_FAILED);
 
 		if (parse_secparams(b, pdu) != ASN_ERR_OK)
 			return (SNMP_CODE_FAILED);
 	} else {
 		/* octs_len is in/out: capacity going in, actual length out. */
 		octs_len = SNMP_COMMUNITY_MAXLEN;
 		if (asn_get_octetstring(b, (u_char *)pdu->community,
 		    &octs_len) != ASN_ERR_OK) {
 			snmp_error("cannot decode community");
 			return (SNMP_CODE_FAILED);
 		}
 		pdu->community[octs_len] = '\0';
 	}
 
 	return (SNMP_CODE_OK);
 }
 
 enum snmp_code
 snmp_pdu_decode_scoped(struct asn_buf *b, struct snmp_pdu *pdu, int32_t *ip)
 {
 	u_char type;
 	asn_len_t len, trailer;
 	enum asn_err err;
 
 	if (pdu->version == SNMP_V3) {
 		if (asn_get_sequence(b, &len) != ASN_ERR_OK) {
 			snmp_error("cannot decode scoped pdu header");
 			return (SNMP_CODE_FAILED);
 		}
 
 		len = SNMP_ENGINE_ID_SIZ;
 		if (asn_get_octetstring(b, (u_char *)&pdu->context_engine,
 		    &len) != ASN_ERR_OK) {
 			snmp_error("cannot decode msg context engine");
 			return (SNMP_CODE_FAILED);
 		}
 		pdu->context_engine_len = len;
 
 		len = SNMP_CONTEXT_NAME_SIZ;
 		if (asn_get_octetstring(b, (u_char *)&pdu->context_name,
 		    &len) != ASN_ERR_OK) {
 			snmp_error("cannot decode msg context name");
 			return (SNMP_CODE_FAILED);
 		}
 		pdu->context_name[len] = '\0';
 	}
 
 	if (asn_get_header(b, &type, &len) != ASN_ERR_OK) {
 		snmp_error("cannot get pdu header");
 		return (SNMP_CODE_FAILED);
 	}
 	if ((type & ~ASN_TYPE_MASK) !=
 	    (ASN_TYPE_CONSTRUCTED | ASN_CLASS_CONTEXT)) {
 		snmp_error("bad pdu header tag");
 		return (SNMP_CODE_FAILED);
 	}
 	pdu->type = type & ASN_TYPE_MASK;
 
 	switch (pdu->type) {
 
 	  case SNMP_PDU_GET:
 	  case SNMP_PDU_GETNEXT:
 	  case SNMP_PDU_RESPONSE:
 	  case SNMP_PDU_SET:
 		break;
 
 	  case SNMP_PDU_TRAP:
 		if (pdu->version != SNMP_V1) {
 			snmp_error("bad pdu type %u", pdu->type);
 			return (SNMP_CODE_FAILED);
 		}
 		break;
 
 	  case SNMP_PDU_GETBULK:
 	  case SNMP_PDU_INFORM:
 	  case SNMP_PDU_TRAP2:
 	  case SNMP_PDU_REPORT:
 		if (pdu->version == SNMP_V1) {
 			snmp_error("bad pdu type %u", pdu->type);
 			return (SNMP_CODE_FAILED);
 		}
 		break;
 
 	  default:
 		snmp_error("bad pdu type %u", pdu->type);
 		return (SNMP_CODE_FAILED);
 	}
 
 	trailer = b->asn_len - len;
 	b->asn_len = len;
 
 	err = parse_pdus(b, pdu, ip);
 	if (ASN_ERR_STOPPED(err))
 		return (SNMP_CODE_FAILED);
 
 	if (b->asn_len != 0)
 		snmp_error("ignoring trailing junk after pdu");
 
 	b->asn_len = trailer;
 
 	return (SNMP_CODE_OK);
 }
 
 /*
  * Apply the security processing to a received message: verify the digest
  * and decrypt the scoped PDU according to the protocols configured in
  * pdu->user.
  *
  * Returns SNMP_CODE_BADSECLEVEL if the user requires authentication or
  * privacy but the corresponding message flag is missing,
  * SNMP_CODE_BADDIGEST if the computed digest differs from the received
  * one, SNMP_CODE_FAILED if the digest computation, the header of the
  * encrypted PDU or the decryption fails, and SNMP_CODE_OK otherwise.
  * pdu->scoped_ptr is set to the current position in b.
  */
 enum snmp_code
 snmp_pdu_decode_secmode(struct asn_buf *b, struct snmp_pdu *pdu)
 {
 	uint8_t	calc[SNMP_USM_AUTH_SIZE];
 	u_char tag;
 
 	if (pdu->user.auth_proto != SNMP_AUTH_NOAUTH &&
 	    (pdu->flags & SNMP_MSG_AUTH_FLAG) == 0)
 		return (SNMP_CODE_BADSECLEVEL);
 
 	if (snmp_pdu_calc_digest(pdu, calc) != SNMP_CODE_OK)
 		return (SNMP_CODE_FAILED);
 
 	if (pdu->user.auth_proto != SNMP_AUTH_NOAUTH) {
 		if (memcmp(calc, pdu->msg_digest,
 		    sizeof(pdu->msg_digest)) != 0)
 			return (SNMP_CODE_BADDIGEST);
 	}
 
 	/* With privacy the scoped PDU is wrapped in an octet string. */
 	if (pdu->user.priv_proto != SNMP_PRIV_NOPRIV) {
 		if (asn_get_header(b, &tag, &pdu->scoped_len) != ASN_ERR_OK ||
 		    tag != ASN_TYPE_OCTETSTRING) {
 			snmp_error("cannot decode encrypted pdu");
 			return (SNMP_CODE_FAILED);
 		}
 	}
 	pdu->scoped_ptr = b->asn_ptr;
 
 	if (pdu->user.priv_proto != SNMP_PRIV_NOPRIV &&
 	    (pdu->flags & SNMP_MSG_PRIV_FLAG) == 0)
 		return (SNMP_CODE_BADSECLEVEL);
 
 	if (snmp_pdu_decrypt(pdu) != SNMP_CODE_OK)
 		return (SNMP_CODE_FAILED);
 
 	return (SNMP_CODE_OK);
 }
 
 /*
  * Check whether what we have is the complete PDU by snooping at the
  * enclosing structure header. This returns:
  *   -1		if there are ASN.1 errors
  *    0		if we need more data
  *  > 0		the length of this PDU
  */
 int
 snmp_pdu_snoop(const struct asn_buf *b0)
 {
 	u_int length;
 	asn_len_t len;
 	struct asn_buf b = *b0;
 
 	/* <0x10|0x20> <len> <data...> */
 	
 	if (b.asn_len == 0)
 		return (0);
 	if (b.asn_cptr[0] != (ASN_TYPE_SEQUENCE | ASN_TYPE_CONSTRUCTED)) {
 		asn_error(&b, "bad sequence type %u", b.asn_cptr[0]);
 		return (-1);
 	}
 	b.asn_len--;
 	b.asn_cptr++;
 
 	if (b.asn_len == 0)
 		return (0);
 
 	if (*b.asn_cptr & 0x80) {
 		/* long length */
 		length = *b.asn_cptr++ & 0x7f;
 		b.asn_len--;
 		if (length == 0) {
 			asn_error(&b, "indefinite length not supported");
 			return (-1);
 		}
 		if (length > ASN_MAXLENLEN) {
 			asn_error(&b, "long length too long (%u)", length);
 			return (-1);
 		}
 		if (length > b.asn_len)
 			return (0);
 		len = 0;
 		while (length--) {
 			len = (len << 8) | *b.asn_cptr++;
 			b.asn_len--;
 		}
 	} else {
 		len = *b.asn_cptr++;
 		b.asn_len--;
 	}
 
 	if (len > b.asn_len)
 		return (0);
 
 	return (len + b.asn_cptr - b0->asn_cptr);
 }
 
 /*
  * Encode the SNMP PDU without the variable bindings field.
  * We do this the rather inefficient way by
  * moving things around and assuming that the length field will never
  * use more than 2 bytes.
  * We need a number of pointers to apply the fixes afterwards.
  *
  * Every enclosing structure is opened with a temporary header whose
  * position is stored in pdu (outer_ptr, scoped_ptr, pdu_ptr, vars_ptr) so
  * that the real lengths can be committed once the contents are known.
  * The binding list header is left open on return.
  *
  * Returns SNMP_CODE_OK, SNMP_CODE_BADVERS for an unknown pdu->version and
  * SNMP_CODE_FAILED if an encoding step fails or the PDU type is not
  * allowed for the version.
  */
 enum snmp_code
 snmp_pdu_encode_header(struct asn_buf *b, struct snmp_pdu *pdu)
 {
 	enum asn_err err;
 	u_char *v3_hdr_ptr;
 
 	if (asn_put_temp_header(b, (ASN_TYPE_SEQUENCE|ASN_TYPE_CONSTRUCTED),
 	    &pdu->outer_ptr) != ASN_ERR_OK)
 		return (SNMP_CODE_FAILED);
 
 	/* Wire values: 0 is SNMPv1, 1 is SNMPv2c, 3 is SNMPv3. */
 	if (pdu->version == SNMP_V1)
 		err = asn_put_integer(b, 0);
 	else if (pdu->version == SNMP_V2c)
 		err = asn_put_integer(b, 1);
 	else if (pdu->version == SNMP_V3)
 		err = asn_put_integer(b, 3);
 	else
 		return (SNMP_CODE_BADVERS);
 	if (err != ASN_ERR_OK)
 		return (SNMP_CODE_FAILED);
 
 	if (pdu->version == SNMP_V3) {
 		/* Global data: identifier, max size, flags, security model. */
 		if (asn_put_temp_header(b, (ASN_TYPE_SEQUENCE |
 		    ASN_TYPE_CONSTRUCTED), &v3_hdr_ptr) != ASN_ERR_OK)
 			return (SNMP_CODE_FAILED);
 	
 		if (asn_put_integer(b, pdu->identifier) != ASN_ERR_OK)
 			return (SNMP_CODE_FAILED);
 
 		if (asn_put_integer(b, pdu->engine.max_msg_size) != ASN_ERR_OK)
 			return (SNMP_CODE_FAILED);
 
 		/*
 		 * The report flag is set for every PDU type other than
 		 * RESPONSE, TRAP, TRAP2 and REPORT. Note that this modifies
 		 * pdu->flags of the caller.
 		 */
 		if (pdu->type != SNMP_PDU_RESPONSE &&
 		    pdu->type != SNMP_PDU_TRAP &&
 		    pdu->type != SNMP_PDU_TRAP2 &&
 		    pdu->type != SNMP_PDU_REPORT)
 			pdu->flags |= SNMP_MSG_REPORT_FLAG;
 
 		if (asn_put_octetstring(b, (u_char *)&pdu->flags, 1)
 		    != ASN_ERR_OK)
 			return (SNMP_CODE_FAILED);
 
 		if (asn_put_integer(b, pdu->security_model) != ASN_ERR_OK)
 			return (SNMP_CODE_FAILED);
 
 		if (asn_commit_header(b, v3_hdr_ptr, NULL) != ASN_ERR_OK)
 			return (SNMP_CODE_FAILED);
 
 		/* Only the USM security model can be encoded. */
 		if (pdu->security_model != SNMP_SECMODEL_USM)
 			return (SNMP_CODE_FAILED);
 
 		if (pdu_encode_secparams(b, pdu) != SNMP_CODE_OK)
 			return (SNMP_CODE_FAILED);
 
 		/*  View-based Access Control information */
 		if (asn_put_temp_header(b, (ASN_TYPE_SEQUENCE |
 		    ASN_TYPE_CONSTRUCTED), &pdu->scoped_ptr) != ASN_ERR_OK)
 			return (SNMP_CODE_FAILED);
 
 		if (asn_put_octetstring(b, (u_char *)pdu->context_engine,
 		    pdu->context_engine_len) != ASN_ERR_OK)
 			return (SNMP_CODE_FAILED);
 
 		if (asn_put_octetstring(b, (u_char *)pdu->context_name,
 		    strlen(pdu->context_name)) != ASN_ERR_OK)
 			return (SNMP_CODE_FAILED);
 	} else {
 		/* v1 and v2c carry only the community string here. */
 		if (asn_put_octetstring(b, (u_char *)pdu->community,
 		    strlen(pdu->community)) != ASN_ERR_OK)
 			return (SNMP_CODE_FAILED);
 	}
 
 	/* The PDU itself: a context-class constructed tag holding the type. */
 	if (asn_put_temp_header(b, (ASN_TYPE_CONSTRUCTED | ASN_CLASS_CONTEXT |
 	    pdu->type), &pdu->pdu_ptr) != ASN_ERR_OK)
 		return (SNMP_CODE_FAILED);
 
 	if (pdu->type == SNMP_PDU_TRAP) {
 		/* v1 traps exist in SNMPv1 only and have their own fields. */
 		if (pdu->version != SNMP_V1 ||
 		    asn_put_objid(b, &pdu->enterprise) != ASN_ERR_OK ||
 		    asn_put_ipaddress(b, pdu->agent_addr) != ASN_ERR_OK ||
 		    asn_put_integer(b, pdu->generic_trap) != ASN_ERR_OK ||
 		    asn_put_integer(b, pdu->specific_trap) != ASN_ERR_OK ||
 		    asn_put_timeticks(b, pdu->time_stamp) != ASN_ERR_OK)
 			return (SNMP_CODE_FAILED);
 	} else {
 		/* These PDU types are rejected for SNMPv1. */
 		if (pdu->version == SNMP_V1 && (pdu->type == SNMP_PDU_GETBULK ||
 		    pdu->type == SNMP_PDU_INFORM ||
 		    pdu->type == SNMP_PDU_TRAP2 ||
 		    pdu->type == SNMP_PDU_REPORT))
 			return (SNMP_CODE_FAILED);
 
 		if (asn_put_integer(b, pdu->request_id) != ASN_ERR_OK ||
 		    asn_put_integer(b, pdu->error_status) != ASN_ERR_OK ||
 		    asn_put_integer(b, pdu->error_index) != ASN_ERR_OK)
 			return (SNMP_CODE_FAILED);
 	}
 
 	/* Open the binding list; the bindings are appended by the caller. */
 	if (asn_put_temp_header(b, (ASN_TYPE_SEQUENCE|ASN_TYPE_CONSTRUCTED),
 	    &pdu->vars_ptr) != ASN_ERR_OK)
 		return (SNMP_CODE_FAILED);
 
 	return (SNMP_CODE_OK);
 }
 
 /*
  * Pad the scoped PDU for DES: if the privacy protocol is SNMP_PRIV_DES and
  * pdu->scoped_len is not a multiple of 8, append the missing bytes with
  * asn_pad() and add them to pdu->scoped_len. Nothing is done otherwise.
  * Returns ASN_ERR_FAILED if padding fails, else ASN_ERR_OK.
  */
 static enum asn_err
 snmp_pdu_fix_padd(struct asn_buf *b, struct snmp_pdu *pdu)
 {
 	asn_len_t fill;
 
 	if (pdu->user.priv_proto != SNMP_PRIV_DES)
 		return (ASN_ERR_OK);
 	if (pdu->scoped_len % 8 == 0)
 		return (ASN_ERR_OK);
 
 	fill = 8 - (pdu->scoped_len % 8);
 	if (asn_pad(b, fill) != ASN_ERR_OK)
 		return (ASN_ERR_FAILED);
 	pdu->scoped_len += fill;
 
 	return (ASN_ERR_OK);
 }
 
 /*
  * Finish an encoded message: commit the temporary headers from the inside
  * out (binding list, PDU, for SNMPv3 also the scoped PDU and the encrypted
  * wrapper, finally the outer sequence). For SNMPv3 the scoped PDU is
  * padded and encrypted in between and at the end the digest is computed
  * and, if the AUTH flag is set, copied to pdu->digest_ptr.
  *
  * pdu->scoped_len (v3) and pdu->outer_len are set on the way and
  * pdu->digest_ptr is moved back by the shift caused by the outer commit.
  *
  * Returns SNMP_CODE_OK, or SNMP_CODE_FAILED if any step fails or a v3
  * message uses a security model other than USM.
  */
 enum snmp_code
 snmp_fix_encoding(struct asn_buf *b, struct snmp_pdu *pdu)
 {
 	size_t shift = 0;
 
 	if (asn_commit_header(b, pdu->vars_ptr, NULL) != ASN_ERR_OK)
 		return (SNMP_CODE_FAILED);
 	if (asn_commit_header(b, pdu->pdu_ptr, NULL) != ASN_ERR_OK)
 		return (SNMP_CODE_FAILED);
 
 	if (pdu->version == SNMP_V3) {
 		if (asn_commit_header(b, pdu->scoped_ptr, NULL) != ASN_ERR_OK)
 			return (SNMP_CODE_FAILED);
 
 		pdu->scoped_len = b->asn_ptr - pdu->scoped_ptr;
 		if (snmp_pdu_fix_padd(b, pdu) != ASN_ERR_OK)
 			return (SNMP_CODE_FAILED);
 
 		if (pdu->security_model != SNMP_SECMODEL_USM)
 			return (SNMP_CODE_FAILED);
 
 		if (snmp_pdu_encrypt(pdu) != SNMP_CODE_OK)
 			return (SNMP_CODE_FAILED);
 
 		/* The octet string wrapper exists only with privacy. */
 		if (pdu->user.priv_proto != SNMP_PRIV_NOPRIV) {
 			if (asn_commit_header(b, pdu->encrypted_ptr, NULL) !=
 			    ASN_ERR_OK)
 				return (SNMP_CODE_FAILED);
 		}
 	}
 
 	if (asn_commit_header(b, pdu->outer_ptr, &shift) != ASN_ERR_OK)
 		return (SNMP_CODE_FAILED);
 
 	pdu->outer_len = b->asn_ptr - pdu->outer_ptr;
 	pdu->digest_ptr -= shift;
 
 	if (pdu->version != SNMP_V3)
 		return (SNMP_CODE_OK);
 
 	if (snmp_pdu_calc_digest(pdu, pdu->msg_digest) != SNMP_CODE_OK)
 		return (SNMP_CODE_FAILED);
 
 	if ((pdu->flags & SNMP_MSG_AUTH_FLAG) != 0)
 		memcpy(pdu->digest_ptr, pdu->msg_digest,
 		    sizeof(pdu->msg_digest));
 
 	return (SNMP_CODE_OK);
 }
 
 /*
  * Encode a binding. Caller must ensure, that the syntax is ok for that version.
  * Be sure not to cobber b, when something fails.
  */
 enum asn_err
 snmp_binding_encode(struct asn_buf *b, const struct snmp_value *binding)
 {
 	u_char *ptr;
 	enum asn_err err;
 	struct asn_buf save = *b;
 
 	if ((err = asn_put_temp_header(b, (ASN_TYPE_SEQUENCE |
 	    ASN_TYPE_CONSTRUCTED), &ptr)) != ASN_ERR_OK) {
 		*b = save;
 		return (err);
 	}
 
 	if ((err = asn_put_objid(b, &binding->var)) != ASN_ERR_OK) {
 		*b = save;
 		return (err);
 	}
 
 	switch (binding->syntax) {
 
 	  case SNMP_SYNTAX_NULL:
 		err = asn_put_null(b);
 		break;
 
 	  case SNMP_SYNTAX_INTEGER:
 		err = asn_put_integer(b, binding->v.integer);
 		break;
 
 	  case SNMP_SYNTAX_OCTETSTRING:
 		err = asn_put_octetstring(b, binding->v.octetstring.octets,
 		    binding->v.octetstring.len);
 		break;
 
 	  case SNMP_SYNTAX_OID:
 		err = asn_put_objid(b, &binding->v.oid);
 		break;
 
 	  case SNMP_SYNTAX_IPADDRESS:
 		err = asn_put_ipaddress(b, binding->v.ipaddress);
 		break;
 
 	  case SNMP_SYNTAX_TIMETICKS:
 		err = asn_put_uint32(b, ASN_APP_TIMETICKS, binding->v.uint32);
 		break;
 
 	  case SNMP_SYNTAX_COUNTER:
 		err = asn_put_uint32(b, ASN_APP_COUNTER, binding->v.uint32);
 		break;
 
 	  case SNMP_SYNTAX_GAUGE:
 		err = asn_put_uint32(b, ASN_APP_GAUGE, binding->v.uint32);
 		break;
 
 	  case SNMP_SYNTAX_COUNTER64:
 		err = asn_put_counter64(b, binding->v.counter64);
 		break;
 
 	  case SNMP_SYNTAX_NOSUCHOBJECT:
 		err = asn_put_exception(b, ASN_EXCEPT_NOSUCHOBJECT);
 		break;
 
 	  case SNMP_SYNTAX_NOSUCHINSTANCE:
 		err = asn_put_exception(b, ASN_EXCEPT_NOSUCHINSTANCE);
 		break;
 
 	  case SNMP_SYNTAX_ENDOFMIBVIEW:
 		err = asn_put_exception(b, ASN_EXCEPT_ENDOFMIBVIEW);
 		break;
 	}
 
 	if (err != ASN_ERR_OK) {
 		*b = save;
 		return (err);
 	}
 
 	err = asn_commit_header(b, ptr, NULL);
 	if (err != ASN_ERR_OK) {
 		*b = save;
 		return (err);
 	}
 
 	return (ASN_ERR_OK);
 }
 
 /*
  * Encode an PDU.
  */
 enum snmp_code
 snmp_pdu_encode(struct snmp_pdu *pdu, struct asn_buf *resp_b)
 {
 	u_int idx;
 	enum snmp_code err;
 
 	if ((err = snmp_pdu_encode_header(resp_b, pdu)) != SNMP_CODE_OK)
 		return (err);
 	for (idx = 0; idx < pdu->nbindings; idx++)
 		if (snmp_binding_encode(resp_b, &pdu->bindings[idx])
 		    != ASN_ERR_OK)
 			return (SNMP_CODE_FAILED);
 
 	return (snmp_fix_encoding(resp_b, pdu));
 }
 
 static void
 dump_binding(const struct snmp_value *b)
 {
 	u_int i;
 	char buf[ASN_OIDSTRLEN];
 
 	snmp_printf("%s=", asn_oid2str_r(&b->var, buf));
 	switch (b->syntax) {
 
 	  case SNMP_SYNTAX_NULL:
 		snmp_printf("NULL");
 		break;
 
 	  case SNMP_SYNTAX_INTEGER:
 		snmp_printf("INTEGER %d", b->v.integer);
 		break;
 
 	  case SNMP_SYNTAX_OCTETSTRING:
 		snmp_printf("OCTET STRING %lu:", b->v.octetstring.len);
 		for (i = 0; i < b->v.octetstring.len; i++)
 			snmp_printf(" %02x", b->v.octetstring.octets[i]);
 		break;
 
 	  case SNMP_SYNTAX_OID:
 		snmp_printf("OID %s", asn_oid2str_r(&b->v.oid, buf));
 		break;
 
 	  case SNMP_SYNTAX_IPADDRESS:
 		snmp_printf("IPADDRESS %u.%u.%u.%u", b->v.ipaddress[0],
 		    b->v.ipaddress[1], b->v.ipaddress[2], b->v.ipaddress[3]);
 		break;
 
 	  case SNMP_SYNTAX_COUNTER:
 		snmp_printf("COUNTER %u", b->v.uint32);
 		break;
 
 	  case SNMP_SYNTAX_GAUGE:
 		snmp_printf("GAUGE %u", b->v.uint32);
 		break;
 
 	  case SNMP_SYNTAX_TIMETICKS:
 		snmp_printf("TIMETICKS %u", b->v.uint32);
 		break;
 
 	  case SNMP_SYNTAX_COUNTER64:
 		snmp_printf("COUNTER64 %lld", b->v.counter64);
 		break;
 
 	  case SNMP_SYNTAX_NOSUCHOBJECT:
 		snmp_printf("NoSuchObject");
 		break;
 
 	  case SNMP_SYNTAX_NOSUCHINSTANCE:
 		snmp_printf("NoSuchInstance");
 		break;
 
 	  case SNMP_SYNTAX_ENDOFMIBVIEW:
 		snmp_printf("EndOfMibView");
 		break;
 
 	  default:
 		snmp_printf("UNKNOWN SYNTAX %u", b->syntax);
 		break;
 	}
 }
 
 /*
  * Print all variable bindings of pdu, one per line, each prefixed with
  * its zero-based index in brackets.
  */
 static __inline void
 dump_bindings(const struct snmp_pdu *pdu)
 {
 	u_int idx = 0;
 
 	while (idx < pdu->nbindings) {
 		snmp_printf(" [%u]: ", idx);
 		dump_binding(&pdu->bindings[idx]);
 		snmp_printf("\n");
 		idx++;
 	}
 }
 
 /*
  * Print the body of a PDU that is not an SNMPv1 trap: request id, error
  * status and error index on the current line, then the variable bindings.
  */
 static __inline void
 dump_notrap(const struct snmp_pdu *pdu)
 {
 	snmp_printf(" request_id=%d", pdu->request_id);
 	snmp_printf(" error_status=%d", pdu->error_status);
 	/* The newline here ends the header line started by the caller. */
 	snmp_printf(" error_index=%d\n", pdu->error_index);
 	dump_bindings(pdu);
 }
 
 /*
  * Print a PDU in readable form through snmp_printf: a line with PDU type,
  * version, community and the type specific header fields, followed by the
  * variable bindings. For an unknown PDU type only a "bad pdu type" line
  * is printed.
  */
 void
 snmp_pdu_dump(const struct snmp_pdu *pdu)
 {
 	static const char *types[] = {
 		[SNMP_PDU_GET] =	"GET",
 		[SNMP_PDU_GETNEXT] =	"GETNEXT",
 		[SNMP_PDU_RESPONSE] =	"RESPONSE",
 		[SNMP_PDU_SET] =	"SET",
 		[SNMP_PDU_TRAP] =	"TRAPv1",
 		[SNMP_PDU_GETBULK] =	"GETBULK",
 		[SNMP_PDU_INFORM] =	"INFORM",
 		[SNMP_PDU_TRAP2] =	"TRAPv2",
 		[SNMP_PDU_REPORT] =	"REPORT",
 	};
 	char oidbuf[ASN_OIDSTRLEN];
 	const char *vers;
 
 	switch (pdu->version) {
 	case SNMP_V1:
 		vers = "SNMPv1";
 		break;
 	case SNMP_V2c:
 		vers = "SNMPv2c";
 		break;
 	case SNMP_V3:
 		vers = "SNMPv3";
 		break;
 	default:
 		vers = "v?";
 		break;
 	}
 
 	/* Reject anything that has no entry in the name table above. */
 	switch (pdu->type) {
 	case SNMP_PDU_TRAP:
 	case SNMP_PDU_GET:
 	case SNMP_PDU_GETNEXT:
 	case SNMP_PDU_RESPONSE:
 	case SNMP_PDU_SET:
 	case SNMP_PDU_GETBULK:
 	case SNMP_PDU_INFORM:
 	case SNMP_PDU_TRAP2:
 	case SNMP_PDU_REPORT:
 		break;
 	default:
 		snmp_printf("bad pdu type %u\n", pdu->type);
 		return;
 	}
 
 	snmp_printf("%s %s '%s'", types[pdu->type], vers, pdu->community);
 
 	if (pdu->type != SNMP_PDU_TRAP) {
 		dump_notrap(pdu);
 		return;
 	}
 
 	/* SNMPv1 trap: its own set of header fields, then the bindings. */
 	snmp_printf(" enterprise=%s", asn_oid2str_r(&pdu->enterprise, oidbuf));
 	snmp_printf(" agent_addr=%u.%u.%u.%u", pdu->agent_addr[0],
 	    pdu->agent_addr[1], pdu->agent_addr[2], pdu->agent_addr[3]);
 	snmp_printf(" generic_trap=%d", pdu->generic_trap);
 	snmp_printf(" specific_trap=%d", pdu->specific_trap);
 	snmp_printf(" time-stamp=%u\n", pdu->time_stamp);
 	dump_bindings(pdu);
 }
 
 /*
  * Release the memory owned by a value. Only octet strings own memory
  * (their octets); afterwards the syntax is reset to SNMP_SYNTAX_NULL so
  * that calling this again on the same value frees nothing.
  */
 void
 snmp_value_free(struct snmp_value *value)
 {
 	switch (value->syntax) {
 	case SNMP_SYNTAX_OCTETSTRING:
 		free(value->v.octetstring.octets);
 		break;
 	default:
 		break;
 	}
 	value->syntax = SNMP_SYNTAX_NULL;
 }
 
 /*
  * Copy a value including its OID and syntax. Octet strings get their own
  * malloc()ed copy of the octets (NULL for an empty string); all other
  * syntaxes are copied by plain assignment.
  * Returns 0 on success and -1 if the allocation fails; in that case the
  * OID, syntax and length of 'to' have already been overwritten and its
  * octets pointer is NULL.
  */
 int
 snmp_value_copy(struct snmp_value *to, const struct snmp_value *from)
 {
 	to->var = from->var;
 	to->syntax = from->syntax;
 
 	if (from->syntax != SNMP_SYNTAX_OCTETSTRING) {
 		to->v = from->v;
 		return (0);
 	}
 
 	to->v.octetstring.len = from->v.octetstring.len;
 	if (to->v.octetstring.len == 0) {
 		to->v.octetstring.octets = NULL;
 		return (0);
 	}
 
 	to->v.octetstring.octets = malloc(to->v.octetstring.len);
 	if (to->v.octetstring.octets == NULL)
 		return (-1);
 	(void)memcpy(to->v.octetstring.octets, from->v.octetstring.octets,
 	    to->v.octetstring.len);
 
 	return (0);
 }
 
 void
 snmp_pdu_init_secparams(struct snmp_pdu *pdu)
 {
 	int32_t rval;
 
 	if (pdu->user.auth_proto != SNMP_AUTH_NOAUTH)
 		pdu->flags |= SNMP_MSG_AUTH_FLAG;
 
 	switch (pdu->user.priv_proto) {
 	case SNMP_PRIV_DES:
 		memcpy(pdu->msg_salt, &pdu->engine.engine_boots,
 		    sizeof(pdu->engine.engine_boots));
 		rval = random();
 		memcpy(pdu->msg_salt + sizeof(pdu->engine.engine_boots), &rval,
 		    sizeof(int32_t));
 		pdu->flags |= SNMP_MSG_PRIV_FLAG;
 		break;
 	case SNMP_PRIV_AES:
 		rval = random();
 		memcpy(pdu->msg_salt, &rval, sizeof(int32_t));
 		rval = random();
 		memcpy(pdu->msg_salt + sizeof(int32_t), &rval, sizeof(int32_t));
 		pdu->flags |= SNMP_MSG_PRIV_FLAG;
 		break;
 	default:
 		break;
 	}
 }
 
 void
 snmp_pdu_free(struct snmp_pdu *pdu)
 {
 	u_int i;
 
 	for (i = 0; i < pdu->nbindings; i++)
 		snmp_value_free(&pdu->bindings[i]);
 }
 
 /*
  * Parse an ASCII SNMP value into the binary form
  */
 int
 snmp_value_parse(const char *str, enum snmp_syntax syntax, union snmp_values *v)
 {
 	char *end;
 
 	switch (syntax) {
 
 	  case SNMP_SYNTAX_NULL:
 	  case SNMP_SYNTAX_NOSUCHOBJECT:
 	  case SNMP_SYNTAX_NOSUCHINSTANCE:
 	  case SNMP_SYNTAX_ENDOFMIBVIEW:
 		if (*str != '\0')
 			return (-1);
 		return (0);
 
 	  case SNMP_SYNTAX_INTEGER:
 		v->integer = strtoll(str, &end, 0);
 		if (*end != '\0')
 			return (-1);
 		return (0);
 
 	  case SNMP_SYNTAX_OCTETSTRING:
 	    {
 		u_long len;	/* actual length of string */
 		u_long alloc;	/* allocate length of string */
 		u_char *octs;	/* actual octets */
 		u_long oct;	/* actual octet */
 		u_char *nocts;	/* to avoid memory leak */
 		u_char c;	/* actual character */
 
 # define STUFFC(C)							\
 		if (alloc == len) {					\
 			alloc += 100;					\
 			if ((nocts = realloc(octs, alloc)) == NULL) {	\
 				free(octs);				\
 				return (-1);				\
 			}						\
 			octs = nocts;					\
 		}							\
 		octs[len++] = (C);
 
 		len = alloc = 0;
 		octs = NULL;
 
 		if (*str == '"') {
 			str++;
 			while((c = *str++) != '\0') {
 				if (c == '"') {
 					if (*str != '\0') {
 						free(octs);
 						return (-1);
 					}
 					break;
 				}
 				if (c == '\\') {
 					switch (c = *str++) {
 
 					  case '\\':
 						break;
 					  case 'a':
 						c = '\a';
 						break;
 					  case 'b':
 						c = '\b';
 						break;
 					  case 'f':
 						c = '\f';
 						break;
 					  case 'n':
 						c = '\n';
 						break;
 					  case 'r':
 						c = '\r';
 						break;
 					  case 't':
 						c = '\t';
 						break;
 					  case 'v':
 						c = '\v';
 						break;
 					  case 'x':
 						c = 0;
 						if (!isxdigit(*str))
 							break;
 						if (isdigit(*str))
 							c = *str++ - '0';
 						else if (isupper(*str))
 							c = *str++ - 'A' + 10;
 						else
 							c = *str++ - 'a' + 10;
 						if (!isxdigit(*str))
 							break;
 						if (isdigit(*str))
 							c += *str++ - '0';
 						else if (isupper(*str))
 							c += *str++ - 'A' + 10;
 						else
 							c += *str++ - 'a' + 10;
 						break;
 					  case '0': case '1': case '2':
 					  case '3': case '4': case '5':
 					  case '6': case '7':
 						c = *str++ - '0';
 						if (*str < '0' || *str > '7')
 							break;
 						c = *str++ - '0';
 						if (*str < '0' || *str > '7')
 							break;
 						c = *str++ - '0';
 						break;
 					  default:
 						break;
 					}
 				}
 				STUFFC(c);
 			}
 		} else {
 			while (*str != '\0') {
 				oct = strtoul(str, &end, 16);
 				str = end;
 				if (oct > 0xff) {
 					free(octs);
 					return (-1);
 				}
 				STUFFC(oct);
 				if (*str == ':')
 					str++;
 				else if(*str != '\0') {
 					free(octs);
 					return (-1);
 				}
 			}
 		}
 		v->octetstring.octets = octs;
 		v->octetstring.len = len;
 		return (0);
 # undef STUFFC
 	    }
 
 	  case SNMP_SYNTAX_OID:
 	    {
 		u_long subid;
 
 		v->oid.len = 0;
 
 		for (;;) {
 			if (v->oid.len == ASN_MAXOIDLEN)
 				return (-1);
 			subid = strtoul(str, &end, 10);
 			str = end;
 			if (subid > ASN_MAXID)
 				return (-1);
 			v->oid.subs[v->oid.len++] = (asn_subid_t)subid;
 			if (*str == '\0')
 				break;
 			if (*str != '.')
 				return (-1);
 			str++;
 		}
 		return (0);
 	    }
 
 	  case SNMP_SYNTAX_IPADDRESS:
 	    {
 		struct hostent *he;
 		u_long ip[4];
 		int n;
 
 		if (sscanf(str, "%lu.%lu.%lu.%lu%n", &ip[0], &ip[1], &ip[2],
 		    &ip[3], &n) == 4 && (size_t)n == strlen(str) &&
 		    ip[0] <= 0xff && ip[1] <= 0xff &&
 		    ip[2] <= 0xff && ip[3] <= 0xff) {
 			v->ipaddress[0] = (u_char)ip[0];
 			v->ipaddress[1] = (u_char)ip[1];
 			v->ipaddress[2] = (u_char)ip[2];
 			v->ipaddress[3] = (u_char)ip[3];
 			return (0);
 		}
 
 		if ((he = gethostbyname(str)) == NULL)
 			return (-1);
 		if (he->h_addrtype != AF_INET)
 			return (-1);
 
 		v->ipaddress[0] = he->h_addr[0];
 		v->ipaddress[1] = he->h_addr[1];
 		v->ipaddress[2] = he->h_addr[2];
 		v->ipaddress[3] = he->h_addr[3];
 		return (0);
 	    }
 
 	  case SNMP_SYNTAX_COUNTER:
 	  case SNMP_SYNTAX_GAUGE:
 	  case SNMP_SYNTAX_TIMETICKS:
 	    {
 		uint64_t sub;
 
 		sub = strtoull(str, &end, 0);
 		if (*end != '\0' || sub > 0xffffffff)
 			return (-1);
 		v->uint32 = (uint32_t)sub;
 		return (0);
 	    }
 
 	  case SNMP_SYNTAX_COUNTER64:
 		v->counter64 = strtoull(str, &end, 0);
 		if (*end != '\0')
 			return (-1);
 		return (0);
 	}
 	abort();
 }
 
 /*
  * printf-style error reporter: writes "SNMP: ", the formatted message
  * and a newline to stderr.
  */
 static void
 snmp_error_func(const char *fmt, ...)
 {
 	va_list args;
 
 	fputs("SNMP: ", stderr);
 	va_start(args, fmt);
 	vfprintf(stderr, fmt, args);
 	va_end(args);
 	fputc('\n', stderr);
 }
 
 /*
  * printf-style output routine: writes the formatted text to stderr
  * exactly as given, without adding a prefix or a newline.
  */
 static void
 snmp_printf_func(const char *fmt, ...)
 {
 	va_list args;
 
 	va_start(args, fmt);
 	(void)vfprintf(stderr, fmt, args);
 	va_end(args);
 }
Index: user/alc/PQ_LAUNDRY/lib/libc/powerpc/gen/Makefile.inc
===================================================================
--- user/alc/PQ_LAUNDRY/lib/libc/powerpc/gen/Makefile.inc	(revision 307895)
+++ user/alc/PQ_LAUNDRY/lib/libc/powerpc/gen/Makefile.inc	(revision 307896)
@@ -1,7 +1,7 @@
 # $FreeBSD$
 
-.include "${LIBC_SRC}/powerpc/gen/Makefile.common"
+.include "${LIBC_SRCTOP}/powerpc/gen/Makefile.common"
 
 SRCS += fabs.S flt_rounds.c fpgetmask.c fpgetround.c \
 	fpgetsticky.c fpsetmask.c fpsetround.c \
 	_setjmp.S setjmp.S sigsetjmp.S
Index: user/alc/PQ_LAUNDRY/lib/libnetbsd/sys/cdefs.h
===================================================================
--- user/alc/PQ_LAUNDRY/lib/libnetbsd/sys/cdefs.h	(revision 307895)
+++ user/alc/PQ_LAUNDRY/lib/libnetbsd/sys/cdefs.h	(revision 307896)
@@ -1,72 +1,74 @@
 /* $FreeBSD$ */
 
 /*-
  * Copyright (c) 2012 SRI International
  * Copyright (c) 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #ifndef _LIBNETBSD_SYS_CDEFS_H_
 #define _LIBNETBSD_SYS_CDEFS_H_
 
 #include_next <sys/cdefs.h>
 
+#ifndef __dead
 #ifdef __dead2
 #define __dead __dead2
 #else
 #define __dead
 #endif
+#endif /* !__dead */
 
 /*
  * The __CONCAT macro is used to concatenate parts of symbol names, e.g.
  * with "#define OLD(foo) __CONCAT(old,foo)", OLD(foo) produces oldfoo.
  * The __CONCAT macro is a bit tricky -- make sure you don't put spaces
  * in between its arguments.  __CONCAT can also concatenate double-quoted
  * strings produced by the __STRING macro, but this only works with ANSI C.
  */
 
 #define	___STRING(x)	__STRING(x)
 #define	___CONCAT(x,y)	__CONCAT(x,y)
 
 /*
  * The following macro is used to remove const cast-away warnings
  * from gcc -Wcast-qual; it should be used with caution because it
  * can hide valid errors; in particular most valid uses are in
  * situations where the API requires it, not to cast away string
  * constants. We don't use *intptr_t on purpose here and we are
  * explicit about unsigned long so that we don't have additional
  * dependencies.
  */
 #define __UNCONST(a)	((void *)(unsigned long)(const void *)(a))
 
 /*
  * Return the number of elements in a statically-allocated array,
  * __x.
  */
 #define	__arraycount(__x)	(sizeof(__x) / sizeof(__x[0]))
 
 #endif /* _LIBNETBSD_SYS_CDEFS_H_ */
Index: user/alc/PQ_LAUNDRY/lib/libnetbsd/util.c
===================================================================
--- user/alc/PQ_LAUNDRY/lib/libnetbsd/util.c	(revision 307895)
+++ user/alc/PQ_LAUNDRY/lib/libnetbsd/util.c	(revision 307896)
@@ -1,59 +1,60 @@
 /*-
  * Copyright (c) 2012 SRI International
  * All rights reserved.
  *
  * This software was developed by SRI International and the University of
  * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237)
  * ("CTSRD"), as part of the DARPA CRASH research programme.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 #include <sys/types.h>
 
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
-#include <util.h>
+
+#include "util.h"
 
 /*
  * Convert file flags to their textual form using fflagstostr().
  * If no flag is set (the result is the empty string) a copy of 'def'
  * is returned instead.
  *
  * The returned string is dynamically allocated and has to be freed by
  * the caller.  NULL is returned if memory could not be allocated.
  */
 char *
 flags_to_string(u_long flags, const char *def)
 {
 	char *str;
 
 	str = fflagstostr(flags);
 	/* fflagstostr() allocates its result and may fail doing so. */
 	if (str == NULL)
 		return (NULL);
 	if (*str == '\0') {
 		free(str);
 		str = strdup(def);
 	}
 	return (str);
 }
 
 /*
  * Thin wrapper that passes its arguments straight to strtofflags()
  * and returns that function's result unchanged.
  */
 int
 string_to_flags(char **stringp, u_long *setp, u_long *clrp)
 {
 	int rv;
 
 	rv = strtofflags(stringp, setp, clrp);
 	return (rv);
 }
Index: user/alc/PQ_LAUNDRY/release/tools/arm.subr
===================================================================
--- user/alc/PQ_LAUNDRY/release/tools/arm.subr	(revision 307895)
+++ user/alc/PQ_LAUNDRY/release/tools/arm.subr	(revision 307896)
@@ -1,137 +1,136 @@
 #!/bin/sh
 #-
 # Copyright (c) 2015 The FreeBSD Foundation
 # All rights reserved.
 #
 # Portions of this software were developed by Glen Barber
 # under sponsorship from the FreeBSD Foundation.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
 # are met:
 # 1. Redistributions of source code must retain the above copyright
 #    notice, this list of conditions and the following disclaimer.
 # 2. Redistributions in binary form must reproduce the above copyright
 #    notice, this list of conditions and the following disclaimer in the
 #    documentation and/or other materials provided with the distribution.
 #
 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 # ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 # SUCH DAMAGE.
 #
 # Common subroutines used to build arm/armv6 images.
 #
 # $FreeBSD$
 #
 
 # Undo the mounts made while building the image: unmount ${DESTDIR}/dev
 # when a /dev/null character device is visible there, unmount ${DESTDIR},
 # and detach the memory disk if ${mddev} is set.  Always returns 0.
 cleanup() {
 	if [ -c "${DESTDIR}/dev/null" ]; then
 		umount_loop ${DESTDIR}/dev 2>/dev/null
 	fi
 	umount_loop ${DESTDIR}
 
 	# Only detach the md(4) device when one was recorded.
 	if [ -n "${mddev}" ]; then
 		mdconfig -d -u ${mddev}
 	fi
 
 	return 0
 }
 
 # Unmount the directory given as $1, retrying once per second.
 # Gives up after the tenth failed attempt, prints a diagnostic and
 # returns 1; returns 0 as soon as umount(8) succeeds.
 umount_loop() {
 	DIR=$1
 	i=0
 	sync
 	until umount ${DIR}; do
 		i=$(( i + 1 ))
 		if [ "$i" -ge 10 ]; then
 			# This should never happen.  But, it has happened.
 			echo "Cannot umount(8) ${DIR}"
 			echo "Something has gone horribly wrong."
 			return 1
 		fi
 		sleep 1
 	done
 
 	return 0
 }
 
 # Partition and format the memory disk ${mddev}; every command runs
 # inside the ${CHROOTDIR} chroot.  Always returns 0 (the exit status of
 # the individual commands is not checked here).
 arm_create_disk() {
 	# Lay out the disk with the ${PART_SCHEME} scheme.  The first
 	# partition (type '!12', ${FAT_SIZE} large) is marked active and
 	# formatted as a ${FAT_TYPE} FAT file system labelled "msdosboot".
 	chroot ${CHROOTDIR} gpart create -s ${PART_SCHEME} ${mddev}
 	chroot ${CHROOTDIR} gpart add -t '!12' -a 63 -s ${FAT_SIZE} ${mddev}
 	chroot ${CHROOTDIR} gpart set -a active -i 1 ${mddev}
 	chroot ${CHROOTDIR} newfs_msdos -L msdosboot -F ${FAT_TYPE} /dev/${mddev}s1
 	# The second partition is a freebsd slice that gets a bsd label with
 	# one freebsd-ufs partition, formatted as UFS labelled "rootfs".
 	chroot ${CHROOTDIR} gpart add -t freebsd ${mddev}
 	chroot ${CHROOTDIR} gpart create -s bsd ${mddev}s2
 	chroot ${CHROOTDIR} gpart add -t freebsd-ufs -a 64k /dev/${mddev}s2
 	chroot ${CHROOTDIR} newfs -U -L rootfs /dev/${mddev}s2a
 
 	return 0
 }
 
 # Add the default accounts to the image rooted at ${DESTDIR}.  All
 # commands run inside the ${CHROOTDIR} chroot and pw(8) is pointed at
 # the image with -R.  Always returns 0.
 arm_create_user() {
 	# Create a default user account 'freebsd' with the password 'freebsd',
 	# and set the default password for the 'root' user to 'root'.
 	chroot ${CHROOTDIR} /usr/sbin/pw -R ${DESTDIR} \
 		groupadd freebsd -g 1001
 	# The home directory is created up front; useradd -m is also given.
 	chroot ${CHROOTDIR} mkdir -p ${DESTDIR}/home/freebsd
 	chroot ${CHROOTDIR} /usr/sbin/pw -R ${DESTDIR} \
 		useradd freebsd \
 		-m -M 0755 -w yes -n freebsd -u 1001 -g 1001 -G 0 \
 		-c 'FreeBSD User' -d '/home/freebsd' -s '/bin/csh'
 	chroot ${CHROOTDIR} /usr/sbin/pw -R ${DESTDIR} \
 		usermod root -w yes
-	chroot ${CHROOTDIR} ln -s /home ${DESTDIR}/usr/home
 
 	return 0
 }
 
 # Populate the root file system of the image: mount it, install world,
 # kernel and distribution files, create the default users, write
 # /etc/fstab and /etc/rc.conf, then unmount it again.  Always returns 0.
 arm_install_base() {
 	# Mount the UFS partition on ${DESTDIR} (as seen from inside the
 	# chroot) and install into it.
 	chroot ${CHROOTDIR} mount /dev/${mddev}s2a ${DESTDIR}
 	eval chroot ${CHROOTDIR} make -C ${WORLDDIR} \
 		TARGET=${EMBEDDED_TARGET} \
 		TARGET_ARCH=${EMBEDDED_TARGET_ARCH} \
 		DESTDIR=${DESTDIR} KERNCONF=${KERNEL} \
 		installworld installkernel distribution
 	# Mount point for the FAT partition listed in fstab below.
 	chroot ${CHROOTDIR} mkdir -p ${DESTDIR}/boot/msdos
 
 	arm_create_user
 
 	# These files are written from outside the chroot, hence the
 	# ${CHROOTDIR}/${DESTDIR} prefix.
 	echo '# Custom /etc/fstab for FreeBSD embedded images' \
 		> ${CHROOTDIR}/${DESTDIR}/etc/fstab
 	echo "/dev/ufs/rootfs   /       ufs     rw      1       1" \
 		>> ${CHROOTDIR}/${DESTDIR}/etc/fstab
 	echo "/dev/msdosfs/MSDOSBOOT /boot/msdos msdosfs rw,noatime 0 0" \
 		>> ${CHROOTDIR}/${DESTDIR}/etc/fstab
 	echo "tmpfs /tmp tmpfs rw,mode=1777,size=50m 0 0" \
 		>> ${CHROOTDIR}/${DESTDIR}/etc/fstab
 
 	# The host name is the kernel configuration name in lower case.
 	local hostname
 	hostname="$(echo ${KERNEL} | tr '[:upper:]' '[:lower:]')"
 	echo "hostname=\"${hostname}\"" > ${CHROOTDIR}/${DESTDIR}/etc/rc.conf
 	echo 'ifconfig_DEFAULT="DHCP"' >> ${CHROOTDIR}/${DESTDIR}/etc/rc.conf
 	echo 'sshd_enable="YES"' >> ${CHROOTDIR}/${DESTDIR}/etc/rc.conf
 	echo 'sendmail_enable="NONE"' >> ${CHROOTDIR}/${DESTDIR}/etc/rc.conf
 	echo 'sendmail_submit_enable="NO"' >> ${CHROOTDIR}/${DESTDIR}/etc/rc.conf
 	echo 'sendmail_outbound_enable="NO"' >> ${CHROOTDIR}/${DESTDIR}/etc/rc.conf
 	echo 'sendmail_msp_queue_enable="NO"' >> ${CHROOTDIR}/${DESTDIR}/etc/rc.conf
 	echo 'growfs_enable="YES"' >> ${CHROOTDIR}/${DESTDIR}/etc/rc.conf
 
 	# Flush everything to the image before releasing the mount.
 	sync
 	umount_loop ${CHROOTDIR}/${DESTDIR}
 
 	return 0
 }
 
 # Placeholder for the boot loader installation step.  This default
 # does nothing and returns 0.
 arm_install_uboot() {
 	# Override in the arm/KERNEL.conf file.
 
 	return 0
 }
Index: user/alc/PQ_LAUNDRY/share/mk/bsd.compiler.mk
===================================================================
--- user/alc/PQ_LAUNDRY/share/mk/bsd.compiler.mk	(revision 307895)
+++ user/alc/PQ_LAUNDRY/share/mk/bsd.compiler.mk	(revision 307896)
@@ -1,197 +1,197 @@
 # $FreeBSD$
 
 # Setup variables for the compiler
 #
 # COMPILER_TYPE is the major type of compiler. Currently gcc and clang support
 # automatic detection. Other compiler types can be shoe-horned in, but require
 # explicit setting of the compiler type. The compiler type can also be set
 # explicitly if, say, you install gcc as clang...
 #
 # COMPILER_VERSION is a numeric constant equal to:
 #     major * 10000 + minor * 100 + tiny
 # It too can be overridden on the command line. When testing it, be sure to
 # make sure that you are limiting the test to a specific compiler. Testing
 # against 30300 for gcc likely isn't what you wanted (since versions of gcc
 # prior to 4.2 likely have no prayer of working).
 #
 # COMPILER_FREEBSD_VERSION is the compiler's __FreeBSD_cc_version value.
 #
 # COMPILER_FEATURES will contain one or more of the following, based on
 # compiler support for that feature:
 #
 # - c++11 : supports full (or nearly full) C++11 programming environment.
 #
 # These variables with an X_ prefix will also be provided if XCC is set.
 #
 # This file may be included multiple times, but only has effect the first time.
 #
 
 .if !target(__<bsd.compiler.mk>__)
 __<bsd.compiler.mk>__:
 
 .include <bsd.opts.mk>
 
 # Handle ccache after CC is determined, but not if CC/CXX are already
 # overridden with a manual setup.
 .if ${MK_CCACHE_BUILD:Uno} == "yes" && \
     !make(showconfig) && \
     (${CC:M*ccache/world/*} == "" || ${CXX:M*ccache/world/*} == "")
 # CC is always prepended with the ccache wrapper rather than modifying
 # PATH since it is more clear that ccache is used and avoids wasting time
 # for mkdep/linking/asm builds.
 LOCALBASE?=		/usr/local
 CCACHE_WRAPPER_PATH?=	${LOCALBASE}/libexec/ccache
 CCACHE_BIN?=		${LOCALBASE}/bin/ccache
 .if exists(${CCACHE_BIN})
 # Export to ensure sub-makes can filter it out for mkdep/linking and
 # to chain down into kernel build which won't include this file.
 .export CCACHE_BIN
 # Expand and export some variables so they may be based on make vars.
 # This allows doing something like the following in the environment:
 # CCACHE_BASEDIR='${SRCTOP:H}' MAKEOBJDIRPREFIX='${SRCTOP:H}/obj'
 .for var in CCACHE_LOGFILE CCACHE_BASEDIR
 .if defined(${var})
 ${var}:=	${${var}}
 .export		${var}
 .endif
 .endfor
 # Handle bootstrapped compiler changes properly by hashing their content
 # rather than checking mtime.  For external compilers it should be safe
 # to use the more optimal mtime check.
 # XXX: CCACHE_COMPILERCHECK= string:<compiler_version, compiler_build_rev, compiler_patch_rev, compiler_default_target, compiler_default_sysroot>
 .if ${CC:N${CCACHE_BIN}:[1]:M/*} == ""
 CCACHE_COMPILERCHECK?=	content
 .else
 CCACHE_COMPILERCHECK?=	mtime
 .endif
 .export CCACHE_COMPILERCHECK
 # Remove ccache from the PATH to prevent double calls and wasted CPP/LD time.
 PATH:=	${PATH:C,:?${CCACHE_WRAPPER_PATH}(/world)?(:$)?,,g}
 # Ensure no bogus CCACHE_PATH leaks in which might avoid the in-tree compiler.
 .if !empty(CCACHE_PATH)
 CCACHE_PATH=
 .export CCACHE_PATH
 .endif
 # Override various toolchain vars.
 .for var in CC CXX HOST_CC HOST_CXX
 .if defined(${var}) && ${${var}:M${CCACHE_BIN}} == ""
 ${var}:=	${CCACHE_BIN} ${${var}}
 .endif
 .endfor
 # GCC does not need the CCACHE_CPP2 hack enabled by default in devel/ccache.
 # The port enables it due to ccache passing preprocessed C to clang
 # which fails with -Wparentheses-equality, -Wtautological-compare, and
 # -Wself-assign on macro-expanded lines.
 .if defined(COMPILER_TYPE) && ${COMPILER_TYPE} == "gcc"
 CCACHE_NOCPP2=	1
 .export CCACHE_NOCPP2
 .endif
 # Canonicalize CCACHE_DIR for meta mode usage.
 .if !defined(CCACHE_DIR)
 CCACHE_DIR!=	${CCACHE_BIN} -p | awk '$$2 == "cache_dir" {print $$4}'
 .export CCACHE_DIR
 .endif
 .if !empty(CCACHE_DIR) && empty(.MAKE.META.IGNORE_PATHS:M${CCACHE_DIR})
 CCACHE_DIR:=	${CCACHE_DIR:tA}
 .MAKE.META.IGNORE_PATHS+= ${CCACHE_DIR}
 .export CCACHE_DIR
 .endif
 # ccache doesn't affect build output so let it slide for meta mode
 # comparisons.
 .MAKE.META.IGNORE_PATHS+= ${CCACHE_BIN}
 ccache-print-options: .PHONY
 	@${CCACHE_BIN} -p
 .endif	# exists(${CCACHE_BIN})
 .endif	# ${MK_CCACHE_BUILD} == "yes"
 
 .for cc X_ in CC $${_empty_var_} XCC X_
 .if ${cc} == "CC" || !empty(XCC)
 # Try to import COMPILER_TYPE and COMPILER_VERSION from parent make.
 # The value is only used/exported for the same environment that impacts
 # CC and COMPILER_* settings here.
 _exported_vars=	${X_}COMPILER_TYPE ${X_}COMPILER_VERSION \
 		${X_}COMPILER_FREEBSD_VERSION
 ${X_}_cc_hash=	${${cc}}${MACHINE}${PATH}
 ${X_}_cc_hash:=	${${X_}_cc_hash:hash}
 # Only import if none of the vars are set somehow else.
 _can_export=	yes
 .for var in ${_exported_vars}
 .if defined(${var})
 _can_export=	no
 .endif
 .endfor
 .if ${_can_export} == yes
 .for var in ${_exported_vars}
 .if defined(${var}.${${X_}_cc_hash})
 ${var}=	${${var}.${${X_}_cc_hash}}
 .endif
 .endfor
 .endif
 
 .if ${cc} == "CC" || (${cc} == "XCC" && ${XCC} != ${CC})
 .if ${MACHINE} == "common"
 # common is a pseudo machine for architecture independent
 # generated files - thus there is no compiler.
 ${X_}COMPILER_TYPE= none
 ${X_}COMPILER_VERSION= 0
 ${X_}COMPILER_FREEBSD_VERSION= 0
 .elif !defined(${X_}COMPILER_TYPE) || !defined(${X_}COMPILER_VERSION)
 _v!=	${${cc}} --version || echo 0.0.0
 
 .if !defined(${X_}COMPILER_TYPE)
 . if ${${cc}:T:M*gcc*}
 ${X_}COMPILER_TYPE:=	gcc
 . elif ${${cc}:T:M*clang*}
 ${X_}COMPILER_TYPE:=	clang
 . elif ${_v:Mgcc}
 ${X_}COMPILER_TYPE:=	gcc
 . elif ${_v:M\(GCC\)}
 ${X_}COMPILER_TYPE:=	gcc
-. elif ${_v:Mclang}
+. elif ${_v:Mclang} || ${_v:M(clang-*.*.*)}
 ${X_}COMPILER_TYPE:=	clang
 . else
 .error Unable to determine compiler type for ${cc}=${${cc}}.  Consider setting ${X_}COMPILER_TYPE.
 . endif
 .endif
 .if !defined(${X_}COMPILER_VERSION)
 ${X_}COMPILER_VERSION!=echo "${_v:M[1-9].[0-9]*}" | awk -F. '{print $$1 * 10000 + $$2 * 100 + $$3;}'
 .endif
 .undef _v
 .endif
 .if !defined(${X_}COMPILER_FREEBSD_VERSION)
 ${X_}COMPILER_FREEBSD_VERSION!=	{ echo "__FreeBSD_cc_version" | ${${cc}} -E - 2>/dev/null || echo __FreeBSD_cc_version; } | sed -n '$$p'
 # If we get a literal "__FreeBSD_cc_version" back then the compiler
 # is a non-FreeBSD build that doesn't support it or some other error
 # occurred.
 .if ${${X_}COMPILER_FREEBSD_VERSION} == "__FreeBSD_cc_version"
 ${X_}COMPILER_FREEBSD_VERSION=	unknown
 .endif
 .endif
 
 .if ${${X_}COMPILER_TYPE} == "clang" || \
 	(${${X_}COMPILER_TYPE} == "gcc" && ${${X_}COMPILER_VERSION} >= 40800)
 ${X_}COMPILER_FEATURES=	c++11
 .else
 ${X_}COMPILER_FEATURES=
 .endif
 
 .else
 # Use CC's values
 X_COMPILER_TYPE=	${COMPILER_TYPE}
 X_COMPILER_VERSION=	${COMPILER_VERSION}
 X_COMPILER_FREEBSD_VERSION=	${COMPILER_FREEBSD_VERSION}
 X_COMPILER_FEATURES=	${COMPILER_FEATURES}
 .endif	# ${cc} == "CC" || (${cc} == "XCC" && ${XCC} != ${CC})
 
 # Export the values so sub-makes don't have to look them up again, using the
 # hash key computed above.
 .for var in ${_exported_vars}
 ${var}.${${X_}_cc_hash}:=	${${var}}
 .export-env ${var}.${${X_}_cc_hash}
 .undef ${var}.${${X_}_cc_hash}
 .endfor
 
 .endif	# ${cc} == "CC" || !empty(XCC)
 .endfor	# .for cc in CC XCC
 
 .endif	# !target(__<bsd.compiler.mk>__)
Index: user/alc/PQ_LAUNDRY/sys/amd64/amd64/trap.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/amd64/amd64/trap.c	(revision 307895)
+++ user/alc/PQ_LAUNDRY/sys/amd64/amd64/trap.c	(revision 307896)
@@ -1,944 +1,939 @@
 /*-
  * Copyright (C) 1994, David Greenman
  * Copyright (c) 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the University of Utah, and William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)trap.c	7.4 (Berkeley) 5/13/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * AMD64 Trap and System call handling
  */
 
 #include "opt_clock.h"
 #include "opt_cpu.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_isa.h"
 #include "opt_kdb.h"
 #include "opt_stack.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 #include <sys/pioctl.h>
 #include <sys/ptrace.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/syscall.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/uio.h>
 #include <sys/vmmeter.h>
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 PMC_SOFT_DEFINE( , , page_fault, all);
 PMC_SOFT_DEFINE( , , page_fault, read);
 PMC_SOFT_DEFINE( , , page_fault, write);
 #endif
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_extern.h>
 
 #include <machine/cpu.h>
 #include <machine/intr_machdep.h>
 #include <x86/mca.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #ifdef SMP
 #include <machine/smp.h>
 #endif
 #include <machine/stack.h>
 #include <machine/tss.h>
 
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
 #endif
 
 /*
  * Forward declarations.  trap(), trap_check() and dblfault_handler() are
  * defined later in this file; syscall() is only declared here.
  */
 extern void __noinline trap(struct trapframe *frame);
 extern void trap_check(struct trapframe *frame);
 extern void syscall(struct trapframe *frame);
 void dblfault_handler(struct trapframe *frame);
 
 /* File-local helpers used by trap(); both are defined below. */
 static int trap_pfault(struct trapframe *, int);
 static void trap_fatal(struct trapframe *, vm_offset_t);
 
 /*
  * Human-readable trap names, indexed by trap number (tf_trapno) and used
  * by trap_fatal().  MAX_TRAP_MSG is the highest valid index, so the table
  * must always hold MAX_TRAP_MSG + 1 entries; trap numbers above it are
  * reported as unknown.
  */
 #define MAX_TRAP_MSG		32
 static char *trap_msg[] = {
 	"",					/*  0 unused */
 	"privileged instruction fault",		/*  1 T_PRIVINFLT */
 	"",					/*  2 unused */
 	"breakpoint instruction fault",		/*  3 T_BPTFLT */
 	"",					/*  4 unused */
 	"",					/*  5 unused */
 	"arithmetic trap",			/*  6 T_ARITHTRAP */
 	"",					/*  7 unused */
 	"",					/*  8 unused */
 	"general protection fault",		/*  9 T_PROTFLT */
 	"trace trap",				/* 10 T_TRCTRAP */
 	"",					/* 11 unused */
 	"page fault",				/* 12 T_PAGEFLT */
 	"",					/* 13 unused */
 	"alignment fault",			/* 14 T_ALIGNFLT */
 	"",					/* 15 unused */
 	"",					/* 16 unused */
 	"",					/* 17 unused */
 	"integer divide fault",			/* 18 T_DIVIDE */
 	"non-maskable interrupt trap",		/* 19 T_NMI */
 	"overflow trap",			/* 20 T_OFLOW */
 	"FPU bounds check fault",		/* 21 T_BOUND */
 	"FPU device not available",		/* 22 T_DNA */
 	"double fault",				/* 23 T_DOUBLEFLT */
 	"FPU operand fetch fault",		/* 24 T_FPOPFLT */
 	"invalid TSS fault",			/* 25 T_TSSFLT */
 	"segment not present fault",		/* 26 T_SEGNPFLT */
 	"stack fault",				/* 27 T_STKFLT */
 	"machine check trap",			/* 28 T_MCHK */
 	"SIMD floating-point exception",	/* 29 T_XMMFLT */
 	"reserved (unknown) fault",		/* 30 T_RESERVED */
 	"",					/* 31 unused (reserved) */
 	"DTrace pid return trap",		/* 32 T_DTRACE_RET */
 };
 
-static int panic_on_nmi = 1;
-SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RWTUN,
-	&panic_on_nmi, 0, "Panic on NMI");
 /*
  * machdep.prot_fault_translation: selects the signal trap() posts for a
  * user page fault that trap_pfault() did not report as SIGSEGV.
  * 0 = autodetect from the process ABI/osrel, 1 = always SIGBUS (compat),
  * any other value = always SIGSEGV.
  */
 static int prot_fault_translation;
 SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RWTUN,
     &prot_fault_translation, 0,
     "Select signal to deliver on protection fault");
 /*
  * machdep.uprintf_signal: when non-zero, trap() uprintf()s the details of
  * every trap-generated signal before posting it.
  */
 static int uprintf_signal;
 SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RWTUN,
     &uprintf_signal, 0,
     "Print debugging information on trap signal to ctty");
 
 /*
  * Exception, fault, and trap interface to the FreeBSD kernel.
  * This common code is called from assembly language IDT gate entry
  * routines that prepare a suitable stack frame, and restore this
  * frame after the exception has been processed.
  */
 
 /*
  * Dispatch the exception described by 'frame'.  NMI, reserved and
  * machine-check traps are looked at first.  Every other trap is then
  * handled either as a user-mode trap, which is normally converted into a
  * signal (number in 'i', si_code in 'ucode') and posted with trapsignal(),
  * or as a kernel-mode trap, which is either handled in place or ends in
  * trap_fatal().
  */
 void
 trap(struct trapframe *frame)
 {
 #ifdef KDTRACE_HOOKS
 	struct reg regs;
 #endif
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 #ifdef KDB
 	register_t dr6;
 #endif
 	/* i: signal to post; ucode: its si_code; addr: reported fault address. */
 	int i = 0, ucode = 0;
 	u_int type;
 	register_t addr = 0;
 	ksiginfo_t ksi;
 
 	PCPU_INC(cnt.v_trap);
 	type = frame->tf_trapno;
 
 #ifdef SMP
 	/* Handler for NMI IPIs used for stopping CPUs. */
 	if (type == T_NMI) {
 	         if (ipi_nmi_handler() == 0)
 	                   goto out;
 	}
 #endif /* SMP */
 
 #ifdef KDB
 	if (kdb_active) {
 		kdb_reenter();
 		goto out;
 	}
 #endif
 
 	if (type == T_RESERVED) {
 		trap_fatal(frame, 0);
 		goto out;
 	}
 
 	if (type == T_NMI) {
 #ifdef HWPMC_HOOKS
 		/*
 		 * CPU PMCs interrupt using an NMI.  If the PMC module is
 		 * active, pass the 'rip' value to the PMC module's interrupt
 		 * handler.  A non-zero return value from the handler means that
 		 * the NMI was consumed by it and we can return immediately.
 		 */
 		if (pmc_intr != NULL &&
 		    (*pmc_intr)(PCPU_GET(cpuid), frame) != 0)
 			goto out;
 #endif
 
 #ifdef STACK
 		if (stack_nmi_handler(frame) != 0)
 			goto out;
 #endif
 	}
 
 	if (type == T_MCHK) {
 		mca_intr();
 		goto out;
 	}
 
 	if ((frame->tf_rflags & PSL_I) == 0) {
 		/*
 		 * Buggy application or kernel code has disabled
 		 * interrupts and then trapped.  Enabling interrupts
 		 * now is wrong, but it is better than running with
 		 * interrupts disabled until they are accidentally
 		 * enabled later.
 		 */
 		if (TRAPF_USERMODE(frame))
 			uprintf(
 			    "pid %ld (%s): trap %d with interrupts disabled\n",
 			    (long)curproc->p_pid, curthread->td_name, type);
 		else if (type != T_NMI && type != T_BPTFLT &&
 		    type != T_TRCTRAP) {
 			/*
 			 * XXX not quite right, since this may be for a
 			 * multiple fault in user mode.
 			 */
 			printf("kernel trap %d with interrupts disabled\n",
 			    type);
 
 			/*
 			 * We shouldn't enable interrupts while holding a
 			 * spin lock.
 			 */
 			if (td->td_md.md_spinlock_count == 0)
 				enable_intr();
 		}
 	}
 
 	if (TRAPF_USERMODE(frame)) {
 		/* user trap */
 
 		td->td_pticks = 0;
 		td->td_frame = frame;
 		addr = frame->tf_rip;
 		if (td->td_cowgen != p->p_cowgen)
 			thread_cow_update(td);
 
 		/*
 		 * Each case either picks the signal (i) and si_code (ucode)
 		 * and breaks out to the common trapsignal() code below, or
 		 * jumps straight to one of the exit labels.
 		 */
 		switch (type) {
 		case T_PRIVINFLT:	/* privileged instruction fault */
 			i = SIGILL;
 			ucode = ILL_PRVOPC;
 			break;
 
 		case T_BPTFLT:		/* bpt instruction fault */
 		case T_TRCTRAP:		/* trace trap */
 			enable_intr();
 #ifdef KDTRACE_HOOKS
 			if (type == T_BPTFLT) {
 				fill_frame_regs(frame, &regs);
 				if (dtrace_pid_probe_ptr != NULL &&
 				    dtrace_pid_probe_ptr(&regs) == 0)
 					goto out;
 			}
 #endif
 			frame->tf_rflags &= ~PSL_T;
 			i = SIGTRAP;
 			ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT);
 			break;
 
 		case T_ARITHTRAP:	/* arithmetic trap */
 			ucode = fputrap_x87();
 			if (ucode == -1)
 				goto userout;
 			i = SIGFPE;
 			break;
 
 		case T_PROTFLT:		/* general protection fault */
 			i = SIGBUS;
 			ucode = BUS_OBJERR;
 			break;
 		case T_STKFLT:		/* stack fault */
 		case T_SEGNPFLT:	/* segment not present fault */
 			i = SIGBUS;
 			ucode = BUS_ADRERR;
 			break;
 		case T_TSSFLT:		/* invalid TSS fault */
 			i = SIGBUS;
 			ucode = BUS_OBJERR;
 			break;
 		case T_ALIGNFLT:
 			i = SIGBUS;
 			ucode = BUS_ADRALN;
 			break;
 		/*
 		 * Note: 'default' is not the last label of this switch; the
 		 * cases that follow it are still matched normally.
 		 */
 		case T_DOUBLEFLT:	/* double fault */
 		default:
 			i = SIGBUS;
 			ucode = BUS_OBJERR;
 			break;
 
 		case T_PAGEFLT:		/* page fault */
 			/*
 			 * Emulator can take care about this trap?
 			 */
 			if (*p->p_sysent->sv_trap != NULL &&
 			    (*p->p_sysent->sv_trap)(td) == 0)
 				goto userout;
 
 			addr = frame->tf_addr;
 			/*
 			 * trap_pfault() returns -1 after trap_fatal(), 0 when
 			 * the fault was resolved, or the signal to deliver.
 			 */
 			i = trap_pfault(frame, TRUE);
 			if (i == -1)
 				goto userout;
 			if (i == 0)
 				goto user;
 
 			if (i == SIGSEGV)
 				ucode = SEGV_MAPERR;
 			else {
 				if (prot_fault_translation == 0) {
 					/*
 					 * Autodetect.
 					 * This check also covers the images
 					 * without the ABI-tag ELF note.
 					 */
 					if (SV_CURPROC_ABI() == SV_ABI_FREEBSD
 					    && p->p_osrel >= P_OSREL_SIGSEGV) {
 						i = SIGSEGV;
 						ucode = SEGV_ACCERR;
 					} else {
 						i = SIGBUS;
 						ucode = BUS_PAGE_FAULT;
 					}
 				} else if (prot_fault_translation == 1) {
 					/*
 					 * Always compat mode.
 					 */
 					i = SIGBUS;
 					ucode = BUS_PAGE_FAULT;
 				} else {
 					/*
 					 * Always SIGSEGV mode.
 					 */
 					i = SIGSEGV;
 					ucode = SEGV_ACCERR;
 				}
 			}
 			break;
 
 		case T_DIVIDE:		/* integer divide fault */
 			ucode = FPE_INTDIV;
 			i = SIGFPE;
 			break;
 
 #ifdef DEV_ISA
 		case T_NMI:
-			nmi_handle_intr(type, frame, true);
+			nmi_handle_intr(type, frame);
 			break;
 #endif /* DEV_ISA */
 
 		case T_OFLOW:		/* integer overflow fault */
 			ucode = FPE_INTOVF;
 			i = SIGFPE;
 			break;
 
 		case T_BOUND:		/* bounds check fault */
 			ucode = FPE_FLTSUB;
 			i = SIGFPE;
 			break;
 
 		case T_DNA:
 			/* transparent fault (due to context switch "late") */
 			KASSERT(PCB_USER_FPU(td->td_pcb),
 			    ("kernel FPU ctx has leaked"));
 			fpudna();
 			goto userout;
 
 		case T_FPOPFLT:		/* FPU operand fetch fault */
 			ucode = ILL_COPROC;
 			i = SIGILL;
 			break;
 
 		case T_XMMFLT:		/* SIMD floating-point exception */
 			ucode = fputrap_sse();
 			if (ucode == -1)
 				goto userout;
 			i = SIGFPE;
 			break;
 #ifdef KDTRACE_HOOKS
 		case T_DTRACE_RET:
 			enable_intr();
 			fill_frame_regs(frame, &regs);
 			if (dtrace_return_probe_ptr != NULL &&
 			    dtrace_return_probe_ptr(&regs) == 0)
 				goto out;
 			break;
 #endif
 		}
 	} else {
 		/* kernel trap */
 
 		KASSERT(cold || td->td_ucred != NULL,
 		    ("kernel trap doesn't have ucred"));
 		/*
 		 * Cases that manage to handle the trap jump to 'out'; a
 		 * 'break' falls through to the trap_fatal() call that
 		 * follows the switch.
 		 */
 		switch (type) {
 		case T_PAGEFLT:			/* page fault */
 			(void) trap_pfault(frame, FALSE);
 			goto out;
 
 		case T_DNA:
 			if (PCB_USER_FPU(td->td_pcb))
 				panic("Unregistered use of FPU in kernel");
 			fpudna();
 			goto out;
 
 		case T_ARITHTRAP:	/* arithmetic trap */
 		case T_XMMFLT:		/* SIMD floating-point exception */
 		case T_FPOPFLT:		/* FPU operand fetch fault */
 			/*
 			 * For now, supporting kernel handler
 			 * registration for FPU traps is overkill.
 			 */
 			trap_fatal(frame, 0);
 			goto out;
 
 		case T_STKFLT:		/* stack fault */
 		case T_PROTFLT:		/* general protection fault */
 		case T_SEGNPFLT:	/* segment not present fault */
 			if (td->td_intr_nesting_level != 0)
 				break;
 
 			/*
 			 * Invalid segment selectors and out of bounds
 			 * %rip's and %rsp's can be set up in user mode.
 			 * This causes a fault in kernel mode when the
 			 * kernel tries to return to user mode.  We want
 			 * to get this fault so that we can fix the
 			 * problem here and not have to check all the
 			 * selectors and pointers when the user changes
 			 * them.
 			 */
 			if (frame->tf_rip == (long)doreti_iret) {
 				frame->tf_rip = (long)doreti_iret_fault;
 				goto out;
 			}
 			if (frame->tf_rip == (long)ld_ds) {
 				frame->tf_rip = (long)ds_load_fault;
 				goto out;
 			}
 			if (frame->tf_rip == (long)ld_es) {
 				frame->tf_rip = (long)es_load_fault;
 				goto out;
 			}
 			if (frame->tf_rip == (long)ld_fs) {
 				frame->tf_rip = (long)fs_load_fault;
 				goto out;
 			}
 			if (frame->tf_rip == (long)ld_gs) {
 				frame->tf_rip = (long)gs_load_fault;
 				goto out;
 			}
 			if (frame->tf_rip == (long)ld_gsbase) {
 				frame->tf_rip = (long)gsbase_load_fault;
 				goto out;
 			}
 			if (frame->tf_rip == (long)ld_fsbase) {
 				frame->tf_rip = (long)fsbase_load_fault;
 				goto out;
 			}
 			/* Otherwise resume at the registered fault handler, if any. */
 			if (curpcb->pcb_onfault != NULL) {
 				frame->tf_rip = (long)curpcb->pcb_onfault;
 				goto out;
 			}
 			break;
 
 		case T_TSSFLT:
 			/*
 			 * PSL_NT can be set in user mode and isn't cleared
 			 * automatically when the kernel is entered.  This
 			 * causes a TSS fault when the kernel attempts to
 			 * `iret' because the TSS link is uninitialized.  We
 			 * want to get this fault so that we can fix the
 			 * problem here and not every time the kernel is
 			 * entered.
 			 */
 			if (frame->tf_rflags & PSL_NT) {
 				frame->tf_rflags &= ~PSL_NT;
 				goto out;
 			}
 			break;
 
 		case T_TRCTRAP:	 /* trace trap */
 			/*
 			 * Ignore debug register trace traps due to
 			 * accesses in the user's address space, which
 			 * can happen under several conditions such as
 			 * if a user sets a watchpoint on a buffer and
 			 * then passes that buffer to a system call.
 			 * We still want to get TRCTRAPS for addresses
 			 * in kernel space because that is useful when
 			 * debugging the kernel.
 			 */
 			if (user_dbreg_trap()) {
 				/*
 				 * Reset breakpoint bits because the
 				 * processor doesn't
 				 */
 				load_dr6(rdr6() & ~0xf);
 				goto out;
 			}
 			/*
 			 * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
 			 */
 		case T_BPTFLT:
 			/*
 			 * If KDB is enabled, let it handle the debugger trap.
 			 * Otherwise, debugger traps "can't happen".
 			 */
 #ifdef KDB
 			/* XXX %dr6 is not quite reentrant. */
 			dr6 = rdr6();
 			load_dr6(dr6 & ~0x4000);
 			if (kdb_trap(type, dr6, frame))
 				goto out;
 #endif
 			break;
 
 #ifdef DEV_ISA
 		case T_NMI:
-			if (nmi_handle_intr(type, frame, false) ||
-			    !panic_on_nmi)
-				goto out;
-			/* FALLTHROUGH */
+			nmi_handle_intr(type, frame);
+			goto out;
 #endif /* DEV_ISA */
 		}
 
 		trap_fatal(frame, 0);
 		goto out;
 	}
 
 	/* Translate fault for emulators (e.g. Linux) */
 	if (*p->p_sysent->sv_transtrap)
 		i = (*p->p_sysent->sv_transtrap)(i, type);
 
 	/* Build the siginfo for the chosen signal and post it to the thread. */
 	ksiginfo_init_trap(&ksi);
 	ksi.ksi_signo = i;
 	ksi.ksi_code = ucode;
 	ksi.ksi_trapno = type;
 	ksi.ksi_addr = (void *)addr;
 	if (uprintf_signal) {
 		/* The trailing bytes are the eight bytes fetched from %rip. */
 		uprintf("pid %d comm %s: signal %d err %lx code %d type %d "
 		    "addr 0x%lx rsp 0x%lx rip 0x%lx "
 		    "<%02x %02x %02x %02x %02x %02x %02x %02x>\n",
 		    p->p_pid, p->p_comm, i, frame->tf_err, ucode, type, addr,
 		    frame->tf_rsp, frame->tf_rip,
 		    fubyte((void *)(frame->tf_rip + 0)),
 		    fubyte((void *)(frame->tf_rip + 1)),
 		    fubyte((void *)(frame->tf_rip + 2)),
 		    fubyte((void *)(frame->tf_rip + 3)),
 		    fubyte((void *)(frame->tf_rip + 4)),
 		    fubyte((void *)(frame->tf_rip + 5)),
 		    fubyte((void *)(frame->tf_rip + 6)),
 		    fubyte((void *)(frame->tf_rip + 7)));
 	}
 	KASSERT((read_rflags() & PSL_I) != 0, ("interrupts disabled"));
 	trapsignal(td, &ksi);
 
 	/*
 	 * Only the 'user' path runs userret(); 'userout' and 'out' are
 	 * currently the same plain return.
 	 */
 user:
 	userret(td, frame);
 	KASSERT(PCB_USER_FPU(td->td_pcb),
 	    ("Return from trap with kernel FPU ctx leaked"));
 userout:
 out:
 	return;
 }
 
 /*
  * Front end to trap() that filters out DTrace-induced faults: when a
  * DTrace trap hook is installed and it claims the fault (non-zero return),
  * the fault is dropped here.  This function cannot be instrumented, so it
  * cannot generate such faults itself.
  */
 void
 trap_check(struct trapframe *frame)
 {
 #ifdef KDTRACE_HOOKS
 	if (dtrace_trap_func != NULL) {
 		/* Give DTrace first refusal on the fault. */
 		if ((*dtrace_trap_func)(frame, frame->tf_trapno) != 0)
 			return;
 	}
 #endif
 
 	/* Not a DTrace fault: take the regular trap path. */
 	trap(frame);
 }
 
 static int
 trap_pfault(frame, usermode)
 	struct trapframe *frame;
 	int usermode;
 {
 	vm_offset_t va;
 	vm_map_t map;
 	int rv = 0;
 	vm_prot_t ftype;
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 	vm_offset_t eva = frame->tf_addr;
 
 	if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) {
 		/*
 		 * Due to both processor errata and lazy TLB invalidation when
 		 * access restrictions are removed from virtual pages, memory
 		 * accesses that are allowed by the physical mapping layer may
 		 * nonetheless cause one spurious page fault per virtual page. 
 		 * When the thread is executing a "no faulting" section that
 		 * is bracketed by vm_fault_{disable,enable}_pagefaults(),
 		 * every page fault is treated as a spurious page fault,
 		 * unless it accesses the same virtual address as the most
 		 * recent page fault within the same "no faulting" section.
 		 */
 		if (td->td_md.md_spurflt_addr != eva ||
 		    (td->td_pflags & TDP_RESETSPUR) != 0) {
 			/*
 			 * Do nothing to the TLB.  A stale TLB entry is
 			 * flushed automatically by a page fault.
 			 */
 			td->td_md.md_spurflt_addr = eva;
 			td->td_pflags &= ~TDP_RESETSPUR;
 			return (0);
 		}
 	} else {
 		/*
 		 * If we get a page fault while in a critical section, then
 		 * it is most likely a fatal kernel page fault.  The kernel
 		 * is already going to panic trying to get a sleep lock to
 		 * do the VM lookup, so just consider it a fatal trap so the
 		 * kernel can print out a useful trap message and even get
 		 * to the debugger.
 		 *
 		 * If we get a page fault while holding a non-sleepable
 		 * lock, then it is most likely a fatal kernel page fault.
 		 * If WITNESS is enabled, then it's going to whine about
 		 * bogus LORs with various VM locks, so just skip to the
 		 * fatal trap handling directly.
 		 */
 		if (td->td_critnest != 0 ||
 		    WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
 		    "Kernel page fault") != 0) {
 			trap_fatal(frame, eva);
 			return (-1);
 		}
 	}
 	va = trunc_page(eva);
 	if (va >= VM_MIN_KERNEL_ADDRESS) {
 		/*
 		 * Don't allow user-mode faults in kernel address space.
 		 */
 		if (usermode)
 			goto nogo;
 
 		map = kernel_map;
 	} else {
 		map = &p->p_vmspace->vm_map;
 
 		/*
 		 * When accessing a usermode address, kernel must be
 		 * ready to accept the page fault, and provide a
 		 * handling routine.  Since accessing the address
 		 * without the handler is a bug, do not try to handle
 		 * it normally, and panic immediately.
 		 */
 		if (!usermode && (td->td_intr_nesting_level != 0 ||
 		    curpcb->pcb_onfault == NULL)) {
 			trap_fatal(frame, eva);
 			return (-1);
 		}
 	}
 
 	/*
 	 * If the trap was caused by errant bits in the PTE then panic.
 	 */
 	if (frame->tf_err & PGEX_RSV) {
 		trap_fatal(frame, eva);
 		return (-1);
 	}
 
 	/*
 	 * PGEX_I is defined only if the execute disable bit capability is
 	 * supported and enabled.
 	 */
 	if (frame->tf_err & PGEX_W)
 		ftype = VM_PROT_WRITE;
 	else if ((frame->tf_err & PGEX_I) && pg_nx != 0)
 		ftype = VM_PROT_EXECUTE;
 	else
 		ftype = VM_PROT_READ;
 
 	/* Fault in the page. */
 	rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
 	if (rv == KERN_SUCCESS) {
 #ifdef HWPMC_HOOKS
 		if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
 			PMC_SOFT_CALL_TF( , , page_fault, all, frame);
 			if (ftype == VM_PROT_READ)
 				PMC_SOFT_CALL_TF( , , page_fault, read,
 				    frame);
 			else
 				PMC_SOFT_CALL_TF( , , page_fault, write,
 				    frame);
 		}
 #endif
 		return (0);
 	}
 nogo:
 	if (!usermode) {
 		if (td->td_intr_nesting_level == 0 &&
 		    curpcb->pcb_onfault != NULL) {
 			frame->tf_rip = (long)curpcb->pcb_onfault;
 			return (0);
 		}
 		trap_fatal(frame, eva);
 		return (-1);
 	}
 	return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
 }
 
 /*
  * Report an unrecoverable trap on the console and panic.
  *
  * Prints the trap number and name, CPU identification (SMP only), page
  * fault details ('eva' is the faulting virtual address, used only for
  * T_PAGEFLT), the code segment descriptor, selected rflags bits and the
  * current process.  With KDB, the debugger is offered the trap first when
  * debugger_on_panic is set or the debugger is already active; if it
  * handles the trap this function returns instead of panicking.
  */
 static void
 trap_fatal(struct trapframe *frame, vm_offset_t eva)
 {
 	int code, ss;
 	u_int type;
 	struct soft_segment_descriptor softseg;
 	const char *msg;
 
 	code = frame->tf_err;
 	type = frame->tf_trapno;
 	/* Decode this CPU's GDT entry for the code segment that trapped. */
 	sdtossd(&gdt[NGDT * PCPU_GET(cpuid) + IDXSEL(frame->tf_cs & 0xffff)],
 	    &softseg);
 
 	/* trap_msg[] only covers trap numbers 0..MAX_TRAP_MSG. */
 	if (type <= MAX_TRAP_MSG)
 		msg = trap_msg[type];
 	else
 		msg = "UNKNOWN";
 	printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg,
 	    TRAPF_USERMODE(frame) ? "user" : "kernel");
 #ifdef SMP
 	/* two separate prints in case of a trap on an unmapped page */
 	printf("cpuid = %d; ", PCPU_GET(cpuid));
 	printf("apic id = %02x\n", PCPU_GET(apic_id));
 #endif
 	if (type == T_PAGEFLT) {
 		/* Decode the page fault error code bits. */
 		printf("fault virtual address	= 0x%lx\n", eva);
 		printf("fault code		= %s %s %s%s, %s\n",
 			code & PGEX_U ? "user" : "supervisor",
 			code & PGEX_W ? "write" : "read",
 			code & PGEX_I ? "instruction" : "data",
 			code & PGEX_RSV ? " rsv" : "",
 			code & PGEX_P ? "protection violation" : "page not present");
 	}
 	printf("instruction pointer	= 0x%lx:0x%lx\n",
 	       frame->tf_cs & 0xffff, frame->tf_rip);
 	ss = frame->tf_ss & 0xffff;
 	printf("stack pointer	        = 0x%x:0x%lx\n", ss, frame->tf_rsp);
 	printf("frame pointer	        = 0x%x:0x%lx\n", ss, frame->tf_rbp);
 	printf("code segment		= base 0x%lx, limit 0x%lx, type 0x%x\n",
 	       softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
 	printf("			= DPL %d, pres %d, long %d, def32 %d, gran %d\n",
 	       softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_long, softseg.ssd_def32,
 	       softseg.ssd_gran);
 	printf("processor eflags	= ");
 	if (frame->tf_rflags & PSL_T)
 		printf("trace trap, ");
 	if (frame->tf_rflags & PSL_I)
 		printf("interrupt enabled, ");
 	if (frame->tf_rflags & PSL_NT)
 		printf("nested task, ");
 	if (frame->tf_rflags & PSL_RF)
 		printf("resume, ");
 	printf("IOPL = %ld\n", (frame->tf_rflags & PSL_IOPL) >> 12);
 	printf("current process		= %d (%s)\n",
 	    curproc->p_pid, curthread->td_name);
 
 #ifdef KDB
 	/* Let the debugger take the trap; if it does, do not panic. */
 	if (debugger_on_panic || kdb_active)
 		if (kdb_trap(type, 0, frame))
 			return;
 #endif
 	printf("trap number		= %d\n", type);
 	if (type <= MAX_TRAP_MSG)
 		panic("%s", trap_msg[type]);
 	else
 		panic("unknown/reserved trap");
 }
 
 /*
  * Double fault handler. Called when a fault occurs while writing
  * a frame for a trap/exception onto the stack. This usually occurs
  * when the stack overflows (such is the case with infinite recursion,
  * for example).
  *
  * Prints the saved %rip, %rsp and %rbp from 'frame' (plus CPU
  * identification on SMP kernels) and then panics.
  */
 void
 dblfault_handler(struct trapframe *frame)
 {
 #ifdef KDTRACE_HOOKS
 	/* Notify DTrace first if it has registered a double-trap hook. */
 	if (dtrace_doubletrap_func != NULL)
 		(*dtrace_doubletrap_func)();
 #endif
 	printf("\nFatal double fault\n");
 	printf("rip = 0x%lx\n", frame->tf_rip);
 	printf("rsp = 0x%lx\n", frame->tf_rsp);
 	printf("rbp = 0x%lx\n", frame->tf_rbp);
 #ifdef SMP
 	/* two separate prints in case of a trap on an unmapped page */
 	printf("cpuid = %d; ", PCPU_GET(cpuid));
 	printf("apic id = %02x\n", PCPU_GET(apic_id));
 #endif
 	panic("double fault");
 }
 
 int
 cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
 {
 	struct proc *p;
 	struct trapframe *frame;
 	register_t *argp;
 	caddr_t params;
 	int reg, regcnt, error;
 
 	p = td->td_proc;
 	frame = td->td_frame;
 	reg = 0;
 	regcnt = 6;
 
 	params = (caddr_t)frame->tf_rsp + sizeof(register_t);
 	sa->code = frame->tf_rax;
 
 	if (sa->code == SYS_syscall || sa->code == SYS___syscall) {
 		sa->code = frame->tf_rdi;
 		reg++;
 		regcnt--;
 	}
  	if (p->p_sysent->sv_mask)
  		sa->code &= p->p_sysent->sv_mask;
 
  	if (sa->code >= p->p_sysent->sv_size)
  		sa->callp = &p->p_sysent->sv_table[0];
   	else
  		sa->callp = &p->p_sysent->sv_table[sa->code];
 
 	sa->narg = sa->callp->sy_narg;
 	KASSERT(sa->narg <= sizeof(sa->args) / sizeof(sa->args[0]),
 	    ("Too many syscall arguments!"));
 	error = 0;
 	argp = &frame->tf_rdi;
 	argp += reg;
 	bcopy(argp, sa->args, sizeof(sa->args[0]) * regcnt);
 	if (sa->narg > regcnt) {
 		KASSERT(params != NULL, ("copyin args with no params!"));
 		error = copyin(params, &sa->args[regcnt],
 	    	    (sa->narg - regcnt) * sizeof(sa->args[0]));
 	}
 
 	if (error == 0) {
 		td->td_retval[0] = 0;
 		td->td_retval[1] = frame->tf_rdx;
 	}
 
 	return (error);
 }
 
 #include "../../kern/subr_syscall.c"
 
 /*
  * System call handler for native binaries.  The trap frame is already
  * set up by the assembler trampoline and a pointer to it is saved in
  * td_frame.
  *
  * 'traced' is non-zero when the system call was entered with single-step
  * tracing active; in that case PSL_T is cleared from the saved rflags and
  * a SIGTRAP/TRAP_TRACE is posted once the call has been dispatched.
  */
 void
 amd64_syscall(struct thread *td, int traced)
 {
 	struct syscall_args sa;
 	int error;
 	ksiginfo_t ksi;
 
 #ifdef DIAGNOSTIC
 	if (!TRAPF_USERMODE(td->td_frame)) {
 		panic("syscall");
 		/* NOT REACHED */
 	}
 #endif
 	error = syscallenter(td, &sa);
 
 	/*
 	 * Traced syscall.
 	 */
 	if (__predict_false(traced)) {
 		td->td_frame->tf_rflags &= ~PSL_T;
 		ksiginfo_init_trap(&ksi);
 		ksi.ksi_signo = SIGTRAP;
 		ksi.ksi_code = TRAP_TRACE;
 		ksi.ksi_addr = (void *)td->td_frame->tf_rip;
 		trapsignal(td, &ksi);
 	}
 
 	/* Sanity checks: the syscall must not leak per-thread kernel state. */
 	KASSERT(PCB_USER_FPU(td->td_pcb),
 	    ("System call %s returning with kernel FPU ctx leaked",
 	     syscallname(td->td_proc, sa.code)));
 	KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td),
 	    ("System call %s returning with mangled pcb_save",
 	     syscallname(td->td_proc, sa.code)));
 	KASSERT(td->td_md.md_invl_gen.gen == 0,
 	    ("System call %s returning with leaked invl_gen %lu",
 	    syscallname(td->td_proc, sa.code), td->td_md.md_invl_gen.gen));
 
 	syscallret(td, error, &sa);
 
 	/*
 	 * If the user-supplied value of %rip is not a canonical
 	 * address, then some CPUs will trigger a ring 0 #GP during
 	 * the sysret instruction.  However, the fault handler would
 	 * execute in ring 0 with the user's %gs and %rsp which would
 	 * not be safe.  Instead, use the full return path which
 	 * catches the problem safely.
 	 */
 	if (td->td_frame->tf_rip >= VM_MAXUSER_ADDRESS)
 		set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 }
Index: user/alc/PQ_LAUNDRY/sys/arm/allwinner/aw_rsb.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/allwinner/aw_rsb.c	(revision 307895)
+++ user/alc/PQ_LAUNDRY/sys/arm/allwinner/aw_rsb.c	(revision 307896)
@@ -1,479 +1,498 @@
 /*-
  * Copyright (c) 2016 Jared McNeill <jmcneill@invisible.ca>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 /*
- * Allwinner RSB (Reduced Serial Bus)
+ * Allwinner RSB (Reduced Serial Bus) and P2WI (Push-Pull Two Wire Interface)
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/rman.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <machine/bus.h>
 
 #include <dev/ofw/ofw_bus.h>
 #include <dev/ofw/ofw_bus_subr.h>
 
 #include <dev/iicbus/iiconf.h>
 #include <dev/iicbus/iicbus.h>
 
 #include <dev/extres/clk/clk.h>
 #include <dev/extres/hwreset/hwreset.h>
 
 #include "iicbus_if.h"
 
 /*
  * Controller register offsets.  By this file's layout convention, the
  * names indented by one extra space are bit fields or values that belong
  * to the register defined just above them.
  */
 #define	RSB_CTRL		0x00
 #define	 START_TRANS		(1 << 7)
 #define	 GLOBAL_INT_ENB		(1 << 1)
 #define	 SOFT_RESET		(1 << 0)
 #define	RSB_CCR		0x04
 #define	RSB_INTE		0x08
 #define	RSB_INTS		0x0c
 #define	 INT_TRANS_ERR_ID(x)	(((x) >> 8) & 0xf)
 #define	 INT_LOAD_BSY		(1 << 2)
 #define	 INT_TRANS_ERR		(1 << 1)
 #define	 INT_TRANS_OVER		(1 << 0)
 #define	 INT_MASK		(INT_LOAD_BSY|INT_TRANS_ERR|INT_TRANS_OVER)
 #define	RSB_DADDR0		0x10
 #define	RSB_DADDR1		0x14
 #define	RSB_DLEN		0x18
 #define	 DLEN_READ		(1 << 4)
 #define	RSB_DATA0		0x1c
 #define	RSB_DATA1		0x20
 #define	RSB_CMD			0x2c
 #define	 CMD_SRTA		0xe8
 #define	 CMD_RD8		0x8b
 #define	 CMD_RD16		0x9c
 #define	 CMD_RD32		0xa6
 #define	 CMD_WR8		0x4e
 #define	 CMD_WR16		0x59
 #define	 CMD_WR32		0x63
 #define	RSB_DAR			0x30
 #define	 DAR_RTA		(0xff << 16)
 #define	 DAR_RTA_SHIFT		16
 #define	 DAR_DA			(0xffff << 0)
 #define	 DAR_DA_SHIFT		0
 
 /*
  * RSB_MAXLEN is the largest second-message length rsb_transfer() accepts.
  * RSB_RESET_RETRY is the number of times rsb_reset() polls SOFT_RESET.
  * RSB_I2C_TIMEOUT is the number of polling iterations in rsb_start(); each
  * iteration waits DELAY((1000 * hz) / RSB_I2C_TIMEOUT), i.e. DELAY(1000).
  */
 #define	RSB_MAXLEN		8
 #define	RSB_RESET_RETRY		100
 #define	RSB_I2C_TIMEOUT		hz
 
 /* Device addresses that rsb_rtamap[] maps to run-time addresses. */
 #define	RSB_ADDR_PMIC_PRIMARY	0x3a3
 #define	RSB_ADDR_PMIC_SECONDARY	0x745
 #define	RSB_ADDR_PERIPH_IC	0xe89
 
+#define	A31_P2WI	1
+#define	A23_RSB		2
+
+/*
+ * Supported "compatible" strings.  ocd_data carries the controller type
+ * (A31_P2WI or A23_RSB); the terminating entry uses 0, which rsb_probe()
+ * treats as "no match".
+ */
 static struct ofw_compat_data compat_data[] = {
-	{ "allwinner,sun8i-a23-rsb",		1 },
+	{ "allwinner,sun6i-a31-p2wi",		A31_P2WI },
+	{ "allwinner,sun8i-a23-rsb",		A23_RSB },
 	{ NULL,					0 }
 };
 
 /*
  * Resources requested in rsb_attach(): a single memory resource (rid 0),
  * which backs the RSB_READ()/RSB_WRITE() register accessors.
  */
 static struct resource_spec rsb_spec[] = {
 	{ SYS_RES_MEMORY,	0,	RF_ACTIVE },
 	{ -1, 0 }
 };
 
 /*
  * Device address to Run-time address mappings.
  *
  * Run-time address (RTA) is an 8-bit value used to address the device during
  * a read or write transaction. The following are valid RTAs:
  *  0x17 0x2d 0x3a 0x4e 0x59 0x63 0x74 0x8b 0x9c 0xa6 0xb1 0xc5 0xd2 0xe8 0xff
  *
  * Allwinner uses RTA 0x2d for the primary PMIC, 0x3a for the secondary PMIC,
  * and 0x4e for the peripheral IC (where applicable).
  *
  * The table is terminated by an entry whose rta is 0; rsb_set_rta() stops
  * searching there and rejects any device address it did not find.
  */
 static const struct {
 	uint16_t	addr;
 	uint8_t		rta;
 } rsb_rtamap[] = {
 	{ .addr = RSB_ADDR_PMIC_PRIMARY,	.rta = 0x2d },
 	{ .addr = RSB_ADDR_PMIC_SECONDARY,	.rta = 0x3a },
 	{ .addr = RSB_ADDR_PERIPH_IC,		.rta = 0x4e },
 	{ .addr = 0,				.rta = 0 }
 };
 
 /* Per-device driver state. */
 struct rsb_softc {
 	struct resource	*res;		/* registers, see RSB_READ/RSB_WRITE */
 	struct mtx	mtx;		/* taken by RSB_LOCK/RSB_UNLOCK */
 	clk_t		clk;		/* enabled in rsb_attach() if present */
 	hwreset_t	rst;		/* de-asserted in rsb_attach() if present */
 	device_t	iicbus;		/* child added in rsb_attach() */
 	int		busy;		/* non-zero while rsb_transfer() runs */
 	uint32_t	status;		/* RSB_INTS bits gathered by rsb_start() */
 	uint16_t	cur_addr;	/* address last set via rsb_set_rta() */
+	int		type;		/* A31_P2WI or A23_RSB */
 
 	struct iic_msg	*msg;		/* only ever reset to NULL in this file */
 };
 
 /* Softc mutex helpers and 32-bit register accessors on sc->res. */
 #define	RSB_LOCK(sc)			mtx_lock(&(sc)->mtx)
 #define	RSB_UNLOCK(sc)			mtx_unlock(&(sc)->mtx)
 #define	RSB_ASSERT_LOCKED(sc)		mtx_assert(&(sc)->mtx, MA_OWNED)
 #define	RSB_READ(sc, reg)		bus_read_4((sc)->res, (reg))
 #define	RSB_WRITE(sc, reg, val)	bus_write_4((sc)->res, (reg), (val))
 
 /*
  * ofw_bus_get_node method: every child is answered with the OFW node of
  * the controller itself ('bus'); 'dev' is not consulted.
  */
 static phandle_t
 rsb_get_node(device_t bus, device_t dev)
 {
 	phandle_t node;
 
 	node = ofw_bus_get_node(bus);
 
 	return (node);
 }
 
 /*
  * iicbus_reset method.  Sets SOFT_RESET in RSB_CTRL and polls, at most
  * RSB_RESET_RETRY times, for the bit to read back as clear.  The speed,
  * addr and oldaddr arguments are not used.
  *
  * Returns IIC_ENOADDR once the bit has cleared, or ETIMEDOUT (after logging
  * a message) if it is still set when the poll budget is exhausted.
  */
 static int
 rsb_reset(device_t dev, u_char speed, u_char addr, u_char *oldaddr)
 {
 	struct rsb_softc *softc = device_get_softc(dev);
 	int polls_left = RSB_RESET_RETRY;
 
 	RSB_LOCK(softc);
 	RSB_WRITE(softc, RSB_CTRL, SOFT_RESET);
 	while (polls_left > 0 &&
 	    (RSB_READ(softc, RSB_CTRL) & SOFT_RESET) != 0)
 		polls_left--;
 	RSB_UNLOCK(softc);
 
 	if (polls_left != 0)
 		return (IIC_ENOADDR);
 
 	device_printf(dev, "soft reset timeout\n");
 	return (ETIMEDOUT);
 }
 
 /*
  * Pack up to four bytes of buf, starting at index off, into a 32-bit word
  * with buf[off] in the least significant byte.  Bytes at index len or
  * beyond are never read; positions without a source byte stay zero.
  */
 static uint32_t
 rsb_encode(const uint8_t *buf, u_int len, u_int off)
 {
 	uint32_t word;
 	u_int i, shift, stop;
 
 	stop = MIN(len, 4 + off);
 	word = 0;
 	shift = 0;
 	for (i = off; i < stop; i++) {
 		word |= (uint32_t)buf[i] << shift;
 		shift += NBBY;
 	}
 
 	return (word);
 }
 
 /*
  * Unpack val into buf starting at index off, least significant byte
  * first.  At most four bytes are stored and nothing is written at index
  * len or beyond.
  */
 static void
 rsb_decode(const uint32_t val, uint8_t *buf, u_int len, u_int off)
 {
 	uint32_t rest;
 	u_int i, stop;
 
 	stop = MIN(len, 4 + off);
 	rest = val;
 	for (i = off; i < stop; i++) {
 		buf[i] = rest & 0xff;
 		rest >>= NBBY;
 	}
 }
 
 /*
  * Kick off the transaction that has been programmed into the controller
  * and poll RSB_INTS until INT_TRANS_OVER is seen.  The softc mutex must be
  * held.  Status bits read while polling are OR-ed into sc->status.
  *
  * Returns 0 on completion, ETIMEDOUT if INT_TRANS_OVER was not seen within
  * RSB_I2C_TIMEOUT polls, or EIO if the controller flagged INT_TRANS_ERR.
  */
 static int
 rsb_start(device_t dev)
 {
 	struct rsb_softc *sc;
 	int error, retry;
 
 	sc = device_get_softc(dev);
 
 	RSB_ASSERT_LOCKED(sc);
 
 	/* Start the transfer */
 	RSB_WRITE(sc, RSB_CTRL, GLOBAL_INT_ENB | START_TRANS);
 
 	/* Wait for transfer to complete */
 	error = ETIMEDOUT;
 	for (retry = RSB_I2C_TIMEOUT; retry > 0; retry--) {
 		sc->status |= RSB_READ(sc, RSB_INTS);
 		if ((sc->status & INT_TRANS_OVER) != 0) {
 			error = 0;
 			break;
 		}
 		DELAY((1000 * hz) / RSB_I2C_TIMEOUT);
 	}
 
 	/*
 	 * Report a transfer error flagged by the controller.  The previous
 	 * test here (error == 0 with INT_TRANS_OVER clear) could never be
 	 * true, because error is only set to 0 when INT_TRANS_OVER is set,
 	 * so errors were never logged or turned into EIO.
 	 */
 	if ((sc->status & INT_TRANS_ERR) != 0) {
 		device_printf(dev, "transfer error, status 0x%08x\n",
 		    sc->status);
 		error = EIO;
 	}
 
 	return (error);
 }
 
 /*
  * Assign the run-time address that rsb_rtamap[] associates with device
  * address addr by issuing a CMD_SRTA transaction.  The softc mutex must be
  * held.
  *
  * Returns ENXIO when addr has no entry in rsb_rtamap[]; otherwise returns
  * whatever rsb_start() reports for the transaction.
  */
 static int
 rsb_set_rta(device_t dev, uint16_t addr)
 {
 	struct rsb_softc *softc;
 	uint8_t rta;
 	int idx;
 
 	softc = device_get_softc(dev);
 
 	RSB_ASSERT_LOCKED(softc);
 
 	/* Walk the map up to its rta == 0 terminator. */
 	rta = 0;
 	idx = 0;
 	while (rsb_rtamap[idx].rta != 0) {
 		if (rsb_rtamap[idx].addr == addr) {
 			rta = rsb_rtamap[idx].rta;
 			break;
 		}
 		idx++;
 	}
 	if (rta == 0) {
 		device_printf(dev, "RTA not known for address %#x\n", addr);
 		return (ENXIO);
 	}
 
 	/* Acknowledge pending status bits, then program and run the command. */
 	RSB_WRITE(softc, RSB_INTS, RSB_READ(softc, RSB_INTS));
 	RSB_WRITE(softc, RSB_DAR,
 	    (addr << DAR_DA_SHIFT) | (rta << DAR_RTA_SHIFT));
 	RSB_WRITE(softc, RSB_CMD, CMD_SRTA);
 
 	return (rsb_start(dev));
 }
 
 /*
  * iicbus_transfer method.  Validates the two-message request described
  * below, serialises access to the controller through sc->busy, programs
  * the register offset (first message) and data (second message), runs the
  * transaction with rsb_start() and, for reads, copies RSB_DATA0 back into
  * the caller's buffer.
  *
  * Returns EINVAL for requests that do not fit the restrictions, or the
  * error reported by rsb_set_rta()/rsb_start(); 0 on success.
  */
 static int
 rsb_transfer(device_t dev, struct iic_msg *msgs, uint32_t nmsgs)
 {
 	struct rsb_softc *sc;
 	uint32_t daddr[2], data[2], dlen;
 	uint16_t device_addr;
 	uint8_t cmd;
 	int error;
 
 	sc = device_get_softc(dev);
 
 	/*
-	 * RSB is not really an I2C or SMBus controller, so there are some
-	 * restrictions imposed by the driver.
+	 * P2WI and RSB are not really I2C or SMBus controllers, so there are
+	 * some restrictions imposed by the driver.
 	 *
 	 * Transfers must contain exactly two messages. The first is always
 	 * a write, containing a single data byte offset. Data will either
 	 * be read from or written to the corresponding data byte in the
 	 * second message. The slave address in both messages must be the
 	 * same.
 	 */
 	if (nmsgs != 2 || (msgs[0].flags & IIC_M_RD) == IIC_M_RD ||
 	    (msgs[0].slave >> 1) != (msgs[1].slave >> 1) ||
 	    msgs[0].len != 1 || msgs[1].len > RSB_MAXLEN)
 		return (EINVAL);
 
-	/* The controller can read or write 1, 2, or 4 bytes at a time. */
-	if ((msgs[1].flags & IIC_M_RD) != 0) {
-		switch (msgs[1].len) {
-		case 1:
-			cmd = CMD_RD8;
-			break;
-		case 2:
-			cmd = CMD_RD16;
-			break;
-		case 4:
-			cmd = CMD_RD32;
-			break;
-		default:
-			return (EINVAL);
+	/* The RSB controller can read or write 1, 2, or 4 bytes at a time. */
+	if (sc->type == A23_RSB) {
+		if ((msgs[1].flags & IIC_M_RD) != 0) {
+			switch (msgs[1].len) {
+			case 1:
+				cmd = CMD_RD8;
+				break;
+			case 2:
+				cmd = CMD_RD16;
+				break;
+			case 4:
+				cmd = CMD_RD32;
+				break;
+			default:
+				return (EINVAL);
+			}
+		} else {
+			switch (msgs[1].len) {
+			case 1:
+				cmd = CMD_WR8;
+				break;
+			case 2:
+				cmd = CMD_WR16;
+				break;
+			case 4:
+				cmd = CMD_WR32;
+				break;
+			default:
+				return (EINVAL);
+			}
 		}
-	} else {
-		switch (msgs[1].len) {
-		case 1:
-			cmd = CMD_WR8;
-			break;
-		case 2:
-			cmd = CMD_WR16;
-			break;
-		case 4:	
-			cmd = CMD_WR32;
-			break;
-		default:
-			return (EINVAL);
-		}
 	}
 
 	/* Wait until no other transfer owns the controller, then claim it. */
 	RSB_LOCK(sc);
 	while (sc->busy)
 		mtx_sleep(sc, &sc->mtx, 0, "i2cbuswait", 0);
 	sc->busy = 1;
 	sc->status = 0;
 
 	/* Select current run-time address if necessary */
-	device_addr = msgs[0].slave >> 1;
-	if (sc->cur_addr != device_addr) {
-		error = rsb_set_rta(dev, device_addr);
-		if (error != 0)
-			goto done;
-		sc->cur_addr = device_addr;
-		sc->status = 0;
+	if (sc->type == A23_RSB) {
+		device_addr = msgs[0].slave >> 1;
+		if (sc->cur_addr != device_addr) {
+			error = rsb_set_rta(dev, device_addr);
+			if (error != 0)
+				goto done;
+			sc->cur_addr = device_addr;
+			sc->status = 0;
+		}
 	}
 
 	/* Clear interrupt status */
 	RSB_WRITE(sc, RSB_INTS, RSB_READ(sc, RSB_INTS));
 
 	/* Program data access address registers */
 	daddr[0] = rsb_encode(msgs[0].buf, msgs[0].len, 0);
 	RSB_WRITE(sc, RSB_DADDR0, daddr[0]);
 
 	/*
 	 * NOTE(review): only RSB_DATA0 is written here and read back below,
 	 * so at most four data bytes move per transfer even though lengths
 	 * up to RSB_MAXLEN pass the check above when sc->type is not A23_RSB.
 	 */
 	/* Write data */
 	if ((msgs[1].flags & IIC_M_RD) == 0) {
 		data[0] = rsb_encode(msgs[1].buf, msgs[1].len, 0);
 		RSB_WRITE(sc, RSB_DATA0, data[0]);
 	}
 
-	/* Set command type */
-	RSB_WRITE(sc, RSB_CMD, cmd);
+	/* Set command type for RSB */
+	if (sc->type == A23_RSB)
+		RSB_WRITE(sc, RSB_CMD, cmd);
 
 	/*
 	 * NOTE(review): msgs[0].len was validated to be exactly 1, so the
 	 * length programmed below is always 0 regardless of msgs[1].len.
 	 * Confirm against the controller documentation whether msgs[1].len
 	 * was intended.
 	 */
 	/* Program data length register and transfer direction */
 	dlen = msgs[0].len - 1;
 	if ((msgs[1].flags & IIC_M_RD) == IIC_M_RD)
 		dlen |= DLEN_READ;
 	RSB_WRITE(sc, RSB_DLEN, dlen);
 
 	/* Start transfer */
 	error = rsb_start(dev);
 	if (error != 0)
 		goto done;
 
 	/* Read data */
 	if ((msgs[1].flags & IIC_M_RD) == IIC_M_RD) {
 		data[0] = RSB_READ(sc, RSB_DATA0);
 		rsb_decode(data[0], msgs[1].buf, msgs[1].len, 0);
 	}
 
 done:
 	/* Release the controller and wake any thread sleeping on sc. */
 	sc->msg = NULL;
 	sc->busy = 0;
 	wakeup(sc);
 	RSB_UNLOCK(sc);
 
 	return (error);
 }
 
 /*
  * device_probe method.  Accepts enabled OFW nodes whose compatible string
  * is listed in compat_data and sets the device description accordingly;
  * everything else is rejected with ENXIO.
  */
 static int
 rsb_probe(device_t dev)
 {
 	if (!ofw_bus_status_okay(dev))
 		return (ENXIO);
 
-	if (ofw_bus_search_compatible(dev, compat_data)->ocd_data == 0)
+	switch (ofw_bus_search_compatible(dev, compat_data)->ocd_data) {
+	case A23_RSB:
+		device_set_desc(dev, "Allwinner RSB");
+		break;
+	case A31_P2WI:
+		device_set_desc(dev, "Allwinner P2WI");
+		break;
+	default:
 		return (ENXIO);
+	}
 
-	device_set_desc(dev, "Allwinner RSB");
 	return (BUS_PROBE_DEFAULT);
 }
 
 /*
  * device_attach method.  Initialises the softc mutex, enables the clock
  * and de-asserts the reset line when the OFW node provides them,
  * allocates the register resource and attaches an "iicbus" child.
  *
  * Returns 0 on success; on failure the acquired resources are released
  * and the error is returned.
  */
 static int
 rsb_attach(device_t dev)
 {
 	struct rsb_softc *sc;
 	int error;
 
 	sc = device_get_softc(dev);
 	mtx_init(&sc->mtx, device_get_nameunit(dev), "rsb", MTX_DEF);
+
+	sc->type = ofw_bus_search_compatible(dev, compat_data)->ocd_data;
 
 	/* A failed clock or reset lookup is tolerated; only enable errors fail. */
 	if (clk_get_by_ofw_index(dev, 0, 0, &sc->clk) == 0) {
 		error = clk_enable(sc->clk);
 		if (error != 0) {
 			device_printf(dev, "cannot enable clock\n");
 			goto fail;
 		}
 	}
 	if (hwreset_get_by_ofw_idx(dev, 0, 0, &sc->rst) == 0) {
 		error = hwreset_deassert(sc->rst);
 		if (error != 0) {
 			device_printf(dev, "cannot de-assert reset\n");
 			goto fail;
 		}
 	}
 
 	if (bus_alloc_resources(dev, rsb_spec, &sc->res) != 0) {
 		device_printf(dev, "cannot allocate resources for device\n");
 		error = ENXIO;
 		goto fail;
 	}
 
 	sc->iicbus = device_add_child(dev, "iicbus", -1);
 	if (sc->iicbus == NULL) {
 		device_printf(dev, "cannot add iicbus child device\n");
 		error = ENXIO;
 		goto fail;
 	}
 
 	/* NOTE(review): the return value of bus_generic_attach() is ignored. */
 	bus_generic_attach(dev);
 
 	return (0);
 
 fail:
 	/*
 	 * NOTE(review): this path is also reached before sc->res, sc->rst or
 	 * sc->clk have been set; it presumably relies on the softc being
 	 * zero-initialised -- confirm.
 	 */
 	bus_release_resources(dev, rsb_spec, &sc->res);
 	if (sc->rst != NULL)
 		hwreset_release(sc->rst);
 	if (sc->clk != NULL)
 		clk_release(sc->clk);
 	mtx_destroy(&sc->mtx);
 	return (error);
 }
 
 /*
  * Method table: probe/attach from this file, generic pass-through bus
  * methods, the OFW node lookup, and the iicbus reset/transfer entry points.
  */
 static device_method_t rsb_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_probe,		rsb_probe),
 	DEVMETHOD(device_attach,	rsb_attach),
 
 	/* Bus interface */
 	DEVMETHOD(bus_setup_intr,	bus_generic_setup_intr),
 	DEVMETHOD(bus_teardown_intr,	bus_generic_teardown_intr),
 	DEVMETHOD(bus_alloc_resource,	bus_generic_alloc_resource),
 	DEVMETHOD(bus_release_resource,	bus_generic_release_resource),
 	DEVMETHOD(bus_activate_resource, bus_generic_activate_resource),
 	DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource),
 	DEVMETHOD(bus_adjust_resource,	bus_generic_adjust_resource),
 	DEVMETHOD(bus_set_resource,	bus_generic_rl_set_resource),
 	DEVMETHOD(bus_get_resource,	bus_generic_rl_get_resource),
 
 	/* OFW methods */
 	DEVMETHOD(ofw_bus_get_node,	rsb_get_node),
 
 	/* iicbus interface */
 	DEVMETHOD(iicbus_callback,	iicbus_null_callback),
 	DEVMETHOD(iicbus_reset,		rsb_reset),
 	DEVMETHOD(iicbus_transfer,	rsb_transfer),
 
 	DEVMETHOD_END
 };
 
 /* Driver description; the driver name is "iichb". */
 static driver_t rsb_driver = {
 	"iichb",
 	rsb_methods,
 	sizeof(struct rsb_softc),
 };
 
 static devclass_t rsb_devclass;
 
 /*
  * Register iicbus as a child driver of rsb, and rsb as a child driver of
  * simplebus, both at BUS_PASS_RESOURCE + BUS_PASS_ORDER_MIDDLE.
  */
 EARLY_DRIVER_MODULE(iicbus, rsb, iicbus_driver, iicbus_devclass, 0, 0,
     BUS_PASS_RESOURCE + BUS_PASS_ORDER_MIDDLE);
 EARLY_DRIVER_MODULE(rsb, simplebus, rsb_driver, rsb_devclass, 0, 0,
     BUS_PASS_RESOURCE + BUS_PASS_ORDER_MIDDLE);
 MODULE_VERSION(rsb, 1);
Index: user/alc/PQ_LAUNDRY/sys/arm/allwinner/aw_thermal.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/allwinner/aw_thermal.c	(revision 307895)
+++ user/alc/PQ_LAUNDRY/sys/arm/allwinner/aw_thermal.c	(revision 307896)
@@ -1,561 +1,574 @@
 /*-
  * Copyright (c) 2016 Jared McNeill <jmcneill@invisible.ca>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 /*
  * Allwinner thermal sensor controller
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/rman.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/reboot.h>
 #include <sys/module.h>
 #include <sys/cpu.h>
+#include <sys/taskqueue.h>
 #include <machine/bus.h>
 
 #include <dev/ofw/ofw_bus.h>
 #include <dev/ofw/ofw_bus_subr.h>
 
 #include <dev/extres/clk/clk.h>
 #include <dev/extres/hwreset/hwreset.h>
 
 #include <arm/allwinner/aw_sid.h>
 
 #include "cpufreq_if.h"
 
 /*
  * Thermal sensor controller register offsets.  By this file's layout
  * convention, the names indented by one extra space are bit fields, masks
  * or shifts that belong to the register defined just above them.
  */
 #define	THS_CTRL0		0x00
 #define	THS_CTRL1		0x04
 #define	 ADC_CALI_EN		(1 << 17)
 #define	THS_CTRL2		0x40
 #define	 SENSOR_ACQ1_SHIFT	16
 #define	 SENSOR2_EN		(1 << 2)
 #define	 SENSOR1_EN		(1 << 1)
 #define	 SENSOR0_EN		(1 << 0)
 #define	THS_INTC		0x44
 #define	THS_INTS		0x48
 #define	 THS2_DATA_IRQ_STS	(1 << 10)
 #define	 THS1_DATA_IRQ_STS	(1 << 9)
 #define	 THS0_DATA_IRQ_STS	(1 << 8)
 #define	 SHUT_INT2_STS		(1 << 6)
 #define	 SHUT_INT1_STS		(1 << 5)
 #define	 SHUT_INT0_STS		(1 << 4)
 #define	 ALARM_INT2_STS		(1 << 2)
 #define	 ALARM_INT1_STS		(1 << 1)
 #define	 ALARM_INT0_STS		(1 << 0)
 /* Per-sensor registers below are indexed as base + sensor * 4. */
 #define	THS_ALARM0_CTRL		0x50
 #define	 ALARM_T_HOT_MASK	0xfff
 #define	 ALARM_T_HOT_SHIFT	16
 #define	 ALARM_T_HYST_MASK	0xfff
 #define	 ALARM_T_HYST_SHIFT	0
 #define	THS_SHUTDOWN0_CTRL	0x60
 #define	 SHUT_T_HOT_MASK	0xfff
 #define	 SHUT_T_HOT_SHIFT	16
 #define	THS_FILTER		0x70
 #define	THS_CALIB0		0x74
 #define	THS_CALIB1		0x78
 #define	THS_DATA0		0x80
 #define	THS_DATA1		0x84
 #define	THS_DATA2		0x88
 #define	 DATA_MASK		0xfff
 
 /*
  * Per-SoC settings.  The *_ADC_ACQUIRE_TIME, *_FILTER, *_INTC and
  * *_CLK_RATE values populate the matching aw_thermal_config; the
  * *_TEMP_BASE, *_TEMP_MUL and *_TEMP_DIV values are used by the SoC's
  * *_to_temp() conversion function.
  */
 #define	A83T_ADC_ACQUIRE_TIME	0x17
 #define	A83T_FILTER		0x4
 #define	A83T_INTC		0x1000
 #define	A83T_TEMP_BASE		2719000
 #define	A83T_TEMP_MUL		1000
 #define	A83T_TEMP_DIV		14186
 #define	A83T_CLK_RATE		24000000
 
 #define	A64_ADC_ACQUIRE_TIME	0x190
 #define	A64_FILTER		0x6
 #define	A64_INTC		0x18000
 #define	A64_TEMP_BASE		2170000
 #define	A64_TEMP_MUL		1000
 #define	A64_TEMP_DIV		8560
 #define	A64_CLK_RATE		4000000
 
 /* Note that H3_TEMP_DIV is negative and is not parenthesised. */
 #define	H3_ADC_ACQUIRE_TIME	0x3f
 #define	H3_FILTER		0x6
 #define	H3_INTC			0x191000
 #define	H3_TEMP_BASE		2794000
 #define	H3_TEMP_MUL		1000
 #define	H3_TEMP_DIV		-14882
 #define	H3_CLK_RATE		4000000
 
 /*
  * TEMP_C_TO_K is added to every converted temperature before it is
  * returned.  ALARM_INT_ALL covers only the sensor 0 alarm bit, whereas
  * SHUT_INT_ALL and SENSOR_ENABLE_ALL cover all three sensors.
  */
 #define	TEMP_C_TO_K		273
 #define	SENSOR_ENABLE_ALL	(SENSOR0_EN|SENSOR1_EN|SENSOR2_EN)
 #define	SHUT_INT_ALL		(SHUT_INT0_STS|SHUT_INT1_STS|SHUT_INT2_STS)
 #define	ALARM_INT_ALL		(ALARM_INT0_STS)
 
 /* Array bounds for aw_thermal_config.sensors and aw_thermal_softc.levels. */
 #define	MAX_SENSORS	3
 #define	MAX_CF_LEVELS	64
 
 #define	THROTTLE_ENABLE_DEFAULT	1
 
 /*
  * Enable thermal throttling.  Set from the hw.aw_thermal.throttle_enable
  * tunable; when 0, aw_thermal_cf_pre_change() never vetoes a frequency
  * change.
  */
 static int aw_thermal_throttle_enable = THROTTLE_ENABLE_DEFAULT;
 TUNABLE_INT("hw.aw_thermal.throttle_enable", &aw_thermal_throttle_enable);
 
 /* Naming for one sensor, used when its sysctl node is created. */
 struct aw_thermal_sensor {
 	const char		*name;	/* sysctl node name */
 	const char		*desc;	/* sysctl description */
 };
 
 /*
  * Per-SoC configuration, selected through compat_data.
  *
  * NOTE(review): adc_cali_en, temp_base, temp_mul and temp_div are not read
  * by any code visible in this file; aw_thermal_init() writes ADC_CALI_EN
  * unconditionally.
  */
 struct aw_thermal_config {
 	struct aw_thermal_sensor	sensors[MAX_SENSORS];
 	int				nsensors;	/* entries used in sensors[] */
 	uint64_t			clk_rate;	/* rate set on the "ths" clock */
 	uint32_t			adc_acquire_time; /* THS_CTRL0/THS_CTRL2 */
 	int				adc_cali_en;
 	uint32_t			filter;		/* written to THS_FILTER */
 	uint32_t			intc;		/* OR-ed into THS_INTC */
 	int				(*to_temp)(uint32_t); /* raw value -> deg C */
 	int				temp_base;
 	int				temp_mul;
 	int				temp_div;
 	/* Non-zero calibN: load calibration word N, masked by calibN_mask. */
 	int				calib0, calib1;
 	uint32_t			calib0_mask, calib1_mask;
 };
 
 /*
  * Convert a raw A83T sensor value to degrees Celsius.
  *
  * The scaled reading is converted to int before the subtraction, as
  * h3_to_temp() does.  Without the cast the whole expression is evaluated
  * in unsigned arithmetic, so a reading whose scaled value exceeds
  * A83T_TEMP_BASE (a temperature below zero) wraps around to a huge
  * positive result instead of a negative one.
  */
 static int
 a83t_to_temp(uint32_t val)
 {
 	return ((A83T_TEMP_BASE - (int)(val * A83T_TEMP_MUL)) /
 	    A83T_TEMP_DIV);
 }
 
 /*
  * A83T: three sensors, and both calibration words are loaded using the
  * full 32 bits of each.
  */
 static const struct aw_thermal_config a83t_config = {
 	.nsensors = 3,
 	.sensors = {
 		[0] = {
 			.name = "cluster0",
 			.desc = "CPU cluster 0 temperature",
 		},
 		[1] = {
 			.name = "cluster1",
 			.desc = "CPU cluster 1 temperature",
 		},
 		[2] = {
 			.name = "gpu",
 			.desc = "GPU temperature",
 		},
 	},
 	.clk_rate = A83T_CLK_RATE,
 	.adc_acquire_time = A83T_ADC_ACQUIRE_TIME,
 	.adc_cali_en = 1,
 	.filter = A83T_FILTER,
 	.intc = A83T_INTC,
 	.to_temp = a83t_to_temp,
 	.calib0 = 1,
 	.calib0_mask = 0xffffffff,
 	.calib1 = 1,
 	.calib1_mask = 0xffffffff,
 };
 
 /*
  * Convert a raw A64 sensor value to degrees Celsius.
  *
  * The scaled reading is converted to int before the subtraction, as
  * h3_to_temp() does.  Without the cast the whole expression is evaluated
  * in unsigned arithmetic, so a reading whose scaled value exceeds
  * A64_TEMP_BASE (a temperature below zero) wraps around to a huge
  * positive result instead of a negative one.
  */
 static int
 a64_to_temp(uint32_t val)
 {
 	return ((A64_TEMP_BASE - (int)(val * A64_TEMP_MUL)) /
 	    A64_TEMP_DIV);
 }
 
 /*
  * A64: three sensors.  calib0 and calib1 are left at zero, so
  * aw_thermal_init() does not read or load any calibration data.
  */
 static const struct aw_thermal_config a64_config = {
 	.nsensors = 3,
 	.sensors = {
 		[0] = {
 			.name = "cpu",
 			.desc = "CPU temperature",
 		},
 		[1] = {
 			.name = "gpu1",
 			.desc = "GPU temperature 1",
 		},
 		[2] = {
 			.name = "gpu2",
 			.desc = "GPU temperature 2",
 		},
 	},
 	.clk_rate = A64_CLK_RATE,
 	.adc_acquire_time = A64_ADC_ACQUIRE_TIME,
 	.filter = A64_FILTER,
 	.intc = A64_INTC,
 	.to_temp = a64_to_temp,
 };
 
 /*
  * Convert a raw H3 sensor value to degrees Celsius.  The scaled reading
  * is converted to int first so that the subtraction and the division by
  * the negative H3_TEMP_DIV are done in signed arithmetic.
  */
 static int
 h3_to_temp(uint32_t val)
 {
 	int scaled;
 
 	scaled = (int)(val * H3_TEMP_MUL);
 
 	return ((scaled - H3_TEMP_BASE) / H3_TEMP_DIV);
 }
 
 /*
  * H3: a single sensor; only calibration word 0 is loaded, restricted to
  * its low 12 bits.
  */
 static const struct aw_thermal_config h3_config = {
 	.nsensors = 1,
 	.sensors = {
 		[0] = {
 			.name = "cpu",
 			.desc = "CPU temperature",
 		},
 	},
 	.clk_rate = H3_CLK_RATE,
 	.adc_acquire_time = H3_ADC_ACQUIRE_TIME,
 	.filter = H3_FILTER,
 	.intc = H3_INTC,
 	.to_temp = h3_to_temp,
 	.calib0 = 1,
 	.calib0_mask = 0xfff,
 };
 
 /* Supported "compatible" strings; ocd_data points at the SoC's config. */
 static struct ofw_compat_data compat_data[] = {
 	{ "allwinner,sun8i-a83t-ts",	(uintptr_t)&a83t_config },
 	{ "allwinner,sun8i-h3-ts",	(uintptr_t)&h3_config },
 	{ "allwinner,sun50i-a64-ts",	(uintptr_t)&a64_config },
 	{ NULL,				(uintptr_t)NULL }
 };
 
 /*
  * Configuration for device d as a void pointer; NULL when the node matches
  * none of the entries above (the terminator's ocd_data is NULL).
  */
 #define	THS_CONF(d)		\
 	(void *)ofw_bus_search_compatible((d), compat_data)->ocd_data
 
 /* Per-device driver state. */
 struct aw_thermal_softc {
 	device_t			dev;	/* not assigned in aw_thermal_attach() */
 	struct resource			*res[2];	/* [0] registers, [1] IRQ */
 	struct aw_thermal_config	*conf;	/* from THS_CONF() */
 
+	struct task			cf_task;	/* runs aw_thermal_cf_task() */
 	int				throttle;	/* non-zero while throttled */
 	int				min_freq;	/* frequency set when throttling */
 	struct cf_level			levels[MAX_CF_LEVELS]; /* CPUFREQ_LEVELS output */
 	eventhandler_tag		cf_pre_tag;	/* cpufreq_pre_change handler */
 };
 
 /* Resources requested in attach: registers in res[0], interrupt in res[1]. */
 static struct resource_spec aw_thermal_spec[] = {
 	{ SYS_RES_MEMORY,	0,	RF_ACTIVE },
 	{ SYS_RES_IRQ,		0,	RF_ACTIVE },
 	{ -1, 0 }
 };
 
 /* 32-bit register accessors on the memory resource. */
 #define	RD4(sc, reg)		bus_read_4((sc)->res[0], (reg))
 #define	WR4(sc, reg, val)	bus_write_4((sc)->res[0], (reg), (val))
 
 static int
 aw_thermal_init(struct aw_thermal_softc *sc)
 {
 	uint32_t calib0, calib1;
 	int error;
 
 	if (sc->conf->calib0 != 0 || sc->conf->calib1 != 0) {
 		/* Read calibration settings from SRAM */
 		error = aw_sid_read_tscalib(&calib0, &calib1);
 		if (error != 0)
 			return (error);
 
 		calib0 &= sc->conf->calib0_mask;
 		calib1 &= sc->conf->calib1_mask;
 
 		/* Write calibration settings to thermal controller */
 		if (sc->conf->calib0 != 0 && calib0 != 0)
 			WR4(sc, THS_CALIB0, calib0);
 		if (sc->conf->calib1 != 0 && calib1 != 0)
 			WR4(sc, THS_CALIB1, calib1);
 	}
 
 	/* Configure ADC acquire time (CLK_IN/(N+1)) and enable sensors */
 	WR4(sc, THS_CTRL1, ADC_CALI_EN);
 	WR4(sc, THS_CTRL0, sc->conf->adc_acquire_time);
 	WR4(sc, THS_CTRL2, sc->conf->adc_acquire_time << SENSOR_ACQ1_SHIFT);
 
 	/* Enable average filter */
 	WR4(sc, THS_FILTER, sc->conf->filter);
 
 	/* Enable interrupts */
 	WR4(sc, THS_INTS, RD4(sc, THS_INTS));
 	WR4(sc, THS_INTC, sc->conf->intc | SHUT_INT_ALL | ALARM_INT_ALL);
 
 	/* Enable sensors */
 	WR4(sc, THS_CTRL2, RD4(sc, THS_CTRL2) | SENSOR_ENABLE_ALL);
 
 	return (0);
 }
 
 /*
  * Read the data register of the given sensor (THS_DATA0 + sensor * 4) and
  * return the converted temperature offset by TEMP_C_TO_K.
  */
 static int
 aw_thermal_gettemp(struct aw_thermal_softc *sc, int sensor)
 {
 	uint32_t raw;
 	int celsius;
 
 	raw = RD4(sc, THS_DATA0 + (sensor * 4));
 	celsius = sc->conf->to_temp(raw);
 
 	return (celsius + TEMP_C_TO_K);
 }
 
 /*
  * Return the shutdown threshold of the given sensor: the SHUT_T_HOT field
  * of its shutdown control register, converted and offset by TEMP_C_TO_K.
  */
 static int
 aw_thermal_getshut(struct aw_thermal_softc *sc, int sensor)
 {
 	uint32_t reg, raw;
 	int celsius;
 
 	reg = RD4(sc, THS_SHUTDOWN0_CTRL + (sensor * 4));
 	raw = (reg >> SHUT_T_HOT_SHIFT) & SHUT_T_HOT_MASK;
 	celsius = sc->conf->to_temp(raw);
 
 	return (celsius + TEMP_C_TO_K);
 }
 
 /*
  * Return the alarm hysteresis value of the given sensor: the ALARM_T_HYST
  * field of its alarm control register, converted and offset by
  * TEMP_C_TO_K.
  */
 static int
 aw_thermal_gethyst(struct aw_thermal_softc *sc, int sensor)
 {
 	uint32_t reg, raw;
 	int celsius;
 
 	reg = RD4(sc, THS_ALARM0_CTRL + (sensor * 4));
 	raw = (reg >> ALARM_T_HYST_SHIFT) & ALARM_T_HYST_MASK;
 	celsius = sc->conf->to_temp(raw);
 
 	return (celsius + TEMP_C_TO_K);
 }
 
 /*
  * Return the alarm threshold of the given sensor: the ALARM_T_HOT field of
  * its alarm control register, converted and offset by TEMP_C_TO_K.
  */
 static int
 aw_thermal_getalarm(struct aw_thermal_softc *sc, int sensor)
 {
 	uint32_t reg, raw;
 	int celsius;
 
 	reg = RD4(sc, THS_ALARM0_CTRL + (sensor * 4));
 	raw = (reg >> ALARM_T_HOT_SHIFT) & ALARM_T_HOT_MASK;
 	celsius = sc->conf->to_temp(raw);
 
 	return (celsius + TEMP_C_TO_K);
 }
 
 /*
  * Sysctl handler for one sensor.  arg1 is the softc and arg2 the sensor
  * index; the current reading from aw_thermal_gettemp() is handed to
  * sysctl_handle_opaque().
  */
 static int
 aw_thermal_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct aw_thermal_softc *softc;
 	int idx, reading;
 
 	softc = arg1;
 	idx = arg2;
 	reading = aw_thermal_gettemp(softc, idx);
 
 	return (sysctl_handle_opaque(oidp, &reading, sizeof(reading), req));
 }
 
 static void
 aw_thermal_throttle(struct aw_thermal_softc *sc, int enable)
 {
 	device_t cf_dev;
 	int count, error;
 
 	if (enable == sc->throttle)
 		return;
 
 	if (enable != 0) {
 		/* Set the lowest available frequency */
 		cf_dev = devclass_get_device(devclass_find("cpufreq"), 0);
 		if (cf_dev == NULL)
 			return;
 		count = MAX_CF_LEVELS;
 		error = CPUFREQ_LEVELS(cf_dev, sc->levels, &count);
 		if (error != 0 || count == 0)
 			return;
 		sc->min_freq = sc->levels[count - 1].total_set.freq;
 		error = CPUFREQ_SET(cf_dev, &sc->levels[count - 1],
 		    CPUFREQ_PRIO_USER);
 		if (error != 0)
 			return;
 	}
 
 	sc->throttle = enable;
 }
 
+/*
+ * Task handler queued by aw_thermal_intr() on an alarm interrupt: enables
+ * throttling.  arg is the softc; pending is not used.
+ */
 static void
+aw_thermal_cf_task(void *arg, int pending)
+{
+	struct aw_thermal_softc *sc;
+
+	sc = arg;
+
+	aw_thermal_throttle(sc, 1);
+}
+
 /*
  * cpufreq_pre_change event handler.  While throttling is active, a change
  * to any frequency other than sc->min_freq is vetoed (*status = ENXIO)
  * unless the sensor 0 temperature has dropped below its alarm threshold,
  * in which case throttling is turned off and the change is allowed.
  */
+static void
 aw_thermal_cf_pre_change(void *arg, const struct cf_level *level, int *status)
 {
 	struct aw_thermal_softc *sc;
 	int temp_cur, temp_alarm;
 
 	sc = arg;
 
 	/* Not throttling, or the request is for the throttled frequency. */
 	if (aw_thermal_throttle_enable == 0 || sc->throttle == 0 ||
 	    level->total_set.freq == sc->min_freq)
 		return;
 
 	/* Only sensor 0 is consulted. */
 	temp_cur = aw_thermal_gettemp(sc, 0);
 	temp_alarm = aw_thermal_getalarm(sc, 0);
 
 	if (temp_cur < temp_alarm)
 		aw_thermal_throttle(sc, 0);
 	else
 		*status = ENXIO;
 }
 
 /*
  * Interrupt handler; arg is the device.  Pending status bits are read and
  * written back to THS_INTS.  A shutdown interrupt logs a warning and
  * requests a power-off; an alarm interrupt schedules throttling.
  */
 static void
 aw_thermal_intr(void *arg)
 {
 	struct aw_thermal_softc *sc;
 	device_t dev;
 	uint32_t ints;
 
 	dev = arg;
 	sc = device_get_softc(dev);
 
 	ints = RD4(sc, THS_INTS);
 	WR4(sc, THS_INTS, ints);
 
 	if ((ints & SHUT_INT_ALL) != 0) {
 		device_printf(dev,
 		    "WARNING - current temperature exceeds safe limits\n");
 		shutdown_nice(RB_POWEROFF);
 	}
 
 	/*
 	 * Throttling is deferred to taskqueue_thread rather than done here;
 	 * presumably the cpufreq calls are not safe from the interrupt
 	 * handler -- confirm.
 	 */
 	if ((ints & ALARM_INT_ALL) != 0)
-		aw_thermal_throttle(sc, 1);
+		taskqueue_enqueue(taskqueue_thread, &sc->cf_task);
 }
 
 /*
  * device_probe method.  Matches enabled OFW nodes for which THS_CONF()
  * finds a configuration; all other nodes are rejected with ENXIO.
  */
 static int
 aw_thermal_probe(device_t dev)
 {
 	if (!ofw_bus_status_okay(dev) || THS_CONF(dev) == NULL)
 		return (ENXIO);
 
 	device_set_desc(dev, "Allwinner Thermal Sensor Controller");
 
 	return (BUS_PROBE_DEFAULT);
 }
 
 /*
  * device_attach method.  Allocates the register and interrupt resources,
  * enables the "ahb" and "ths" clocks and de-asserts the reset when the
  * OFW node provides them, installs the interrupt handler, programs the
  * controller, creates one read-only sysctl node per sensor and registers
  * the cpufreq_pre_change handler.
  *
  * Returns 0 on success and ENXIO on any failure.
  */
 static int
 aw_thermal_attach(device_t dev)
 {
 	struct aw_thermal_softc *sc;
 	clk_t clk_ahb, clk_ths;
 	hwreset_t rst;
 	int i, error;
 	void *ih;
 
 	sc = device_get_softc(dev);
 	clk_ahb = clk_ths = NULL;
 	rst = NULL;
 	ih = NULL;
 
 	sc->conf = THS_CONF(dev);
+	TASK_INIT(&sc->cf_task, 0, aw_thermal_cf_task, sc);
 
 	if (bus_alloc_resources(dev, aw_thermal_spec, sc->res) != 0) {
 		device_printf(dev, "cannot allocate resources for device\n");
 		return (ENXIO);
 	}
 
 	/* A failed clock or reset lookup is tolerated; only later errors fail. */
 	if (clk_get_by_ofw_name(dev, 0, "ahb", &clk_ahb) == 0) {
 		error = clk_enable(clk_ahb);
 		if (error != 0) {
 			device_printf(dev, "cannot enable ahb clock\n");
 			goto fail;
 		}
 	}
 	if (clk_get_by_ofw_name(dev, 0, "ths", &clk_ths) == 0) {
 		error = clk_set_freq(clk_ths, sc->conf->clk_rate, 0);
 		if (error != 0) {
 			device_printf(dev, "cannot set ths clock rate\n");
 			goto fail;
 		}
 		error = clk_enable(clk_ths);
 		if (error != 0) {
 			device_printf(dev, "cannot enable ths clock\n");
 			goto fail;
 		}
 	}
 	if (hwreset_get_by_ofw_idx(dev, 0, 0, &rst) == 0) {
 		error = hwreset_deassert(rst);
 		if (error != 0) {
 			device_printf(dev, "cannot de-assert reset\n");
 			goto fail;
 		}
 	}
 
 	error = bus_setup_intr(dev, sc->res[1], INTR_TYPE_MISC | INTR_MPSAFE,
 	    NULL, aw_thermal_intr, dev, &ih);
 	if (error != 0) {
 		device_printf(dev, "cannot setup interrupt handler\n");
 		goto fail;
 	}
 
 	if (aw_thermal_init(sc) != 0)
 		goto fail;
 
 	/* One read-only sysctl node per sensor; arg2 carries the index. */
 	for (i = 0; i < sc->conf->nsensors; i++)
 		SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
 		    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
 		    OID_AUTO, sc->conf->sensors[i].name,
 		    CTLTYPE_INT | CTLFLAG_RD,
 		    sc, i, aw_thermal_sysctl, "IK0",
 		    sc->conf->sensors[i].desc);
 
 	if (bootverbose)
 		for (i = 0; i < sc->conf->nsensors; i++) {
 			device_printf(dev,
 			    "#%d: alarm %dC hyst %dC shut %dC\n", i,
 			    aw_thermal_getalarm(sc, i) - TEMP_C_TO_K,
 			    aw_thermal_gethyst(sc, i) - TEMP_C_TO_K,
 			    aw_thermal_getshut(sc, i) - TEMP_C_TO_K);
 		}
 
 	sc->cf_pre_tag = EVENTHANDLER_REGISTER(cpufreq_pre_change,
 	    aw_thermal_cf_pre_change, sc, EVENTHANDLER_PRI_FIRST);
 
 	return (0);
 
 fail:
 	/*
 	 * The specific error code is discarded; every failure reports ENXIO.
 	 * NOTE(review): clocks are released here but not disabled.
 	 */
 	if (ih != NULL)
 		bus_teardown_intr(dev, sc->res[1], ih);
 	if (rst != NULL)
 		hwreset_release(rst);
 	if (clk_ahb != NULL)
 		clk_release(clk_ahb);
 	if (clk_ths != NULL)
 		clk_release(clk_ths);
 	bus_release_resources(dev, aw_thermal_spec, sc->res);
 
 	return (ENXIO);
 }
 
 /* Method table: only probe and attach are provided. */
 static device_method_t aw_thermal_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_probe,		aw_thermal_probe),
 	DEVMETHOD(device_attach,	aw_thermal_attach),
 
 	DEVMETHOD_END
 };
 
 static driver_t aw_thermal_driver = {
 	"aw_thermal",
 	aw_thermal_methods,
 	sizeof(struct aw_thermal_softc),
 };
 
 static devclass_t aw_thermal_devclass;
 
 /* Register aw_thermal as a child driver of simplebus. */
 DRIVER_MODULE(aw_thermal, simplebus, aw_thermal_driver, aw_thermal_devclass,
     0, 0);
 MODULE_VERSION(aw_thermal, 1);
Index: user/alc/PQ_LAUNDRY/sys/arm/allwinner/files.allwinner
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/allwinner/files.allwinner	(revision 307895)
+++ user/alc/PQ_LAUNDRY/sys/arm/allwinner/files.allwinner	(revision 307896)
@@ -1,57 +1,57 @@
 # $FreeBSD$
 kern/kern_clocksource.c			standard
 
 arm/allwinner/a10_ahci.c		optional	ahci
 arm/allwinner/a10_codec.c		optional	sound
 arm/allwinner/a10_common.c		standard
 arm/allwinner/a10_dmac.c		standard
 arm/allwinner/a10_ehci.c		optional	ehci
 arm/allwinner/aw_usbphy.c		optional	ehci
 arm/allwinner/a10_gpio.c		optional	gpio
 arm/allwinner/a10_mmc.c			optional	mmc
 arm/allwinner/a10_sramc.c		standard
 arm/allwinner/aw_nmi.c			optional	intrng
 arm/allwinner/aw_if_dwc.c		optional	dwc
-arm/allwinner/aw_rsb.c			optional	rsb
+arm/allwinner/aw_rsb.c			optional	rsb | p2wi
 arm/allwinner/aw_rtc.c			standard
 arm/allwinner/aw_ts.c			standard
 arm/allwinner/aw_wdog.c			standard
 arm/allwinner/aw_machdep.c		standard
 arm/allwinner/aw_mp.c			optional	smp
 arm/allwinner/axp209.c			optional	axp209
 arm/allwinner/axp81x.c			optional	axp81x
 arm/allwinner/if_awg.c			optional	awg
 arm/allwinner/if_emac.c			optional	emac
 arm/allwinner/sunxi_dma_if.m		standard
 dev/iicbus/twsi/a10_twsi.c		optional	twsi
 dev/usb/controller/generic_ohci.c	optional	ohci
 dev/usb/controller/generic_usb_if.m	optional	ohci
 arm/allwinner/aw_sid.c			standard
 arm/allwinner/aw_thermal.c		standard
 dev/iicbus/sy8106a.c			optional	sy8106a
 #arm/allwinner/console.c		standard
 
 arm/allwinner/a10_fb.c			optional	vt
 arm/allwinner/a10_hdmi.c		optional	hdmi
 arm/allwinner/a10_hdmiaudio.c		optional	hdmi sound
 arm/arm/hdmi_if.m			optional	hdmi
 
 arm/allwinner/aw_reset.c		standard
 arm/allwinner/aw_ccu.c			standard
 arm/allwinner/clk/aw_ahbclk.c		standard
 arm/allwinner/clk/aw_apbclk.c		standard
 arm/allwinner/clk/aw_axiclk.c		standard
 arm/allwinner/clk/aw_codecclk.c		standard
 arm/allwinner/clk/aw_cpuclk.c		standard
 arm/allwinner/clk/aw_cpusclk.c		standard
 arm/allwinner/clk/aw_debeclk.c		standard
 arm/allwinner/clk/aw_gate.c		standard
 arm/allwinner/clk/aw_gmacclk.c		standard
 arm/allwinner/clk/aw_hdmiclk.c		standard
 arm/allwinner/clk/aw_lcdclk.c		standard
 arm/allwinner/clk/aw_modclk.c		standard
 arm/allwinner/clk/aw_mmcclk.c		standard
 arm/allwinner/clk/aw_oscclk.c		standard
 arm/allwinner/clk/aw_pll.c		standard
 arm/allwinner/clk/aw_thsclk.c		standard
 arm/allwinner/clk/aw_usbclk.c		standard
Index: user/alc/PQ_LAUNDRY/sys/arm/conf/GENERIC
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/conf/GENERIC	(revision 307895)
+++ user/alc/PQ_LAUNDRY/sys/arm/conf/GENERIC	(revision 307896)
@@ -1,176 +1,178 @@
 #
 # GENERIC -- Generic(ish) kernel config.
 #
 # For more information on this file, please read the config(5) manual page,
 # and/or the handbook section on Kernel Configuration Files:
 #
 #    http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html
 #
 # The handbook is also available locally in /usr/share/doc/handbook
 # if you've installed the doc distribution, otherwise always see the
 # FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the
 # latest information.
 #
 # An exhaustive list of options and more detailed explanations of the
 # device lines is also present in the ../../conf/NOTES and NOTES files.
 # If you are in doubt as to the purpose or necessity of a line, check first
 # in NOTES.
 #
 # $FreeBSD$
 
 ident		GENERIC
 
 cpu		CPU_CORTEXA_MP
 machine 	arm armv6
 makeoptions	CONF_CFLAGS="-march=armv7a"
 
 makeoptions	KERNVIRTADDR=0xc0000000
 options 	KERNVIRTADDR=0xc0000000
 
 include 	"std.armv6"
 files		"../allwinner/files.allwinner"
 files		"../allwinner/a20/files.a20"
 files		"../allwinner/a31/files.a31"
 files		"../allwinner/a83t/files.a83t"
 files		"../allwinner/h3/files.h3"
 files		"../broadcom/bcm2835/files.bcm2836"
 files		"../broadcom/bcm2835/files.bcm283x"
 files		"../nvidia/tegra124/files.tegra124"
 files		"../qemu/files.qemu"
 
 options 	SOC_ALLWINNER_A20
 options 	SOC_ALLWINNER_A31
 options 	SOC_ALLWINNER_A31S
 options 	SOC_ALLWINNER_A83T
 options 	SOC_ALLWINNER_H3
 options 	SOC_BCM2836
 
 options 	SCHED_ULE		# ULE scheduler
 options 	SMP			# Enable multiple cores
 options 	PLATFORM
 options 	PLATFORM_SMP
 options 	MULTIDELAY
 options 	LINUX_BOOT_ABI
 
 # EXT_RESOURCES pseudo devices
 options 	EXT_RESOURCES
 device		clk
 device		phy
 device		hwreset
 device		regulator
 
 # CPU frequency control
 device		cpufreq
 
 # Interrupt controller
 options 	INTRNG
 device		gic
 
 # ARM Generic Timer
 device		generic_timer
 
 # MMC/SD/SDIO Card slot support
 device		sdhci			# SD controller
 device		mmc			# mmc/sd bus
 device		mmcsd			# mmc/sd flash cards
 
 # ATA controllers
 device		ahci			# AHCI-compatible SATA controllers
 #device		ata			# Legacy ATA/SATA controllers
 
 # PCI
 options 	NEW_PCIB
 device		pci
 
 # PCI NICs
 device		re			# RealTek 8139C+/8169/8169S/8110S
 
 # VirtIO
 device		virtio
 device		virtio_mmio
 device		virtio_blk
 device		vtnet
 
 # Console and misc
 device		uart
 device		uart_ns8250
 device		uart_snps
 device		pl011
 device		pty
 device		snp
 device		md			# Memory "disks"
 device		random			# Entropy device
 device		psci
 
 # I2C support
 device		iicbus
 device		iic
 device		twsi
-device		rsb
+device		rsb			# Allwinner Reduced Serial Bus
+device		p2wi			# Allwinner Push-Pull Two Wire
 device		axp209			# AXP209 Power Management Unit
 device		axp81x			# AXP813/818 Power Management Unit
 device		bcm2835_bsc
 device		icee
+device		sy8106a			# SY8106A Buck Regulator
 
 # GPIO
 device		gpio
 device		gpioled
 device		gpioregulator
 
 # SPI
 device		spibus
 device		bcm2835_spi
 
 device		scbus			# SCSI bus (required for ATA/SCSI)
 device		da			# Direct Access (disks)
 device		cd			# CD
 device		pass			# Passthrough device (direct ATA/SCSI access)
 
 # USB support
 options 	USB_HOST_ALIGN=64	# Align usb buffers to cache line size.
 device		usb
 #device		uhci
 device		ohci
 device		ehci
 device		dwcotg			# DWC OTG controller
 
 device		umass			# Disks/Mass storage - Requires scbus and da
 device		uhid			# "Human Interface Devices"
 device		ukbd			# Allow keyboard like HIDs to control console
 
 # Ethernet
 device		loop
 device		ether
 device		vlan			# 802.1Q VLAN support
 device		mii
 device		bpf
 
 #device		emac			# 10/100 integrated EMAC controller
 device		dwc			# 10/100/1000 integrated GMAC controller
 device		awg			# 10/100/1000 integrated EMAC controller
 
 # USB ethernet support, requires miibus
 device		smcphy
 device		smsc
 device		miibus
 
 # Sound support
 device		sound
 
 # Framebuffer support
 device		vt
 device		kbdmux
 device		ums
 device		videomode
 device		hdmi
 device		vchiq
 
 # Pinmux
 device		fdt_pinctrl
 
 # Extensible Firmware Interface
 options 	EFI
 
 # Flattened Device Tree
 options 	FDT			# Configure using FDT/DTB data
 makeoptions	MODULES_EXTRA="dtb/allwinner dtb/nvidia dtb/rpi"
Index: user/alc/PQ_LAUNDRY/sys/boot/efi/libefi/Makefile
===================================================================
--- user/alc/PQ_LAUNDRY/sys/boot/efi/libefi/Makefile	(revision 307895)
+++ user/alc/PQ_LAUNDRY/sys/boot/efi/libefi/Makefile	(revision 307896)
@@ -1,44 +1,45 @@
 # $FreeBSD$
 
 LIB=	efi
 INTERNALLIB=
 WARNS?=	2
 
 SRCS=	delay.c devpath.c efi_console.c efinet.c efipart.c env.c errno.c \
 	handles.c libefi.c
 
 .if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386"
 SRCS+=	time.c
 .elif ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "arm"
 SRCS+=	time_event.c
 .endif
 
 # We implement a slightly non-standard %S in that it always takes a
 # CHAR16 that's common in UEFI-land instead of a wchar_t. This only
 # seems to matter on arm64 where wchar_t defaults to an int instead
 # of a short. There's no good cast to use here so just ignore the
 # warnings for now.
 CWARNFLAGS.efinet.c+=	-Wno-format
 
 .if ${MACHINE_CPUARCH} == "aarch64"
 CFLAGS+=	-msoft-float -mgeneral-regs-only
 .endif
 .if ${MACHINE_ARCH} == "amd64"
 CFLAGS+= -fPIC -mno-red-zone
 .endif
+CFLAGS+= -I${.CURDIR}/../../ficl -I${.CURDIR}/../../ficl/${MACHINE}
 CFLAGS+= -I${.CURDIR}/../include
 CFLAGS+= -I${.CURDIR}/../include/${MACHINE}
 CFLAGS+= -I${.CURDIR}/../../../../lib/libstand
 
 # Pick up the bootstrap header for some interface items
 CFLAGS+= -I${.CURDIR}/../../common
 
 # Handle FreeBSD specific %b and %D printf format specifiers
 CFLAGS+= ${FORMAT_EXTENSIONS}
 
 # Do not use TERM_EMU on arm and arm64 as it doesn't behave well with serial console
 .if ${MACHINE_CPUARCH} != "arm" && ${MACHINE_CPUARCH} != "aarch64"
 CFLAGS+= -DTERM_EMU
 .endif
 
 .include <bsd.lib.mk>
Index: user/alc/PQ_LAUNDRY/sys/boot/efi/libefi/env.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/boot/efi/libefi/env.c	(revision 307895)
+++ user/alc/PQ_LAUNDRY/sys/boot/efi/libefi/env.c	(revision 307896)
@@ -1,55 +1,234 @@
 /*
  * Copyright (c) 2015 Netflix, Inc. All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include <stand.h>
+#include <string.h>
 #include <efi.h>
 #include <efilib.h>
+#include <uuid.h>
+#include "bootstrap.h"
+#include "ficl.h"
 
+int efi_variable_support = 1;
+
 /*
  * Simple wrappers to the underlying UEFI functions.
  * See http://wiki.phoenix.com/wiki/index.php/EFI_RUNTIME_SERVICES
  * for details.
  */
 EFI_STATUS
 efi_get_next_variable_name(UINTN *variable_name_size, CHAR16 *variable_name, EFI_GUID *vendor_guid)
 {
 	return RS->GetNextVariableName(variable_name_size, variable_name, vendor_guid);
 }
 
 /*
  * Forward to the GetVariable runtime service via the global RS table and
  * hand its status back unchanged.
  */
 EFI_STATUS
 efi_get_variable(CHAR16 *variable_name, EFI_GUID *vendor_guid,
     UINT32 *attributes, UINTN *data_size, void *data)
 {
 	EFI_STATUS rv;
 
 	rv = RS->GetVariable(variable_name, vendor_guid, attributes,
 	    data_size, data);
 	return (rv);
 }
 
 /*
  * Forward to the SetVariable runtime service via the global RS table and
  * hand its status back unchanged.
  */
 EFI_STATUS
 efi_set_variable(CHAR16 *variable_name, EFI_GUID *vendor_guid,
     UINT32 attributes, UINTN data_size, void *data)
 {
 	EFI_STATUS rv;
 
 	rv = RS->SetVariable(variable_name, vendor_guid, attributes,
 	    data_size, data);
 	return (rv);
 }
+
+/*
+ *		FreeBSD's loader interaction words and extras
+ *
+ * 		efi-setenv  ( value n name n guid n attr -- 0 | -1)
+ * 		efi-getenv  ( addr n -- addr' n' | -1 )
+ * 		efi-unsetenv ( addr n -- )
+ */
+
+/*
+ * efi-setenv
+ * 		efi-setenv  ( value n name n guid n attr -- 0 | -1)
+ *
+ * Set environment variables using the SetVariable EFI runtime service.
+ *
+ * Value and guid are passed through in binary form (so guid needs to be
+ * converted to binary form from its string form). Name is converted from
+ * ASCII to CHAR16. Since ficl doesn't have support for internationalization,
+ * there's no native CHAR16 interface provided.
+ *
+ * attr is an int in the bitmask of the following attributes for this variable.
+ *
+ *	1	Non volatile
+ *	2	Boot service access
+ *	4	Run time access
+ * (corresponding to the same bits in the UEFI spec).
+ */
+/*
+ * Forth word efi-setenv ( value n name n guid n attr -- 0 | -1 ).
+ *
+ * Pops the attribute mask and the three (addr, len) pairs, converts the
+ * textual GUID with uuid_from_string() and the ASCII name to CHAR16, and
+ * calls efi_set_variable().  Pushes 0 when that returns EFI_SUCCESS and -1
+ * on a bad GUID, a negative length, or any other EFI status.  An allocation
+ * failure is reported through vmThrowErr() after releasing whatever was
+ * already allocated.  When built with TESTMAIN the arguments are popped and
+ * nothing is pushed.
+ */
+void
+ficlEfiSetenv(FICL_VM *pVM)
+{
+#ifndef TESTMAIN
+	char	*value = NULL, *guid = NULL;
+	CHAR16	*name = NULL;
+	EFI_STATUS status;
+	uuid_t	u;
+	uint32_t ustatus;
+	int	i;
+#endif
+	char	*namep, *valuep, *guidp;
+	int	names, values, guids, attr;
+
+#if FICL_ROBUST > 1
+	/* Seven cells are popped below and one result cell is pushed. */
+	vmCheckStack(pVM, 7, 1);
+#endif
+	attr = stackPopINT(pVM->pStack);
+	guids = stackPopINT(pVM->pStack);
+	guidp = (char*)stackPopPtr(pVM->pStack);
+	names = stackPopINT(pVM->pStack);
+	namep = (char*)stackPopPtr(pVM->pStack);
+	values = stackPopINT(pVM->pStack);
+	valuep = (char*)stackPopPtr(pVM->pStack);
+
+#ifndef TESTMAIN
+	/* Lengths come straight off the Forth stack; refuse negative ones. */
+	if (guids < 0 || names < 0 || values < 0) {
+		stackPushINT(pVM->pStack, -1);
+		return;
+	}
+
+	/* One extra element each so the guid and name can be terminated. */
+	guid = (char*)ficlMalloc(guids + 1);
+	name = (CHAR16 *)ficlMalloc((names + 1) * sizeof(CHAR16));
+	value = (char*)ficlMalloc(values + 1);
+	if (guid == NULL || name == NULL || value == NULL) {
+		/*
+		 * Free before throwing: vmThrowErr() is assumed not to
+		 * return here (the code has always relied on that), so
+		 * nothing after it would get a chance to clean up.
+		 */
+		ficlFree(name);
+		ficlFree(value);
+		ficlFree(guid);
+		vmThrowErr(pVM, "Error: out of memory");
+		return;
+	}
+
+	/* uuid_from_string() needs a NUL-terminated string. */
+	memcpy(guid, guidp, guids);
+	guid[guids] = '\0';
+	uuid_from_string(guid, &u, &ustatus);
+	if (ustatus != uuid_s_ok) {
+		stackPushINT(pVM->pStack, -1);
+		goto out;
+	}
+
+	/* Widen the ASCII name to CHAR16 without sign extension. */
+	for (i = 0; i < names; i++)
+		name[i] = (unsigned char)namep[i];
+	name[names] = (CHAR16)0;
+
+	/* The value is passed through as raw bytes of length 'values'. */
+	memcpy(value, valuep, values);
+
+	status = efi_set_variable(name, (EFI_GUID *)&u, attr, values, value);
+	if (status == EFI_SUCCESS)
+		stackPushINT(pVM->pStack, 0);
+	else
+		stackPushINT(pVM->pStack, -1);
+out:
+	ficlFree(name);
+	ficlFree(value);
+	ficlFree(guid);
+#endif
+
+	return;
+}
+
+/*
+ * Forth word efi-getenv ( addr n -- addr' n' | -1 ).
+ *
+ * Pops a counted name, copies it into a NUL-terminated scratch buffer and
+ * looks it up with getenv().  Pushes the value pointer and its strlen() on
+ * a hit, or -1 on a miss.  When built with TESTMAIN only -1 is pushed.
+ */
+void
+ficlEfiGetenv(FICL_VM *pVM)
+{
+	char	*namep;
+	int	names;
+#ifndef TESTMAIN
+	char	*key, *found;
+#endif
+
+#if FICL_ROBUST > 1
+	vmCheckStack(pVM, 2, 2);
+#endif
+	names = stackPopINT(pVM->pStack);
+	namep = (char*) stackPopPtr(pVM->pStack);
+
+#ifndef TESTMAIN
+	key = (char*) ficlMalloc(names+1);
+	if (key == NULL)
+		vmThrowErr(pVM, "Error: out of memory");
+	strncpy(key, namep, names);
+	key[names] = '\0';
+
+	/* The scratch copy is only needed for the lookup itself. */
+	found = getenv(key);
+	ficlFree(key);
+
+	if (found != NULL) {
+		stackPushPtr(pVM->pStack, found);
+		stackPushINT(pVM->pStack, strlen(found));
+		return;
+	}
+#endif
+	stackPushINT(pVM->pStack, -1);
+}
+
+/*
+ * Forth word efi-unsetenv ( addr n -- ).
+ *
+ * Pops a counted name, copies it into a NUL-terminated scratch buffer and
+ * passes it to unsetenv().  Nothing is pushed.  When built with TESTMAIN
+ * the arguments are popped and otherwise ignored.
+ */
+void
+ficlEfiUnsetenv(FICL_VM *pVM)
+{
+	char	*namep;
+	int	names;
+#ifndef TESTMAIN
+	char	*key;
+#endif
+
+#if FICL_ROBUST > 1
+	vmCheckStack(pVM, 2, 0);
+#endif
+	names = stackPopINT(pVM->pStack);
+	namep = (char*) stackPopPtr(pVM->pStack);
+
+#ifndef TESTMAIN
+	key = (char*) ficlMalloc(names+1);
+	if (key == NULL)
+		vmThrowErr(pVM, "Error: out of memory");
+	strncpy(key, namep, names);
+	key[names] = '\0';
+
+	unsetenv(key);
+	ficlFree(key);
+#endif
+}
+
+/**************************************************************************
+** Add FreeBSD UEFI platform extensions into the system dictionary
+**************************************************************************/
+void ficlEfiCompilePlatform(FICL_SYSTEM *pSys)
+{
+    FICL_DICT *dict;
+
+    dict = pSys->dp;
+    assert (dict);
+
+    /* Register the three EFI environment words defined in this file. */
+    dictAppendWord(dict, "efi-setenv",   ficlEfiSetenv,   FW_DEFAULT);
+    dictAppendWord(dict, "efi-getenv",   ficlEfiGetenv,   FW_DEFAULT);
+    dictAppendWord(dict, "efi-unsetenv", ficlEfiUnsetenv, FW_DEFAULT);
+
+    /* Would like to export the EFI version, but this will do for now */
+    ficlSetEnv(pSys, "efi-boot", 1);
+}
+
+FICL_COMPILE_SET(ficlEfiCompilePlatform);
Index: user/alc/PQ_LAUNDRY/sys/boot/efi/loader/main.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/boot/efi/loader/main.c	(revision 307895)
+++ user/alc/PQ_LAUNDRY/sys/boot/efi/loader/main.c	(revision 307896)
@@ -1,1075 +1,1079 @@
 /*-
  * Copyright (c) 2008-2010 Rui Paulo
  * Copyright (c) 2006 Marcel Moolenaar
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/reboot.h>
 #include <sys/boot.h>
 #include <inttypes.h>
 #include <stand.h>
 #include <string.h>
 #include <setjmp.h>
 
 #include <efi.h>
 #include <efilib.h>
 
 #include <uuid.h>
 
 #include <bootstrap.h>
 #include <smbios.h>
 
 #ifdef EFI_ZFS_BOOT
 #include <libzfs.h>
 #endif
 
 #include "loader_efi.h"
 
 extern char bootprog_name[];
 extern char bootprog_rev[];
 extern char bootprog_date[];
 extern char bootprog_maker[];
 
+/* Force a reference to bring in EFI support from the library */
+extern int efi_variable_support;
+int *dummy1 = &efi_variable_support;
+
 struct arch_switch archsw;	/* MI/MD interface boundary */
 
 EFI_GUID acpi = ACPI_TABLE_GUID;
 EFI_GUID acpi20 = ACPI_20_TABLE_GUID;
 EFI_GUID devid = DEVICE_PATH_PROTOCOL;
 EFI_GUID imgid = LOADED_IMAGE_PROTOCOL;
 EFI_GUID mps = MPS_TABLE_GUID;
 EFI_GUID netid = EFI_SIMPLE_NETWORK_PROTOCOL;
 EFI_GUID smbios = SMBIOS_TABLE_GUID;
 EFI_GUID dxe = DXE_SERVICES_TABLE_GUID;
 EFI_GUID hoblist = HOB_LIST_TABLE_GUID;
 EFI_GUID memtype = MEMORY_TYPE_INFORMATION_TABLE_GUID;
 EFI_GUID debugimg = DEBUG_IMAGE_INFO_TABLE_GUID;
 EFI_GUID fdtdtb = FDT_TABLE_GUID;
 EFI_GUID inputid = SIMPLE_TEXT_INPUT_PROTOCOL;
 
 #ifdef EFI_ZFS_BOOT
 static void efi_zfs_probe(void);
 #endif
 
 /*
  * cpy8to16 copies a traditional C string into a CHAR16 string and
  * 0 terminates it. len is the size of *dst in bytes.
  */
 /*
  * Widen the C string src into dst.  len is the size of *dst in bytes, so
  * the capacity in characters is len / sizeof(CHAR16); one slot is reserved
  * for the terminator, which is always written when there is room for it.
  * Nothing is written when dst cannot hold even the terminator.
  */
 static void
 cpy8to16(const char *src, CHAR16 *dst, size_t len)
 {
 	size_t room;
 
 	room = len / sizeof(CHAR16);	/* bytes -> CHAR16 elements */
 	if (room == 0)
 		return;
 	room--;				/* keep one slot for the NUL */
 	while (room > 0 && *src != '\0') {
 		/* Go through unsigned char so high bytes don't sign-extend. */
 		*dst++ = (unsigned char)*src++;
 		room--;
 	}
 	*dst = (CHAR16)0;
 }
 
 /*
  * Narrow the CHAR16 string src into dst, truncating each character to
  * char.  len is the size of dst in bytes.  At most len - 1 characters are
  * copied and the result is always NUL-terminated, so callers can use dst
  * as a C string even when src is too long.  Nothing is written if len is 0.
  */
 static void
 cpy16to8(const CHAR16 *src, char *dst, size_t len)
 {
 	size_t i;
 
 	if (len == 0)
 		return;
 	for (i = 0; i < len - 1 && src[i] != 0; i++)
 		dst[i] = (char)src[i];
 	dst[i] = '\0';
 }
 
 static int
 has_keyboard(void)
 {
 	EFI_STATUS status;
 	EFI_DEVICE_PATH *path;
 	EFI_HANDLE *hin, *hin_end, *walker;
 	UINTN sz;
 	int retval = 0;
 	
 	/*
 	 * Find all the handles that support the SIMPLE_TEXT_INPUT_PROTOCOL and
 	 * do the typical dance to get the right sized buffer.
 	 */
 	sz = 0;
 	hin = NULL;
 	status = BS->LocateHandle(ByProtocol, &inputid, 0, &sz, 0);
 	if (status == EFI_BUFFER_TOO_SMALL) {
 		hin = (EFI_HANDLE *)malloc(sz);
 		status = BS->LocateHandle(ByProtocol, &inputid, 0, &sz,
 		    hin);
 		if (EFI_ERROR(status))
 			free(hin);
 	}
 	if (EFI_ERROR(status))
 		return retval;
 
 	/*
 	 * Look at each of the handles. If it supports the device path protocol,
 	 * use it to get the device path for this handle. Then see if that
 	 * device path matches either the USB device path for keyboards or the
 	 * legacy device path for keyboards.
 	 */
 	hin_end = &hin[sz / sizeof(*hin)];
 	for (walker = hin; walker < hin_end; walker++) {
 		status = BS->HandleProtocol(*walker, &devid, (VOID **)&path);
 		if (EFI_ERROR(status))
 			continue;
 
 		while (!IsDevicePathEnd(path)) {
 			/*
 			 * Check for the ACPI keyboard node. All PNP3xx nodes
 			 * are keyboards of different flavors. Note: It is
 			 * unclear of there's always a keyboard node when
 			 * there's a keyboard controller, or if there's only one
 			 * when a keyboard is detected at boot.
 			 */
 			if (DevicePathType(path) == ACPI_DEVICE_PATH &&
 			    (DevicePathSubType(path) == ACPI_DP ||
 				DevicePathSubType(path) == ACPI_EXTENDED_DP)) {
 				ACPI_HID_DEVICE_PATH  *acpi;
 
 				acpi = (ACPI_HID_DEVICE_PATH *)(void *)path;
 				if ((EISA_ID_TO_NUM(acpi->HID) & 0xff00) == 0x300 &&
 				    (acpi->HID & 0xffff) == PNP_EISA_ID_CONST) {
 					retval = 1;
 					goto out;
 				}
 			/*
 			 * Check for USB keyboard node, if present. Unlike a
 			 * PS/2 keyboard, these definitely only appear when
 			 * connected to the system.
 			 */
 			} else if (DevicePathType(path) == MESSAGING_DEVICE_PATH &&
 			    DevicePathSubType(path) == MSG_USB_CLASS_DP) {
 				USB_CLASS_DEVICE_PATH *usb;
 			       
 				usb = (USB_CLASS_DEVICE_PATH *)(void *)path;
 				if (usb->DeviceClass == 3 && /* HID */
 				    usb->DeviceSubClass == 1 && /* Boot devices */
 				    usb->DeviceProtocol == 1) { /* Boot keyboards */
 					retval = 1;
 					goto out;
 				}
 			}
 			path = NextDevicePathNode(path);
 		}
 	}
 out:
 	free(hin);
 	return retval;
 }
 
 /*
  * Work out which device the loader was started from.
  *
  * On success returns 0 with *dev, *unit and *extra filled in by
  * efi_handle_lookup(); returns ENOENT when no handle along the image's
  * device path matches.  The trimmed device-path copies made while walking
  * the path are owned by this function and released on every exit.
  */
 static int
 find_currdev(EFI_LOADED_IMAGE *img, struct devsw **dev, int *unit,
     uint64_t *extra)
 {
 	EFI_DEVICE_PATH *devpath, *copy;
 	EFI_HANDLE h;
 
 	/*
 	 * Try the device handle from our loaded image first.  If that
 	 * fails, use the device path from the loaded image and see if
 	 * any of the nodes in that path match one of the enumerated
 	 * handles.
 	 */
 	if (efi_handle_lookup(img->DeviceHandle, dev, unit, extra) == 0)
 		return (0);
 
 	copy = NULL;
 	devpath = efi_lookup_image_devpath(IH);
 	while (devpath != NULL) {
 		h = efi_devpath_handle(devpath);
 		if (h == NULL)
 			break;
 
 		/*
 		 * devpath (possibly == copy) is no longer needed once its
 		 * handle is known; drop it and clear the pointer so it is
 		 * neither leaked nor left dangling.
 		 */
 		free(copy);
 		copy = NULL;
 
 		if (efi_handle_lookup(h, dev, unit, extra) == 0)
 			return (0);
 
 		devpath = efi_lookup_devpath(h);
 		if (devpath != NULL) {
 			copy = efi_devpath_trim(devpath);
 			devpath = copy;
 		}
 	}
 
 	/* Reached via the h == NULL break with a live copy, or with NULL. */
 	free(copy);
 	return (ENOENT);
 }
 
 /*
  * Loader entry point.
  *
  * Wires up the arch switch, probes the console, parses the UCS-2 command
  * line (boot flags and name=value environment settings), initialises the
  * device switch, prints banner information, determines the boot device,
  * exports a few environment variables (efi-version, LINES, SMBIOS hint)
  * and finally enters interact().  Returns an EFI error status if the
  * staging area cannot be allocated or the boot device cannot be found.
  */
 EFI_STATUS
 main(int argc, CHAR16 *argv[])
 {
 	char var[128];
 	EFI_LOADED_IMAGE *img;
 	EFI_GUID *guid;
 	int i, j, vargood, unit, howto;
 	struct devsw *dev;
 	uint64_t pool_guid;
 	UINTN k;
 	int has_kbd;
 	char buf[40];
 
 	archsw.arch_autoload = efi_autoload;
 	archsw.arch_getdev = efi_getdev;
 	archsw.arch_copyin = efi_copyin;
 	archsw.arch_copyout = efi_copyout;
 	archsw.arch_readin = efi_readin;
 #ifdef EFI_ZFS_BOOT
 	/* Note this needs to be set before ZFS init. */
 	archsw.arch_zfs_probe = efi_zfs_probe;
 #endif
 
 	/* Init the time source */
 	efi_time_init();
 
 	has_kbd = has_keyboard();
 
 	/*
 	 * XXX Chicken-and-egg problem; we want to have console output
 	 * early, but some console attributes may depend on reading from
 	 * eg. the boot device, which we can't do yet.  We can use
 	 * printf() etc. once this is done.
 	 */
 	cons_probe();
 
 	/*
 	 * Initialise the block cache. Set the upper limit.
 	 */
 	bcache_init(32768, 512);
 
 	/*
 	 * Parse the args to set the console settings, etc
 	 * boot1.efi passes these in, if it can read /boot.config or /boot/config
 	 * or iPXE may be setup to pass these in.
 	 *
 	 * Loop through the args, and for each one that contains an '=' that is
 	 * not the first character, add it to the environment.  This allows
 	 * loader and kernel env vars to be passed on the command line.  Convert
 	 * args from UCS-2 to ASCII (16 to 8 bit) as they are copied.
 	 */
 	howto = 0;
 	for (i = 1; i < argc; i++) {
 		if (argv[i][0] == '-') {
 			for (j = 1; argv[i][j] != 0; j++) {
 				int ch;
 
 				ch = argv[i][j];
 				switch (ch) {
 				case 'a':
 					howto |= RB_ASKNAME;
 					break;
 				case 'd':
 					howto |= RB_KDB;
 					break;
 				case 'D':
 					howto |= RB_MULTIPLE;
 					break;
 				case 'h':
 					howto |= RB_SERIAL;
 					break;
 				case 'm':
 					howto |= RB_MUTE;
 					break;
 				case 'p':
 					howto |= RB_PAUSE;
 					break;
 				case 'P':
 					if (!has_kbd)
 						howto |= RB_SERIAL | RB_MULTIPLE;
 					break;
 				case 'r':
 					howto |= RB_DFLTROOT;
 					break;
 				case 's':
 					howto |= RB_SINGLE;
 					break;
 				case 'S':
 					/*
 					 * -S takes the console speed either
 					 * attached (-S115200) or as the next
 					 * argument (-S 115200); with neither,
 					 * fall back to 115200.
 					 */
 					if (argv[i][j + 1] == 0) {
 						if (i + 1 == argc) {
 							setenv("comconsole_speed", "115200", 1);
 						} else {
 							cpy16to8(&argv[i + 1][0], var,
 							    sizeof(var));
 							/* Terminate even if the copy filled var. */
 							var[sizeof(var) - 1] = '\0';
 							setenv("comconsole_speed", var, 1);
 						}
 						/* Consume the separate speed argument. */
 						i++;
 					} else {
 						cpy16to8(&argv[i][j + 1], var,
 						    sizeof(var));
 						var[sizeof(var) - 1] = '\0';
 						setenv("comconsole_speed", var, 1);
 					}
 					/*
 					 * The rest of this argument (or the
 					 * one just consumed) is the speed, not
 					 * more flags.  Stop scanning so we do
 					 * not index argv[i] at a stale j after
 					 * the i++ above.
 					 */
 					goto nextarg;
 				case 'v':
 					howto |= RB_VERBOSE;
 					break;
 				}
 			}
 		} else {
 			vargood = 0;
 			for (j = 0; argv[i][j] != 0; j++) {
 				if (j == sizeof(var)) {
 					vargood = 0;
 					break;
 				}
 				if (j > 0 && argv[i][j] == '=')
 					vargood = 1;
 				var[j] = (char)argv[i][j];
 			}
 			if (vargood) {
 				var[j] = 0;
 				putenv(var);
 			}
 		}
 nextarg:
 		continue;
 	}
 	for (i = 0; howto_names[i].ev != NULL; i++)
 		if (howto & howto_names[i].mask)
 			setenv(howto_names[i].ev, "YES", 1);
 	if (howto & RB_MULTIPLE) {
 		if (howto & RB_SERIAL)
 			setenv("console", "comconsole efi" , 1);
 		else
 			setenv("console", "efi comconsole" , 1);
 	} else if (howto & RB_SERIAL) {
 		setenv("console", "comconsole" , 1);
 	}
 
 	if (efi_copy_init()) {
 		printf("failed to allocate staging area\n");
 		return (EFI_BUFFER_TOO_SMALL);
 	}
 
 	/*
 	 * March through the device switch probing for things.
 	 */
 	for (i = 0; devsw[i] != NULL; i++)
 		if (devsw[i]->dv_init != NULL)
 			(devsw[i]->dv_init)();
 
 	/* Get our loaded image protocol interface structure. */
 	BS->HandleProtocol(IH, &imgid, (VOID**)&img);
 
 	printf("Command line arguments:");
 	for (i = 0; i < argc; i++)
 		printf(" %S", argv[i]);
 	printf("\n");
 
 	printf("Image base: 0x%lx\n", (u_long)img->ImageBase);
 	printf("EFI version: %d.%02d\n", ST->Hdr.Revision >> 16,
 	    ST->Hdr.Revision & 0xffff);
 	printf("EFI Firmware: %S (rev %d.%02d)\n", ST->FirmwareVendor,
 	    ST->FirmwareRevision >> 16, ST->FirmwareRevision & 0xffff);
 
 	printf("\n");
 	printf("%s, Revision %s\n", bootprog_name, bootprog_rev);
 	printf("(%s, %s)\n", bootprog_maker, bootprog_date);
 
 	/*
 	 * Disable the watchdog timer. By default the boot manager sets
 	 * the timer to 5 minutes before invoking a boot option. If we
 	 * want to return to the boot manager, we have to disable the
 	 * watchdog timer and since we're an interactive program, we don't
 	 * want to wait until the user types "quit". The timer may have
 	 * fired by then. We don't care if this fails. It does not prevent
 	 * normal functioning in any way...
 	 */
 	BS->SetWatchdogTimer(0, 0, 0, NULL);
 
 	if (find_currdev(img, &dev, &unit, &pool_guid) != 0)
 		return (EFI_NOT_FOUND);
 
 	switch (dev->dv_type) {
 #ifdef EFI_ZFS_BOOT
 	case DEVT_ZFS: {
 		struct zfs_devdesc currdev;
 
 		currdev.d_dev = dev;
 		currdev.d_unit = unit;
 		currdev.d_type = currdev.d_dev->dv_type;
 		currdev.d_opendata = NULL;
 		currdev.pool_guid = pool_guid;
 		currdev.root_guid = 0;
 		env_setenv("currdev", EV_VOLATILE, efi_fmtdev(&currdev),
 			   efi_setcurrdev, env_nounset);
 		env_setenv("loaddev", EV_VOLATILE, efi_fmtdev(&currdev), env_noset,
 			   env_nounset);
 		init_zfs_bootenv(zfs_fmtdev(&currdev));
 		break;
 	}
 #endif
 	default: {
 		struct devdesc currdev;
 
 		currdev.d_dev = dev;
 		currdev.d_unit = unit;
 		currdev.d_opendata = NULL;
 		currdev.d_type = currdev.d_dev->dv_type;
 		env_setenv("currdev", EV_VOLATILE, efi_fmtdev(&currdev),
 			   efi_setcurrdev, env_nounset);
 		env_setenv("loaddev", EV_VOLATILE, efi_fmtdev(&currdev), env_noset,
 			   env_nounset);
 		break;
 	}
 	}
 
 	snprintf(var, sizeof(var), "%d.%02d", ST->Hdr.Revision >> 16,
 	    ST->Hdr.Revision & 0xffff);
 	env_setenv("efi-version", EV_VOLATILE, var, env_noset, env_nounset);
 	setenv("LINES", "24", 1);	/* optional */
 
 	/* Publish the SMBIOS table address, if the firmware provides one. */
 	for (k = 0; k < ST->NumberOfTableEntries; k++) {
 		guid = &ST->ConfigurationTable[k].VendorGuid;
 		if (!memcmp(guid, &smbios, sizeof(EFI_GUID))) {
 			snprintf(buf, sizeof(buf), "%p",
 			    ST->ConfigurationTable[k].VendorTable);
 			setenv("hint.smbios.0.mem", buf, 1);
 			smbios_detect(ST->ConfigurationTable[k].VendorTable);
 			break;
 		}
 	}
 
 	interact(NULL);			/* doesn't return */
 
 	return (EFI_SUCCESS);		/* keep compiler happy */
 }
 
 /* XXX move to lib stand ? */
 /*
  * Compare two NUL-terminated CHAR16 strings.  Returns the difference of
  * the first pair of elements that differ (or of the terminators when the
  * strings are equal, i.e. 0).
  */
 static int
 wcscmp(CHAR16 *a, CHAR16 *b)
 {
 
 	for (; *a && *b && *a == *b; a++, b++)
 		continue;
 	return *a - *b;
 }
 
 
 COMMAND_SET(reboot, "reboot", "reboot the system", command_reboot);
 
 static int
 command_reboot(int argc, char *argv[])
 {
 	int i;
 
 	for (i = 0; devsw[i] != NULL; ++i)
 		if (devsw[i]->dv_cleanup != NULL)
 			(devsw[i]->dv_cleanup)();
 
 	RS->ResetSystem(EfiResetCold, EFI_SUCCESS, 23,
 	    (CHAR16 *)"Reboot from the loader");
 
 	/* NOTREACHED */
 	return (CMD_ERROR);
 }
 
 COMMAND_SET(quit, "quit", "exit the loader", command_quit);
 
 /*
  * "quit" command handler: calls exit(0).  The arguments are unused.
  * NOTE(review): the trailing return is presumably never reached and only
  * satisfies the int return type -- confirm exit() does not return here.
  */
 static int
 command_quit(int argc, char *argv[])
 {
 	exit(0);
 	return (CMD_OK);
 }
 
 COMMAND_SET(memmap, "memmap", "print memory map", command_memmap);
 
 static int
 command_memmap(int argc, char *argv[])
 {
 	UINTN sz;
 	EFI_MEMORY_DESCRIPTOR *map, *p;
 	UINTN key, dsz;
 	UINT32 dver;
 	EFI_STATUS status;
 	int i, ndesc;
 	static char *types[] = {
 	    "Reserved",
 	    "LoaderCode",
 	    "LoaderData",
 	    "BootServicesCode",
 	    "BootServicesData",
 	    "RuntimeServicesCode",
 	    "RuntimeServicesData",
 	    "ConventionalMemory",
 	    "UnusableMemory",
 	    "ACPIReclaimMemory",
 	    "ACPIMemoryNVS",
 	    "MemoryMappedIO",
 	    "MemoryMappedIOPortSpace",
 	    "PalCode"
 	};
 
 	sz = 0;
 	status = BS->GetMemoryMap(&sz, 0, &key, &dsz, &dver);
 	if (status != EFI_BUFFER_TOO_SMALL) {
 		printf("Can't determine memory map size\n");
 		return (CMD_ERROR);
 	}
 	map = malloc(sz);
 	status = BS->GetMemoryMap(&sz, map, &key, &dsz, &dver);
 	if (EFI_ERROR(status)) {
 		printf("Can't read memory map\n");
 		return (CMD_ERROR);
 	}
 
 	ndesc = sz / dsz;
 	printf("%23s %12s %12s %8s %4s\n",
 	    "Type", "Physical", "Virtual", "#Pages", "Attr");
 
 	for (i = 0, p = map; i < ndesc;
 	     i++, p = NextMemoryDescriptor(p, dsz)) {
 		printf("%23s %012jx %012jx %08jx ", types[p->Type],
 		   (uintmax_t)p->PhysicalStart, (uintmax_t)p->VirtualStart,
 		   (uintmax_t)p->NumberOfPages);
 		if (p->Attribute & EFI_MEMORY_UC)
 			printf("UC ");
 		if (p->Attribute & EFI_MEMORY_WC)
 			printf("WC ");
 		if (p->Attribute & EFI_MEMORY_WT)
 			printf("WT ");
 		if (p->Attribute & EFI_MEMORY_WB)
 			printf("WB ");
 		if (p->Attribute & EFI_MEMORY_UCE)
 			printf("UCE ");
 		if (p->Attribute & EFI_MEMORY_WP)
 			printf("WP ");
 		if (p->Attribute & EFI_MEMORY_RP)
 			printf("RP ");
 		if (p->Attribute & EFI_MEMORY_XP)
 			printf("XP ");
 		printf("\n");
 	}
 
 	return (CMD_OK);
 }
 
 COMMAND_SET(configuration, "configuration", "print configuration tables",
     command_configuration);
 
 /*
  * Format *guid in the usual 8-4-4-4-12 hexadecimal form.  The result lives
  * in a static buffer that is overwritten by the next call.
  */
 static const char *
 guid_to_string(EFI_GUID *guid)
 {
 	static char buf[40];
 	UINT8 *d4;
 
 	d4 = guid->Data4;
 	sprintf(buf, "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x",
 	    guid->Data1, guid->Data2, guid->Data3,
 	    d4[0], d4[1],
 	    d4[2], d4[3], d4[4], d4[5], d4[6], d4[7]);
 	return (buf);
 }
 
 /*
  * "configuration" command handler: print the number of firmware
  * configuration tables, then one line per table giving a name for the
  * GUIDs this file knows about (or the raw GUID) and the table address.
  * Always returns CMD_OK.
  */
 static int
 command_configuration(int argc, char *argv[])
 {
 	/* Known vendor GUIDs, checked in this order. */
 	static const struct {
 		EFI_GUID	*id;
 		const char	*label;
 	} known[] = {
 		{ &mps,		"MPS Table" },
 		{ &acpi,	"ACPI Table" },
 		{ &acpi20,	"ACPI 2.0 Table" },
 		{ &smbios,	"SMBIOS Table" },
 		{ &dxe,		"DXE Table" },
 		{ &hoblist,	"HOB List Table" },
 		{ &memtype,	"Memory Type Information Table" },
 		{ &debugimg,	"Debug Image Info Table" },
 		{ &fdtdtb,	"FDT Table" }
 	};
 	const size_t nknown = sizeof(known) / sizeof(known[0]);
 	UINTN i;
 	size_t j;
 
 	printf("NumberOfTableEntries=%lu\n",
 		(unsigned long)ST->NumberOfTableEntries);
 	for (i = 0; i < ST->NumberOfTableEntries; i++) {
 		EFI_GUID *guid;
 
 		printf("  ");
 		guid = &ST->ConfigurationTable[i].VendorGuid;
 		for (j = 0; j < nknown; j++)
 			if (memcmp(guid, known[j].id, sizeof(EFI_GUID)) == 0)
 				break;
 
 		if (j == nknown)
 			printf("Unknown Table (%s)", guid_to_string(guid));
 		else if (known[j].id == &smbios)
 			/* The SMBIOS entry also carries its address inline. */
 			printf("SMBIOS Table %p",
 			    ST->ConfigurationTable[i].VendorTable);
 		else
 			printf("%s", known[j].label);
 		printf(" at %p\n", ST->ConfigurationTable[i].VendorTable);
 	}
 
 	return (CMD_OK);
 }
 
 
 COMMAND_SET(mode, "mode", "change or display EFI text modes", command_mode);
 
 /*
  * "mode" command handler.
  *
  * With an argument, parse it as a text-mode number, verify it with
  * QueryMode(), switch to it with SetMode(), export the new row count in
  * LINES and home the cursor.  Without an argument, print the current mode
  * and every mode QueryMode() accepts.  Returns CMD_OK on success and
  * CMD_ERROR on an unparsable, unsupported or unsettable mode.
  */
 static int
 command_mode(int argc, char *argv[])
 {
 	UINTN cols, rows;
 	unsigned int mode;
 	int i;
 	char *cp;
 	char rowenv[16];	/* room for any 32-bit decimal plus NUL */
 	EFI_STATUS status;
 	SIMPLE_TEXT_OUTPUT_INTERFACE *conout;
 	extern void HO(void);
 
 	conout = ST->ConOut;
 
 	if (argc > 1) {
 		mode = strtol(argv[1], &cp, 0);
 		/* Reject trailing junk and an argument with no digits. */
 		if (cp == argv[1] || cp[0] != '\0') {
 			printf("Invalid mode\n");
 			return (CMD_ERROR);
 		}
 		status = conout->QueryMode(conout, mode, &cols, &rows);
 		if (EFI_ERROR(status)) {
 			printf("invalid mode %d\n", (int)mode);
 			return (CMD_ERROR);
 		}
 		status = conout->SetMode(conout, mode);
 		if (EFI_ERROR(status)) {
 			printf("couldn't set mode %d\n", (int)mode);
 			return (CMD_ERROR);
 		}
 		/* Bounded: rows comes from firmware and is not range-checked. */
 		snprintf(rowenv, sizeof(rowenv), "%u", (unsigned)rows);
 		setenv("LINES", rowenv, 1);
 		HO();		/* set cursor */
 		return (CMD_OK);
 	}
 
 	printf("Current mode: %d\n", conout->Mode->Mode);
 	for (i = 0; i <= conout->Mode->MaxMode; i++) {
 		status = conout->QueryMode(conout, i, &cols, &rows);
 		if (EFI_ERROR(status))
 			continue;
 		printf("Mode %d: %u columns, %u rows\n", i, (unsigned)cols,
 		    (unsigned)rows);
 	}
 
 	if (i != 0)
 		printf("Select a mode with the command \"mode <number>\"\n");
 
 	return (CMD_OK);
 }
 
 #ifdef EFI_ZFS_BOOT
 COMMAND_SET(lszfs, "lszfs", "list child datasets of a zfs dataset",
     command_lszfs);
 
 /*
  * "lszfs" command handler: requires exactly one argument, which is passed
  * to zfs_list().  On failure command_errmsg is set and CMD_ERROR returned.
  */
 static int
 command_lszfs(int argc, char *argv[])
 {
 	int rc;
 
 	if (argc == 2) {
 		rc = zfs_list(argv[1]);
 		if (rc == 0)
 			return (CMD_OK);
 		command_errmsg = strerror(rc);
 	} else {
 		command_errmsg = "wrong number of arguments";
 	}
 	return (CMD_ERROR);
 }
 
 COMMAND_SET(reloadbe, "reloadbe", "refresh the list of ZFS Boot Environments",
 	    command_reloadbe);
 
 /*
  * "reloadbe" command: call zfs_bootenv() on the operand if one was given,
  * otherwise on the value of the zfs_be_root variable.  When neither is
  * available the command succeeds without doing anything.
  */
 static int
 command_reloadbe(int argc, char *argv[])
 {
 	char *be_root;
 	int rc;
 
 	if (argc > 2) {
 		command_errmsg = "wrong number of arguments";
 		return (CMD_ERROR);
 	}
 
 	if (argc == 2) {
 		be_root = argv[1];
 	} else {
 		be_root = getenv("zfs_be_root");
 		if (be_root == NULL)
 			return (CMD_OK);
 	}
 
 	rc = zfs_bootenv(be_root);
 	if (rc == 0)
 		return (CMD_OK);
 
 	command_errmsg = strerror(rc);
 	return (CMD_ERROR);
 }
 #endif
 
 COMMAND_SET(efishow, "efi-show", "print some or all EFI variables", command_efi_show);
 
 /*
  * Print one EFI variable through printf() and end the line through the
  * pager.
  *
  * varnamearg	variable name (CHAR16 string) passed to GetVariable()
  * matchguid	vendor GUID of the variable
  * lflag	when non-zero print only GUID, attributes and name;
  *		otherwise also print the value
  *
  * The value is printed as a string when every byte is printable ASCII
  * (or tab/LF/CR) and the last byte is NUL; otherwise it is walked as
  * CHAR16 units, escaping anything that is not alphanumeric or space.
  *
  * Returns CMD_OK, CMD_WARN when pager_output() reports non-zero, or
  * CMD_ERROR when the variable cannot be read or memory is exhausted.
  */
 static int
 efi_print_var(CHAR16 *varnamearg, EFI_GUID *matchguid, int lflag)
 {
 	UINTN		datasz, i;
 	EFI_STATUS	status;
 	UINT32		attr;
 	CHAR16		*data;
 	char		*str;
 	char		*bytes;
 	uint32_t	uuid_status;
 	int		is_ascii;
 
 	/* First call with a zero-sized buffer only asks for the size. */
 	datasz = 0;
 	status = RS->GetVariable(varnamearg, matchguid, &attr,
 	    &datasz, NULL);
 	if (status != EFI_BUFFER_TOO_SMALL) {
 		printf("Can't get the variable: error %#lx\n", status);
 		return (CMD_ERROR);
 	}
 	data = malloc(datasz);
 	if (data == NULL) {
 		printf("Can't allocate memory to get the variable\n");
 		return (CMD_ERROR);
 	}
 	status = RS->GetVariable(varnamearg, matchguid, &attr,
 	    &datasz, data);
 	if (status != EFI_SUCCESS) {
 		printf("Can't get the variable: error %#lx\n", status);
 		free(data);
 		return (CMD_ERROR);
 	}
 	uuid_to_string((uuid_t *)matchguid, &str, &uuid_status);
 	if (lflag) {
 		printf("%s 0x%x %S", str, attr, varnamearg);
 	} else {
 		printf("%s 0x%x %S=", str, attr, varnamearg);
 		bytes = (char *)data;
 		/* An empty value can be neither NUL-terminated nor ASCII. */
 		is_ascii = (datasz > 0);
 		for (i = 0; is_ascii && i < datasz - 1; i++) {
 			/* Quick hack to see if this ascii-ish string printable range plus tab, cr and lf */
 			if ((bytes[i] < 32 || bytes[i] > 126) &&
 			    bytes[i] != 9 && bytes[i] != 10 && bytes[i] != 13)
 				is_ascii = 0;
 		}
 		if (is_ascii && bytes[datasz - 1] != '\0')
 			is_ascii = 0;
 		if (is_ascii)
 			printf("%s", bytes);
 		else {
 			for (i = 0; i < datasz / 2; i++) {
 				if (isalnum(data[i]) || isspace(data[i]))
 					printf("%c", data[i]);
 				else
 					printf("\\x%02x", data[i]);
 			}
 		}
 	}
 	/* The uuid string is released on both paths, not just one. */
 	free(str);
 	free(data);
 	if (pager_output("\n"))
 		return (CMD_WARN);
 	return (CMD_OK);
 }
 
 static int
 command_efi_show(int argc, char *argv[])
 {
 	/*
 	 * efi-show [-a]
 	 *	print all the env
 	 * efi-show -u UUID
 	 *	print all the env vars tagged with UUID
 	 * efi-show -v var
 	 *	search all the env vars and print the ones matching var
 	 * eif-show -u UUID -v var
 	 * eif-show UUID var
 	 *	print all the env vars that match UUID and var
 	 */
 	/* NB: We assume EFI_GUID is the same as uuid_t */
 	int		aflag = 0, gflag = 0, lflag = 0, vflag = 0;
 	int		ch, rv;
 	unsigned	i;
 	EFI_STATUS	status;
 	EFI_GUID	varguid = { 0,0,0,{0,0,0,0,0,0,0,0} };
 	EFI_GUID	matchguid = { 0,0,0,{0,0,0,0,0,0,0,0} };
 	uint32_t	uuid_status;
 	CHAR16		*varname;
 	CHAR16		*newnm;
 	CHAR16		varnamearg[128];
 	UINTN		varalloc;
 	UINTN		varsz;
 
 	while ((ch = getopt(argc, argv, "ag:lv:")) != -1) {
 		switch (ch) {
 		case 'a':
 			aflag = 1;
 			break;
 		case 'g':
 			gflag = 1;
 			uuid_from_string(optarg, (uuid_t *)&matchguid,
 			    &uuid_status);
 			if (uuid_status != uuid_s_ok) {
 				printf("uid %s could not be parsed\n", optarg);
 				return (CMD_ERROR);
 			}
 			break;
 		case 'l':
 			lflag = 1;
 			break;
 		case 'v':
 			vflag = 1;
 			if (strlen(optarg) >= nitems(varnamearg)) {
 				printf("Variable %s is longer than %zd characters\n",
 				    optarg, nitems(varnamearg));
 				return (CMD_ERROR);
 			}
 			for (i = 0; i < strlen(optarg); i++)
 				varnamearg[i] = optarg[i];
 			varnamearg[i] = 0;
 			break;
 		default:
 			printf("Invalid argument %c\n", ch);
 			return (CMD_ERROR);
 		}
 	}
 
 	if (aflag && (gflag || vflag)) {
 		printf("-a isn't compatible with -v or -u\n");
 		return (CMD_ERROR);
 	}
 
 	if (aflag && optind < argc) {
 		printf("-a doesn't take any args");
 		return (CMD_ERROR);
 	}
 
 	if (optind == argc)
 		aflag = 1;
 
 	argc -= optind;
 	argv += optind;
 
 	pager_open();
 	if (vflag && gflag) {
 		rv = efi_print_var(varnamearg, &matchguid, lflag);
 		pager_close();
 		return (rv);
 	}
 
 	if (argc == 2) {
 		optarg = argv[0];
 		if (strlen(optarg) >= nitems(varnamearg)) {
 			printf("Variable %s is longer than %zd characters\n",
 			    optarg, nitems(varnamearg));
 			pager_close();
 			return (CMD_ERROR);
 		}
 		for (i = 0; i < strlen(optarg); i++)
 			varnamearg[i] = optarg[i];
 		varnamearg[i] = 0;
 		optarg = argv[1];
 		uuid_from_string(optarg, (uuid_t *)&matchguid,
 		    &uuid_status);
 		if (uuid_status != uuid_s_ok) {
 			printf("uid %s could not be parsed\n", optarg);
 			pager_close();
 			return (CMD_ERROR);
 		}
 		rv = efi_print_var(varnamearg, &matchguid, lflag);
 		pager_close();
 		return (rv);
 	}
 
-	if (argc != 0) {
-		printf("Too many args\n");
+	if (argc > 0) {
+		printf("Too many args %d\n", argc);
 		pager_close();
 		return (CMD_ERROR);
 	}
 
 	/*
 	 * Initiate the search -- note the standard takes pain
 	 * to specify the initial call must be a poiner to a NULL
 	 * character.
 	 */
 	varalloc = 1024;
 	varname = malloc(varalloc);
 	if (varname == NULL) {
 		printf("Can't allocate memory to get variables\n");
 		pager_close();
 		return (CMD_ERROR);
 	}
 	varname[0] = 0;
 	while (1) {
 		varsz = varalloc;
 		status = RS->GetNextVariableName(&varsz, varname, &varguid);
 		if (status == EFI_BUFFER_TOO_SMALL) {
 			varalloc = varsz;
 			newnm = malloc(varalloc);
 			if (newnm == NULL) {
 				printf("Can't allocate memory to get variables\n");
 				free(varname);
 				pager_close();
 				return (CMD_ERROR);
 			}
 			memcpy(newnm, varname, varsz);
 			free(varname);
 			varname = newnm;
 			continue; /* Try again with bigger buffer */
 		}
 		if (status != EFI_SUCCESS)
 			break;
 		if (aflag) {
 			if (efi_print_var(varname, &varguid, lflag) != CMD_OK)
 				break;
 			continue;
 		}
 		if (vflag) {
 			if (wcscmp(varnamearg, varname) == 0) {
 				if (efi_print_var(varname, &varguid, lflag) != CMD_OK)
 					break;
 				continue;
 			}
 		}
 		if (gflag) {
 			if (memcmp(&varguid, &matchguid, sizeof(varguid)) == 0) {
 				if (efi_print_var(varname, &varguid, lflag) != CMD_OK)
 					break;
 				continue;
 			}
 		}
 	}
 	free(varname);
 	pager_close();
 
 	return (CMD_OK);
 }
 
 COMMAND_SET(efiset, "efi-set", "set EFI variables", command_efi_set);
 
 static int
 command_efi_set(int argc, char *argv[])
 {
 	char *uuid, *var, *val;
 	CHAR16 wvar[128];
 	EFI_GUID guid;
 	uint32_t status;
 	EFI_STATUS err;
 
 	if (argc != 4) {
 		printf("efi-set uuid var new-value\n");
 		return (CMD_ERROR);
 	}
 	uuid = argv[1];
 	var = argv[2];
 	val = argv[3];
 	uuid_from_string(uuid, (uuid_t *)&guid, &status);
 	if (status != uuid_s_ok) {
 		printf("Invalid uuid %s %d\n", uuid, status);
 		return (CMD_ERROR);
 	}
 	cpy8to16(var, wvar, sizeof(wvar));
 	err = RS->SetVariable(wvar, &guid,
 	    EFI_VARIABLE_NON_VOLATILE | EFI_VARIABLE_RUNTIME_ACCESS | EFI_VARIABLE_BOOTSERVICE_ACCESS,
 	    strlen(val) + 1, val);
 	if (EFI_ERROR(err)) {
 		printf("Failed to set variable: error %lu\n", EFI_ERROR_CODE(err));
 		return (CMD_ERROR);
 	}
 	return (CMD_OK);
 }
 
 COMMAND_SET(efiunset, "efi-unset", "delete / unset EFI variables", command_efi_unset);
 
 static int
 command_efi_unset(int argc, char *argv[])
 {
 	char *uuid, *var;
 	CHAR16 wvar[128];
 	EFI_GUID guid;
 	uint32_t status;
 	EFI_STATUS err;
 
 	if (argc != 3) {
 		printf("efi-unset uuid var\n");
 		return (CMD_ERROR);
 	}
 	uuid = argv[1];
 	var = argv[2];
 	uuid_from_string(uuid, (uuid_t *)&guid, &status);
 	if (status != uuid_s_ok) {
 		printf("Invalid uuid %s\n", uuid);
 		return (CMD_ERROR);
 	}
 	cpy8to16(var, wvar, sizeof(wvar));
 	err = RS->SetVariable(wvar, &guid, 0, 0, NULL);
 	if (EFI_ERROR(err)) {
 		printf("Failed to unset variable: error %lu\n", EFI_ERROR_CODE(err));
 		return (CMD_ERROR);
 	}
 	return (CMD_OK);
 }
 
 #ifdef LOADER_FDT_SUPPORT
 extern int command_fdt_internal(int argc, char *argv[]);
 
 /*
  * Since proper fdt command handling function is defined in fdt_loader_cmd.c,
  * and declaring it as extern is in contradiction with COMMAND_SET() macro
  * (which uses static pointer), we're defining wrapper function, which
  * calls the proper fdt handling routine.
  */
 static int
 command_fdt(int argc, char *argv[])
 {
 
 	/* Pass the arguments through untouched and return its result. */
 	return (command_fdt_internal(argc, argv));
 }
 
 COMMAND_SET(fdt, "fdt", "flattened device tree handling", command_fdt);
 #endif
 
 #ifdef EFI_ZFS_BOOT
 /*
  * Walk the efipart handles in index order and probe each "<name><n>:"
  * device for ZFS.  Every device that probes successfully is registered
  * under zfs_dev with the next free unit number and the GUID returned by
  * the probe.
  */
 static void
 efi_zfs_probe(void)
 {
 	char devname[SPECNAMELEN + 1];
 	EFI_HANDLE handle;
 	uint64_t pool_guid;
 	u_int next_unit;
 	int idx;
 
 	next_unit = 0;
 	idx = 0;
 	while ((handle = efi_find_handle(&efipart_dev, idx)) != NULL) {
 		snprintf(devname, sizeof(devname), "%s%d:",
 		    efipart_dev.dv_name, idx);
 		if (zfs_probe_dev(devname, &pool_guid) == 0) {
 			(void)efi_handle_update_dev(handle, &zfs_dev,
 			    next_unit, pool_guid);
 			next_unit++;
 		}
 		idx++;
 	}
 }
 #endif
Index: user/alc/PQ_LAUNDRY/sys/boot/fdt/dts/arm/olimex-a20-som-evb.dts
===================================================================
--- user/alc/PQ_LAUNDRY/sys/boot/fdt/dts/arm/olimex-a20-som-evb.dts	(revision 307895)
+++ user/alc/PQ_LAUNDRY/sys/boot/fdt/dts/arm/olimex-a20-som-evb.dts	(revision 307896)
@@ -1,43 +1,47 @@
 /*-
  * Copyright (c) 2015 Emmanuel Vadot <manu@bidouilliste.com>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include "sun7i-a20-olimex-som-evb.dts"
 #include "sun7i-a20-hdmi.dtsi"
 #include "xpowers-axp209.dtsi"
 
 / {
 	soc@01c00000 {
 		hdmi@01c16000 {
 			status = "okay";
 		};
 
 		hdmiaudio {
 			status = "okay";
 		};
 	};
 };
+
+&cpu0 {
+	cpu-supply = <&reg_dcdc2>;
+};
Index: user/alc/PQ_LAUNDRY/sys/boot/ficl/efi.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/boot/ficl/efi.c	(revision 307895)
+++ user/alc/PQ_LAUNDRY/sys/boot/ficl/efi.c	(nonexistent)
@@ -1,207 +0,0 @@
-/*-
- * Copyright (c) 2014 Netflix, Inc
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- *	$FreeBSD$
- */
-
-/*******************************************************************
-** e f i . c
-** Additional words for EFI
-** 
-*******************************************************************/
-
-#ifdef TESTMAIN
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <dirent.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#else
-#include <stand.h>
-#endif
-#include "bootstrap.h"
-#include <string.h>
-#include "ficl.h"
-
-/*
- *		FreeBSD's loader interaction words and extras
- *
- * 		efi-setenv  ( value n name n guid n attr -- 0 | -1)
- * 		efi-getenv  ( guid n addr n -- addr' n' | -1 )
- * 		efi-unsetenv ( name n guid n'' -- )
- */
-
-/*
- * efi-setenv
- * 		efi-setenv  ( value n name n guid n attr -- 0 | -1)
- *
- * Set environment variables using the SetVariable EFI runtime service.
- *
- * Value and guid are passed through in binary form (so guid needs to be
- * converted to binary form from its string form). Name is converted from
- * ASCII to CHAR16. Since ficl doesn't have support for internationalization,
- * there's no native CHAR16 interface provided.
- *
- * attr is an int in the bitmask of the following attributes for this variable.
- *
- *	1	Non volatile
- *	2	Boot service access
- *	4	Run time access
- * (corresponding to the same bits in the UEFI spec).
- */
-void
-ficlEfiSetenv(FICL_VM *pVM)
-{
-#ifndef TESTMAIN
-	char	*value, *guid;
-	CHAR16	*name
-	int	i;
-#endif
-	char	*namep, *valuep, *guidp;
-	int	names, values, guids, attr;
-
-#if FICL_ROBUST > 1
-	vmCheckStack(pVM, 6, 0);
-#endif
-	attr = stackPopINT(pVM->pStack);
-	guids = stackPopINT(pVM->pStack);
-	guidp = (char*)stackPopPtr(pVM->pStack);
-	names = stackPopINT(pVM->pStack);
-	namep = (char*)stackPopPtr(pVM->pStack);
-	values = stackPopINT(pVM->pStack);
-	valuep = (char*)stackPopPtr(pVM->pStack);
-
-#ifndef TESTMAIN
-	guid = (char*)ficlMalloc(guids);
-	if (guid != NULL)
-		vmThrowErr(pVM, "Error: out of memory");
-	memcpy(guid, guidp, guids);
-
-	name = (char*)ficlMalloc((names + 1) * sizeof(CHAR16));
-	if (name == NULL)
-		vmThrowErr(pVM, "Error: out of memory");
-	for (i = 0; i < names; i++)
-		name[i] = namep[i];
-	name[names] = (CHAR16)0;
-
-	value = (char*)ficlMalloc(values + 1);
-	if (value != NULL)
-		vmThrowErr(pVM, "Error: out of memory");
-	memcpy(value, valuep, values);
-
-	status = efi_set_variable(name, guid, attr, value);
-	if (status == EFI_SUCCESS)
-		stackPushINT(pVM->pStack, 0);
-	else
-		stackPushINT(pVM->pStack, -1);
-
-	ficlFree(name);
-	ficlFree(value);
-	ficlFree(guid);
-#endif
-
-	return;
-}
-
-void
-ficlEfiGetenv(FICL_VM *pVM)
-{
-#ifndef TESTMAIN
-	char	*name, *value;
-#endif
-	char	*namep;
-	int	names;
-
-#if FICL_ROBUST > 1
-	vmCheckStack(pVM, 2, 2);
-#endif
-	names = stackPopINT(pVM->pStack);
-	namep = (char*) stackPopPtr(pVM->pStack);
-
-#ifndef TESTMAIN
-	name = (char*) ficlMalloc(names+1);
-	if (!name)
-		vmThrowErr(pVM, "Error: out of memory");
-	strncpy(name, namep, names);
-	name[names] = '\0';
-
-	value = getenv(name);
-	ficlFree(name);
-
-	if(value != NULL) {
-		stackPushPtr(pVM->pStack, value);
-		stackPushINT(pVM->pStack, strlen(value));
-	} else
-#endif
-		stackPushINT(pVM->pStack, -1);
-
-	return;
-}
-
-void
-ficlEfiUnsetenv(FICL_VM *pVM)
-{
-#ifndef TESTMAIN
-	char	*name;
-#endif
-	char	*namep;
-	int	names;
-
-#if FICL_ROBUST > 1
-	vmCheckStack(pVM, 2, 0);
-#endif
-	names = stackPopINT(pVM->pStack);
-	namep = (char*) stackPopPtr(pVM->pStack);
-
-#ifndef TESTMAIN
-	name = (char*) ficlMalloc(names+1);
-	if (!name)
-		vmThrowErr(pVM, "Error: out of memory");
-	strncpy(name, namep, names);
-	name[names] = '\0';
-
-	unsetenv(name);
-	ficlFree(name);
-#endif
-
-	return;
-}
-/**************************************************************************
-
-** Build FreeBSD platform extensions into the system dictionary
-**************************************************************************/
-void ficlEfiCompilePlatform(FICL_SYSTEM *pSys)
-{
-    FICL_DICT *dp = pSys->dp;
-    assert (dp);
-
-    dictAppendWord(dp, "efi-setenv",    ficlEfiSetenv,	    FW_DEFAULT);
-    dictAppendWord(dp, "efi-getenv",    ficlEfiGetenv,	    FW_DEFAULT);
-    dictAppendWord(dp, "efi-unsetenv",  ficlEfiUnsetenv,    FW_DEFAULT);
-
-    return;
-}

Property changes on: user/alc/PQ_LAUNDRY/sys/boot/ficl/efi.c
___________________________________________________________________
Deleted: svn:eol-style
## -1 +0,0 ##
-native
\ No newline at end of property
Deleted: svn:keywords
## -1 +0,0 ##
-FreeBSD=%H
\ No newline at end of property
Deleted: svn:mime-type
## -1 +0,0 ##
-text/plain
\ No newline at end of property
Index: user/alc/PQ_LAUNDRY/sys/boot/ficl/loader.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/boot/ficl/loader.c	(revision 307895)
+++ user/alc/PQ_LAUNDRY/sys/boot/ficl/loader.c	(revision 307896)
@@ -1,842 +1,841 @@
 /*-
  * Copyright (c) 2000 Daniel Capo Sobral
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$FreeBSD$
  */
 
 /*******************************************************************
 ** l o a d e r . c
 ** Additional FICL words designed for FreeBSD's loader
 ** 
 *******************************************************************/
 
 #ifdef TESTMAIN
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <dirent.h>
 #include <fcntl.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
 #else
 #include <stand.h>
 #endif
 #include "bootstrap.h"
 #include <string.h>
 #include <uuid.h>
 #include "ficl.h"
 
 /*		FreeBSD's loader interaction words and extras
  *
  * 		setenv      ( value n name n' -- )
  * 		setenv?     ( value n name n' flag -- )
  * 		getenv      ( addr n -- addr' n' | -1 )
  * 		unsetenv    ( addr n -- )
  * 		copyin      ( addr addr' len -- )
  * 		copyout     ( addr addr' len -- )
  * 		findfile    ( name len type len' -- addr )
  * 		pnpdevices  ( -- addr )
  * 		pnphandlers ( -- addr )
  * 		ccall       ( [[...[p10] p9] ... p1] n addr -- result )
  *		uuid-from-string ( addr n -- addr' )
  *		uuid-to-string ( addr' -- addr n )
  * 		.#	    ( value -- )
  */
 
 /*
  * setenv ( value n name n' -- )
  *
  * Pops a name and a value, each as address plus length, makes
  * NUL-terminated copies and calls setenv() with overwrite enabled.
  * When TESTMAIN is defined the operands are popped and discarded.
  */
 void
 ficlSetenv(FICL_VM *pVM)
 {
 #ifndef TESTMAIN
 	char	*name, *value;
 #endif
 	char	*namep, *valuep;
 	int	names, values;
 
 #if FICL_ROBUST > 1
 	vmCheckStack(pVM, 4, 0);
 #endif
 	names = stackPopINT(pVM->pStack);
 	namep = (char*) stackPopPtr(pVM->pStack);
 	values = stackPopINT(pVM->pStack);
 	valuep = (char*) stackPopPtr(pVM->pStack);
 
 #ifndef TESTMAIN
 	name = (char*) ficlMalloc(names+1);
 	if (!name)
 		vmThrowErr(pVM, "Error: out of memory");
 	strncpy(name, namep, names);
 	name[names] = '\0';
 	value = (char*) ficlMalloc(values+1);
 	if (!value) {
 		/*
 		 * Release the name copy first so it is not leaked.
 		 * Presumably vmThrowErr() does not return (the code
 		 * below would otherwise write through NULL).
 		 */
 		ficlFree(name);
 		vmThrowErr(pVM, "Error: out of memory");
 	}
 	strncpy(value, valuep, values);
 	value[values] = '\0';
 
 	setenv(name, value, 1);
 	ficlFree(name);
 	ficlFree(value);
 #endif
 
 	return;
 }
 
 /*
  * setenv? ( value n name n' flag -- )
  *
  * Like setenv, but the popped flag is passed to setenv() as its
  * overwrite argument.  When TESTMAIN is defined the operands are popped
  * and discarded.
  */
 void
 ficlSetenvq(FICL_VM *pVM)
 {
 #ifndef TESTMAIN
 	char	*name, *value;
 #endif
 	char	*namep, *valuep;
 	int	names, values, overwrite;
 
 #if FICL_ROBUST > 1
 	vmCheckStack(pVM, 5, 0);
 #endif
 	overwrite = stackPopINT(pVM->pStack);
 	names = stackPopINT(pVM->pStack);
 	namep = (char*) stackPopPtr(pVM->pStack);
 	values = stackPopINT(pVM->pStack);
 	valuep = (char*) stackPopPtr(pVM->pStack);
 
 #ifndef TESTMAIN
 	name = (char*) ficlMalloc(names+1);
 	if (!name)
 		vmThrowErr(pVM, "Error: out of memory");
 	strncpy(name, namep, names);
 	name[names] = '\0';
 	value = (char*) ficlMalloc(values+1);
 	if (!value) {
 		/*
 		 * Release the name copy first so it is not leaked.
 		 * Presumably vmThrowErr() does not return (the code
 		 * below would otherwise write through NULL).
 		 */
 		ficlFree(name);
 		vmThrowErr(pVM, "Error: out of memory");
 	}
 	strncpy(value, valuep, values);
 	value[values] = '\0';
 
 	setenv(name, value, overwrite);
 	ficlFree(name);
 	ficlFree(value);
 #endif
 
 	return;
 }
 
 /*
  * getenv ( addr n -- addr' n' | -1 )
  *
  * Pops a name as address plus length, looks it up with getenv() and
  * pushes the value pointer and its length, or -1 when it is not set.
  * When TESTMAIN is defined no lookup is done and -1 is always pushed.
  */
 void
 ficlGetenv(FICL_VM *pVM)
 {
 #ifndef TESTMAIN
 	char	*name, *value;
 #endif
 	char	*namep;
 	int	names;
 
 #if FICL_ROBUST > 1
 	vmCheckStack(pVM, 2, 2);
 #endif
 	names = stackPopINT(pVM->pStack);
 	namep = (char*) stackPopPtr(pVM->pStack);
 
 #ifndef TESTMAIN
 	/* getenv() needs a NUL-terminated name, so copy and terminate. */
 	name = (char*) ficlMalloc(names+1);
 	if (!name)
 		vmThrowErr(pVM, "Error: out of memory");
 	strncpy(name, namep, names);
 	name[names] = '\0';
 
 	value = getenv(name);
 	ficlFree(name);
 
 	if(value != NULL) {
 		stackPushPtr(pVM->pStack, value);
 		stackPushINT(pVM->pStack, strlen(value));
 	} else
 #endif
 		/* Body of the else above, or the only statement under TESTMAIN. */
 		stackPushINT(pVM->pStack, -1);
 
 	return;
 }
 
 /*
  * unsetenv ( addr n -- )
  *
  * Pops a name as address plus length and calls unsetenv() on a
  * NUL-terminated copy.  When TESTMAIN is defined the operands are only
  * popped.
  */
 void
 ficlUnsetenv(FICL_VM *pVM)
 {
 #ifndef TESTMAIN
 	char	*key;
 #endif
 	char	*src;
 	int	len;
 
 #if FICL_ROBUST > 1
 	vmCheckStack(pVM, 2, 0);
 #endif
 	len = stackPopINT(pVM->pStack);
 	src = (char*) stackPopPtr(pVM->pStack);
 
 #ifndef TESTMAIN
 	key = (char*) ficlMalloc(len + 1);
 	if (key == NULL)
 		vmThrowErr(pVM, "Error: out of memory");
 	strncpy(key, src, len);
 	key[len] = '\0';
 
 	unsetenv(key);
 	ficlFree(key);
 #endif
 }
 
 /*
  * copyin ( addr addr' len -- )
  *
  * Pops length, destination and source (in that order) and passes them
  * to archsw.arch_copyin().  Nothing is copied when TESTMAIN is defined.
  */
 void
 ficlCopyin(FICL_VM *pVM)
 {
 	size_t		nbytes;
 	vm_offset_t	to;
 	void		*from;
 
 #if FICL_ROBUST > 1
 	vmCheckStack(pVM, 3, 0);
 #endif
 
 	nbytes = stackPopINT(pVM->pStack);
 	to = stackPopINT(pVM->pStack);
 	from = stackPopPtr(pVM->pStack);
 
 #ifndef TESTMAIN
 	archsw.arch_copyin(from, to, nbytes);
 #endif
 }
 
 /*
  * copyout ( addr addr' len -- )
  *
  * Pops length, destination pointer and source offset (in that order)
  * and passes them to archsw.arch_copyout().  Nothing is copied when
  * TESTMAIN is defined.
  */
 void
 ficlCopyout(FICL_VM *pVM)
 {
 	size_t		nbytes;
 	void		*to;
 	vm_offset_t	from;
 
 #if FICL_ROBUST > 1
 	vmCheckStack(pVM, 3, 0);
 #endif
 
 	nbytes = stackPopINT(pVM->pStack);
 	to = stackPopPtr(pVM->pStack);
 	from = stackPopINT(pVM->pStack);
 
 #ifndef TESTMAIN
 	archsw.arch_copyout(from, to, nbytes);
 #endif
 }
 
 /*
  * findfile ( name len type len' -- addr )
  *
  * Pops a type and a name, each as address plus length, and pushes the
  * pointer returned by file_findfile() for NUL-terminated copies of
  * them.  When TESTMAIN is defined a NULL pointer is pushed instead.
  */
 void
 ficlFindfile(FICL_VM *pVM)
 {
 #ifndef TESTMAIN
 	char	*name, *type;
 #endif
 	char	*namep, *typep;
 	struct	preloaded_file* fp;
 	int	names, types;
 
 #if FICL_ROBUST > 1
 	vmCheckStack(pVM, 4, 1);
 #endif
 
 	types = stackPopINT(pVM->pStack);
 	typep = (char*) stackPopPtr(pVM->pStack);
 	names = stackPopINT(pVM->pStack);
 	namep = (char*) stackPopPtr(pVM->pStack);
 #ifndef TESTMAIN
 	name = (char*) ficlMalloc(names+1);
 	if (!name)
 		vmThrowErr(pVM, "Error: out of memory");
 	strncpy(name, namep, names);
 	name[names] = '\0';
 	type = (char*) ficlMalloc(types+1);
 	if (!type) {
 		/* Don't leak the name copy on the error path. */
 		ficlFree(name);
 		vmThrowErr(pVM, "Error: out of memory");
 	}
 	strncpy(type, typep, types);
 	type[types] = '\0';
 
 	fp = file_findfile(name, type);
 	/*
 	 * The copies were only needed for the lookup.
 	 * NOTE(review): assumes file_findfile() does not keep the
 	 * pointers it is given -- confirm.
 	 */
 	ficlFree(name);
 	ficlFree(type);
 #else
 	fp = NULL;
 #endif
 	stackPushPtr(pVM->pStack, fp);
 
 	return;
 }
 
 /*
  * ccall ( [[...[p10] p9] ... p1] n addr -- result )
  *
  * Pops a function pointer and a parameter count n, pops n integer
  * parameters, calls the function with ten integer arguments and pushes
  * its result.  Argument slots beyond n are passed as zero.  A count
  * larger than ten is rejected instead of overrunning the local array.
  */
 void
 ficlCcall(FICL_VM *pVM)
 {
 	int (*func)(int, ...);
 	int result, p[10] = { 0 };
 	int nparam, i;
 
 #if FICL_ROBUST > 1
 	vmCheckStack(pVM, 2, 0);
 #endif
 
 	func = stackPopPtr(pVM->pStack);
 	nparam = stackPopINT(pVM->pStack);
 
 	/* p[] has room for ten parameters only. */
 	if (nparam > (int)(sizeof(p) / sizeof(p[0]))) {
 		vmThrowErr(pVM, "Error: ccall takes at most 10 parameters");
 		return;
 	}
 
 #if FICL_ROBUST > 1
 	vmCheckStack(pVM, nparam, 1);
 #endif
 
 	for (i = 0; i < nparam; i++)
 		p[i] = stackPopINT(pVM->pStack);
 
 	result = func(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8],
 	    p[9]);
 
 	stackPushINT(pVM->pStack, result);
 
 	return;
 }
 
 /*
  * uuid-from-string ( addr n -- addr' )
  *
  * Pops a string as address plus length, parses it with
  * uuid_from_string() and pushes a pointer to a ficlMalloc()ed uuid_t,
  * or NULL when the string does not parse.  When TESTMAIN is defined
  * NULL is always pushed.
  */
 void
 ficlUuidFromString(FICL_VM *pVM)
 {
 #ifndef	TESTMAIN
 	char	*uuid;
 	uint32_t status;
 #endif
 	char	*uuidp;
 	int	uuids;
 	uuid_t	*u;
 
 #if FICL_ROBUST > 1
 	vmCheckStack(pVM, 2, 0);
 #endif
 
 	uuids = stackPopINT(pVM->pStack);
 	uuidp = (char *) stackPopPtr(pVM->pStack);
 
 #ifndef	TESTMAIN
 	uuid = (char *)ficlMalloc(uuids + 1);
 	if (!uuid)
 		vmThrowErr(pVM, "Error: out of memory");
 	strncpy(uuid, uuidp, uuids);
 	uuid[uuids] = '\0';
 
 	u = (uuid_t *)ficlMalloc(sizeof (*u));
 	if (u == NULL) {
 		/* Don't hand a NULL destination to uuid_from_string(). */
 		ficlFree(uuid);
 		vmThrowErr(pVM, "Error: out of memory");
 	}
 
 	uuid_from_string(uuid, u, &status);
 	ficlFree(uuid);
 	if (status != uuid_s_ok) {
 		ficlFree(u);
 		u = NULL;
 	}
 #else
 	u = NULL;
 #endif
 	stackPushPtr(pVM->pStack, u);
 
 
 	return;
 }
 
 /*
  * uuid-to-string ( addr' -- addr n | -1 )
  *
  * Pops a uuid_t pointer and converts it with uuid_to_string().  On
  * success the string pointer and its length are pushed; on failure, or
  * when TESTMAIN is defined, -1 is pushed.
  */
 void
 ficlUuidToString(FICL_VM *pVM)
 {
 #ifndef	TESTMAIN
 	char	*uuid;
 	uint32_t status;
 #endif
 	uuid_t	*u;
 
 #if FICL_ROBUST > 1
 	vmCheckStack(pVM, 1, 0);
 #endif
 
 	u = (uuid_t *)stackPopPtr(pVM->pStack);
 
 #ifndef	TESTMAIN
 	uuid_to_string(u, &uuid, &status);
 	/* The string is only valid when the conversion succeeded. */
 	if (status == uuid_s_ok) {
 		stackPushPtr(pVM->pStack, uuid);
 		stackPushINT(pVM->pStack, strlen(uuid));
 	} else
 #endif
 		stackPushINT(pVM->pStack, -1);
 
 	return;
 }
 
 /**************************************************************************
                         f i c l E x e c F D
 ** reads in text from file fd and passes it to ficlExec()
  * returns VM_OUTOFTEXT on success or the ficlExec() error code on
  * failure.
  */ 
 #define nLINEBUF 256
 /*
  * Read fd one byte at a time, split the input at '\n' and pass each
  * non-empty line to ficlExecC().  Stops early and returns that result
  * when it is anything other than VM_QUIT, VM_USEREXIT or VM_OUTOFTEXT.
  * The VM's sourceID is set to fd for the duration and restored before
  * returning.  Characters beyond nLINEBUF on one line are dropped.
  */
 int ficlExecFD(FICL_VM *pVM, int fd)
 {
     char    cp[nLINEBUF];
     int     nLine = 0, rval = VM_OUTOFTEXT;
     char    ch;
     CELL    id;
 
     id = pVM->sourceID;
     pVM->sourceID.i = fd;
 
     /* feed each line to ficlExec */
     while (1) {
 	int status, i;
 
 	i = 0;
 	while ((status = read(fd, &ch, 1)) > 0 && ch != '\n') {
 	    /* Never store past the end of cp[]. */
 	    if (i < nLINEBUF)
 		cp[i++] = ch;
 	}
         nLine++;
 	if (!i) {
 	    /* Empty line: stop at end of input, otherwise skip it. */
 	    if (status < 1)
 		break;
 	    continue;
 	}
         rval = ficlExecC(pVM, cp, i);
 	if(rval != VM_QUIT && rval != VM_USEREXIT && rval != VM_OUTOFTEXT)
         {
             pVM->sourceID = id;
             return rval; 
         }
     }
     /*
     ** Pass an empty line with SOURCE-ID == -1 to flush
     ** any pending REFILLs (as required by FILE wordset)
     */
     pVM->sourceID.i = -1;
     ficlExec(pVM, "");
 
     pVM->sourceID = id;
     return rval;
 }
 
 /*
  * Pop one cell, format its integer value into the VM pad in the VM's
  * current base and send the pad to vmTextOut().
  */
 static void displayCellNoPad(FICL_VM *pVM)
 {
     CELL top;
 
 #if FICL_ROBUST > 1
     vmCheckStack(pVM, 1, 0);
 #endif
     top = stackPop(pVM->pStack);
     ltoa(top.i, pVM->pad, pVM->base);
     vmTextOut(pVM, pVM->pad, 0);
 }
 
 /*      isdir? - Return whether an fd corresponds to a directory.
  *
  * isdir? ( fd -- bool )
  */
 static void isdirQuestion(FICL_VM *pVM)
 {
     struct stat st;
     FICL_INT isdir;
     int fd;
 
 #if FICL_ROBUST > 1
     vmCheckStack(pVM, 1, 1);
 #endif
 
     fd = stackPopINT(pVM->pStack);
 
     /*
      * True only when every test passes; the tests short-circuit in
      * order, so fstat() is never called on a negative descriptor.
      */
     if (fd >= 0 && fstat(fd, &st) >= 0 && S_ISDIR(st.st_mode))
         isdir = FICL_TRUE;
     else
         isdir = FICL_FALSE;
 
     stackPushINT(pVM->pStack, isdir);
 }
 
 /*          fopen - open a file and return new fd on stack.
  *
  * fopen ( ptr count mode -- fd )
  */
 /*
  * Pops mode, count and ptr, opens the file named by the first count
  * bytes at ptr and pushes the descriptor returned by open().  Pushes -1
  * when count is negative, ptr is NULL or memory cannot be allocated.
  */
 static void pfopen(FICL_VM *pVM)
 {
     int     mode, fd, count;
     char    *ptr, *name;
 
 #if FICL_ROBUST > 1
     vmCheckStack(pVM, 3, 1);
 #endif
 
     mode = stackPopINT(pVM->pStack);    /* get mode */
     count = stackPopINT(pVM->pStack);   /* get count */
     ptr = stackPopPtr(pVM->pStack);     /* get ptr */
 
     if ((count < 0) || (ptr == NULL)) {
         stackPushINT(pVM->pStack, -1);
         return;
     }
 
     /* ensure that the string is null terminated */
     name = (char *)malloc(count+1);
     if (name == NULL) {
         /* Out of memory: report failure rather than copy into NULL. */
         stackPushINT(pVM->pStack, -1);
         return;
     }
     bcopy(ptr,name,count);
     name[count] = 0;
 
     /* open the file */
     fd = open(name, mode);
     free(name);
     stackPushINT(pVM->pStack, fd);
     return;
 }
  
 /*          fclose - close a file who's fd is on stack.
  *
  * fclose ( fd -- )
  */
 static void pfclose(FICL_VM *pVM)
 {
     int handle;
 
 #if FICL_ROBUST > 1
     vmCheckStack(pVM, 1, 0);
 #endif
     handle = stackPopINT(pVM->pStack);
 
     /* -1 is the "no file" marker; there is nothing to close. */
     if (handle == -1)
 	return;
     close(handle);
 }
 
 /*          fread - read file contents
  *
  * fread  ( fd buf nbytes  -- nread )
  */
 static void pfread(FICL_VM *pVM)
 {
     FICL_INT nread;
     char *dst;
     int fd, want;
 
 #if FICL_ROBUST > 1
     vmCheckStack(pVM, 3, 1);
 #endif
     want = stackPopINT(pVM->pStack);	/* byte count */
     dst = stackPopPtr(pVM->pStack);	/* destination buffer */
     fd = stackPopINT(pVM->pStack);	/* descriptor */
 
     /* -1 is pushed unless all three operands are usable. */
     nread = -1;
     if (want > 0 && dst && fd != -1)
 	nread = read(fd, dst, want);
     stackPushINT(pVM->pStack, nread);
 }
 
 /*      freaddir - read directory contents
  *
  * freaddir ( fd -- ptr len TRUE | FALSE )
  */
 /*
  * Pops a descriptor and fetches one directory entry for it.  Pushes the
  * entry name, its length and FICL_TRUE when an entry was obtained, or
  * just FICL_FALSE otherwise.  Outside the test build the entry comes
  * from readdirfd().
  *
  * NOTE(review): the extra locals are guarded by "#ifdef TESTMAIN" but
  * the code that uses them by "#if TESTMAIN"; the two disagree when
  * TESTMAIN is defined as 0 or without a value -- confirm how it is set.
  */
 static void pfreaddir(FICL_VM *pVM)
 {
 #ifdef TESTMAIN
     static struct dirent dirent;
     struct stat sb;
     char *buf;
     off_t off, ptr;
     u_int blksz;
     int bufsz;
 #endif
     struct dirent *d;
     int fd;
 
 #if FICL_ROBUST > 1
     vmCheckStack(pVM, 1, 3);
 #endif
 
     fd = stackPopINT(pVM->pStack);
 #if TESTMAIN
     /*
      * The readdirfd() function is specific to the loader environment.
      * We do the best we can to make freaddir work, but it's not at
      * all guaranteed.
      */
     d = NULL;
     buf = NULL;
     do {
 	if (fd == -1)
 	    break;
 	if (fstat(fd, &sb) == -1)
 	    break;
 	blksz = (sb.st_blksize) ? sb.st_blksize : getpagesize();
 	/* Give up unless the block size is a power of two. */
 	if ((blksz & (blksz - 1)) != 0)
 	    break;
 	buf = malloc(blksz);
 	if (buf == NULL)
 	    break;
 	/* Remember the current offset, then rescan from the start. */
 	off = lseek(fd, 0LL, SEEK_CUR);
 	if (off == -1)
 	    break;
 	ptr = off;
 	if (lseek(fd, 0, SEEK_SET) == -1)
 	    break;
 	/* Skip whole getdents() chunks until ptr falls inside one. */
 	bufsz = getdents(fd, buf, blksz);
 	while (bufsz > 0 && bufsz <= ptr) {
 	    ptr -= bufsz;
 	    bufsz = getdents(fd, buf, blksz);
 	}
 	if (bufsz <= 0)
 	    break;
 	d = (void *)(buf + ptr);
 	/* Copy the entry out of buf, which is freed below. */
 	dirent = *d;
 	off += d->d_reclen;
 	/* Advance the descriptor past this entry for the next call. */
 	d = (lseek(fd, off, SEEK_SET) != off) ? NULL : &dirent;
     } while (0);
     if (buf != NULL)
 	free(buf);
 #else
     d = readdirfd(fd);
 #endif
     if (d != NULL) {
         stackPushPtr(pVM->pStack, d->d_name);
         stackPushINT(pVM->pStack, strlen(d->d_name));
         stackPushINT(pVM->pStack, FICL_TRUE);
     } else {
         stackPushINT(pVM->pStack, FICL_FALSE);
     }
 }
 
 /*          fload - interpret file contents
  *
  * fload  ( fd -- )
  */
 static void pfload(FICL_VM *pVM)
 {
     int handle;
 
 #if FICL_ROBUST > 1
     vmCheckStack(pVM, 1, 0);
 #endif
     handle = stackPopINT(pVM->pStack);
 
     /* -1 is the "no file" marker; nothing to interpret. */
     if (handle == -1)
 	return;
     ficlExecFD(pVM, handle);
 }
 
 /*          fwrite - write file contents
  *
  * fwrite  ( fd buf nbytes  -- nwritten )
  */
 static void pfwrite(FICL_VM *pVM)
 {
     FICL_INT nwritten;
     char *src;
     int fd, want;
 
 #if FICL_ROBUST > 1
     vmCheckStack(pVM, 3, 1);
 #endif
     want = stackPopINT(pVM->pStack);	/* byte count */
     src = stackPopPtr(pVM->pStack);	/* source buffer */
     fd = stackPopINT(pVM->pStack);	/* descriptor */
 
     /* -1 is pushed unless all three operands are usable. */
     nwritten = -1;
     if (want > 0 && src && fd != -1)
 	nwritten = write(fd, src, want);
     stackPushINT(pVM->pStack, nwritten);
 }
 
 /*          fseek - seek to a new position in a file
  *
  * fseek  ( fd ofs whence  -- pos )
  */
 static void pfseek(FICL_VM *pVM)
 {
     FICL_INT newpos;
     int fd, offset, origin;
 
 #if FICL_ROBUST > 1
     vmCheckStack(pVM, 3, 1);
 #endif
     /* Operands come off the stack in reverse order of the picture. */
     origin = stackPopINT(pVM->pStack);
     offset = stackPopINT(pVM->pStack);
     fd = stackPopINT(pVM->pStack);
 
     newpos = lseek(fd, offset, origin);
     stackPushINT(pVM->pStack, newpos);
 }
 
 /*           key - get a character from stdin
  *
  * key ( -- char )
  *
  * Pushes the return value of getchar() unchanged, so whatever getchar()
  * reports on end-of-input is pushed as well.
  */
 static void key(FICL_VM *pVM)
 {
 #if FICL_ROBUST > 1
     vmCheckStack(pVM, 0, 1);
 #endif
     stackPushINT(pVM->pStack, getchar());
     return;
 }
 
 /*           key? - check for a character from stdin (FACILITY)
  *
  * key? ( -- flag )
  *
  * Pushes FICL_TRUE or FICL_FALSE.  In TESTMAIN builds the flag is always
  * FICL_TRUE; otherwise it reflects the result of ischar().
  */
 static void keyQuestion(FICL_VM *pVM)
 {
 #if FICL_ROBUST > 1
     vmCheckStack(pVM, 0, 1);
 #endif
 #ifdef TESTMAIN
     /* XXX Since we don't fiddle with termios, let it always succeed... */
     stackPushINT(pVM->pStack, FICL_TRUE);
 #else
     /* But here do the right thing. */
     stackPushINT(pVM->pStack, ischar()? FICL_TRUE : FICL_FALSE);
 #endif
     return;
 }
 
 /* seconds - gives number of seconds since beginning of time
  *
  * beginning of time is defined as:
  *
  *	BTX	- number of seconds since midnight
  *	FreeBSD	- number of seconds since Jan 1 1970
  *
  * seconds ( -- u )
  *
  * Pushes the value of time(NULL), cast to FICL_UNS, as an unsigned cell.
  */
 static void pseconds(FICL_VM *pVM)
 {
 #if FICL_ROBUST > 1
     vmCheckStack(pVM,0,1);
 #endif
     stackPushUNS(pVM->pStack, (FICL_UNS) time(NULL));
     return;
 }
 
 /* ms - wait at least that many milliseconds (FACILITY)
  *
  * ms ( u -- )
  *
  * Pops an unsigned count and multiplies it by 1000 before passing it to
  * usleep() (TESTMAIN builds) or delay() (all other builds).
  *
  * NOTE(review): delay() is presumably microsecond-based like usleep();
  * the multiplication is unchecked, so very large counts may wrap --
  * confirm against the width of FICL_UNS.
  */
 static void ms(FICL_VM *pVM)
 {
 #if FICL_ROBUST > 1
     vmCheckStack(pVM,1,0);
 #endif
 #ifdef TESTMAIN
     usleep(stackPopUNS(pVM->pStack)*1000);
 #else
     delay(stackPopUNS(pVM->pStack)*1000);
 #endif
     return;
 }
 
 /*           fkey - get a character from a file
  *
  * fkey ( file -- char )
  *
  * Pops a file descriptor, reads a single byte from it with read() and
  * pushes that byte.  If read() returns 0 or a negative value, -1 is
  * pushed instead.
  *
  * NOTE(review): ch is a plain char.  Where char is signed, a 0xFF byte
  * is pushed as -1 and cannot be told apart from the failure value --
  * confirm whether callers rely on -1 meaning end of file.
  */
 static void fkey(FICL_VM *pVM)
 {
     int i, fd;
     char ch;
 
 #if FICL_ROBUST > 1
     vmCheckStack(pVM, 1, 1);
 #endif
     fd = stackPopINT(pVM->pStack);
     i = read(fd, &ch, 1);	/* i is the number of bytes actually read */
     stackPushINT(pVM->pStack, i > 0 ? ch : -1);
     return;
 }
 
 
 /*
 ** Retrieves free space remaining on the dictionary
 **
 ** ( -- n )
 **
 ** Pushes the value of dictCellsAvail() for the dictionary returned by
 ** ficlGetDict(pVM->pSys).  No stack check is performed here.
 */
 
 static void freeHeap(FICL_VM *pVM)
 {
     stackPushINT(pVM->pStack, dictCellsAvail(ficlGetDict(pVM->pSys)));
 }
 
 
 /******************* Increase dictionary size on-demand ******************/
  
 /*
 ** ( -- addr )
 ** Pushes the address of the dictThreshold variable so that Forth code
 ** can read or store it directly.
 */
 static void ficlDictThreshold(FICL_VM *pVM)
 {
     stackPushPtr(pVM->pStack, &dictThreshold);
 }
  
 /*
 ** ( -- addr )
 ** Pushes the address of the dictIncrease variable so that Forth code
 ** can read or store it directly.
 */
 static void ficlDictIncrease(FICL_VM *pVM)
 {
     stackPushPtr(pVM->pStack, &dictIncrease);
 }
 
 /**************************************************************************
                         f i c l C o m p i l e P l a t f o r m
 ** Build FreeBSD platform extensions into the system dictionary
 **
 ** Appends each platform word to pSys->dp with FW_DEFAULT flags, then
 ** calls every function registered in the Xficl_compile_set linker set,
 ** and finally sets the arch-* environment flags selected at compile time.
 **************************************************************************/
 void ficlCompilePlatform(FICL_SYSTEM *pSys)
 {
     ficlCompileFcn **fnpp;
     FICL_DICT *dp = pSys->dp;
     assert (dp);
 
     /* File, console and timing words implemented in this file. */
     dictAppendWord(dp, ".#",        displayCellNoPad,    FW_DEFAULT);
     dictAppendWord(dp, "isdir?",    isdirQuestion,  FW_DEFAULT);
     dictAppendWord(dp, "fopen",	    pfopen,	    FW_DEFAULT);
     dictAppendWord(dp, "fclose",    pfclose,	    FW_DEFAULT);
     dictAppendWord(dp, "fread",	    pfread,	    FW_DEFAULT);
     dictAppendWord(dp, "freaddir",  pfreaddir,	    FW_DEFAULT);
     dictAppendWord(dp, "fload",	    pfload,	    FW_DEFAULT);
     dictAppendWord(dp, "fkey",	    fkey,	    FW_DEFAULT);
     dictAppendWord(dp, "fseek",     pfseek,	    FW_DEFAULT);
     dictAppendWord(dp, "fwrite",    pfwrite,	    FW_DEFAULT);
     dictAppendWord(dp, "key",	    key,	    FW_DEFAULT);
     dictAppendWord(dp, "key?",	    keyQuestion,    FW_DEFAULT);
     dictAppendWord(dp, "ms",        ms,             FW_DEFAULT);
     dictAppendWord(dp, "seconds",   pseconds,       FW_DEFAULT);
     dictAppendWord(dp, "heap?",     freeHeap,       FW_DEFAULT);
     dictAppendWord(dp, "dictthreshold", ficlDictThreshold, FW_DEFAULT);
     dictAppendWord(dp, "dictincrease", ficlDictIncrease, FW_DEFAULT);
 
     /* Environment, memory-copy, file lookup and UUID helper words. */
     dictAppendWord(dp, "setenv",    ficlSetenv,	    FW_DEFAULT);
     dictAppendWord(dp, "setenv?",   ficlSetenvq,    FW_DEFAULT);
     dictAppendWord(dp, "getenv",    ficlGetenv,	    FW_DEFAULT);
     dictAppendWord(dp, "unsetenv",  ficlUnsetenv,   FW_DEFAULT);
     dictAppendWord(dp, "copyin",    ficlCopyin,	    FW_DEFAULT);
     dictAppendWord(dp, "copyout",   ficlCopyout,    FW_DEFAULT);
     dictAppendWord(dp, "findfile",  ficlFindfile,   FW_DEFAULT);
     dictAppendWord(dp, "ccall",	    ficlCcall,	    FW_DEFAULT);
     dictAppendWord(dp, "uuid-from-string", ficlUuidFromString, FW_DEFAULT);
     dictAppendWord(dp, "uuid-to-string", ficlUuidToString, FW_DEFAULT);
 
     /* Run every compile hook registered in the Xficl_compile_set set. */
-    SET_FOREACH(fnpp, Xficl_compile_set) {
+    SET_FOREACH(fnpp, Xficl_compile_set)
 	(*fnpp)(pSys);
-    }
 
     /* Only PC98, i386 and powerpc builds define any arch-* flag here. */
 #if defined(PC98)
     ficlSetEnv(pSys, "arch-pc98",         FICL_TRUE);
 #elif defined(__i386__)
     ficlSetEnv(pSys, "arch-i386",         FICL_TRUE);
     ficlSetEnv(pSys, "arch-powerpc",      FICL_FALSE);
 #elif defined(__powerpc__)
     ficlSetEnv(pSys, "arch-i386",         FICL_FALSE);
     ficlSetEnv(pSys, "arch-powerpc",      FICL_TRUE);
 #endif
 
     return;
 }
Index: user/alc/PQ_LAUNDRY/sys/boot/forth/Makefile.inc
===================================================================
--- user/alc/PQ_LAUNDRY/sys/boot/forth/Makefile.inc	(revision 307895)
+++ user/alc/PQ_LAUNDRY/sys/boot/forth/Makefile.inc	(revision 307896)
@@ -1,25 +1,26 @@
 # $FreeBSD$
 
 FILES+=	beastie.4th
 FILES+=	brand.4th
 FILES+=	brand-fbsd.4th
 FILES+=	check-password.4th
 FILES+=	color.4th
 FILES+=	delay.4th
+FILES+=	efi.4th
 FILES+=	frames.4th
 FILES+=	loader.4th
 FILES+=	loader.conf
 FILES+=	loader.help
 FILES+=	logo-beastie.4th
 FILES+=	logo-beastiebw.4th
 FILES+=	logo-fbsdbw.4th
 FILES+=	logo-orb.4th
 FILES+=	logo-orbbw.4th
 FILES+=	menu.4th
 FILES+=	menu-commands.4th
 FILES+=	menusets.4th
 FILES+=	screen.4th
 FILES+=	shortcuts.4th
 FILES+=	support.4th
 FILES+=	version.4th
 FILESDIR_loader.conf=	/boot/defaults
Index: user/alc/PQ_LAUNDRY/sys/boot/forth/efi.4th
===================================================================
--- user/alc/PQ_LAUNDRY/sys/boot/forth/efi.4th	(nonexistent)
+++ user/alc/PQ_LAUNDRY/sys/boot/forth/efi.4th	(revision 307896)
@@ -0,0 +1,30 @@
+\ Copyright (c) 2016 Netflix, Inc
+\ All rights reserved.
+\
+\ Redistribution and use in source and binary forms, with or without
+\ modification, are permitted provided that the following conditions
+\ are met:
+\ 1. Redistributions of source code must retain the above copyright
+\    notice, this list of conditions and the following disclaimer.
+\ 2. Redistributions in binary form must reproduce the above copyright
+\    notice, this list of conditions and the following disclaimer in the
+\    documentation and/or other materials provided with the distribution.
+\
+\ THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+\ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+\ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+\ ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+\ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+\ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+\ OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+\ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+\ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+\ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+\ SUCH DAMAGE.
+\
+\ $FreeBSD$
+
+only forth definitions
+
+\ Place holder for more functions
+.( EFI boot environment) cr

Property changes on: user/alc/PQ_LAUNDRY/sys/boot/forth/efi.4th
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: user/alc/PQ_LAUNDRY/sys/boot/forth/loader.4th
===================================================================
--- user/alc/PQ_LAUNDRY/sys/boot/forth/loader.4th	(revision 307895)
+++ user/alc/PQ_LAUNDRY/sys/boot/forth/loader.4th	(revision 307896)
@@ -1,263 +1,266 @@
 \ Copyright (c) 1999 Daniel C. Sobral <dcs@FreeBSD.org>
 \ Copyright (c) 2011-2015 Devin Teske <dteske@FreeBSD.org>
 \ All rights reserved.
 \
 \ Redistribution and use in source and binary forms, with or without
 \ modification, are permitted provided that the following conditions
 \ are met:
 \ 1. Redistributions of source code must retain the above copyright
 \    notice, this list of conditions and the following disclaimer.
 \ 2. Redistributions in binary form must reproduce the above copyright
 \    notice, this list of conditions and the following disclaimer in the
 \    documentation and/or other materials provided with the distribution.
 \
 \ THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 \ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 \ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 \ ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 \ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 \ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 \ OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 \ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 \ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 \ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 \ SUCH DAMAGE.
 \
 \ $FreeBSD$
 
 only forth definitions
 
 \ When the arch-i386 environment flag exists and is true, require that
 \ loader_version is defined and is at least 11; abort otherwise.
 s" arch-i386" environment? [if] [if]
 	s" loader_version" environment?  [if]
 		11 < [if]
 			.( Loader version 1.1+ required) cr
 			abort
 		[then]
 	[else]
 		.( Could not get loader version!) cr
 		abort
 	[then]
 [then] [then]
 
 256 dictthreshold !  \ 256 cells minimum free space
 2048 dictincrease !  \ 2048 additional cells each time
 
 include /boot/support.4th
 include /boot/color.4th
 include /boot/delay.4th
 include /boot/check-password.4th
+s" efi-boot" environment? [if] [if]
+	include /boot/efi.4th
+[then] [then]
 
 only forth definitions
 
 \ ***** bootmsg
 \
 \	Prints "Booting..." followed by a newline. When loader_color?
 \	returns true, `7 fg 4 bg' is executed before the text and `me'
 \	after it (presumably to reset the attributes -- confirm in
 \	screen.4th).
 
 : bootmsg ( -- )
   loader_color? dup ( -- bool bool )
   if 7 fg 4 bg then
   ." Booting..."
   if me then
   cr
 ;
 
 \ ***** try-menu-unset
 \
 \	Executes menu-unset and menusets-unset if those words can be found,
 \	unless beastie_disable is set to "YES" (compared case-insensitively),
 \	in which case nothing is done.
 
 : try-menu-unset
   \ menu-unset may not be present
   s" beastie_disable" getenv
   dup -1 <> if \ variable is set: ( addr len ) is on the stack
     s" YES" compare-insensitive 0= if
       exit
     then
   else
     drop \ discard the -1 left by getenv
   then
   s" menu-unset"
   sfind if
     execute
   else
     drop
   then
   s" menusets-unset"
   sfind if
     execute
   else
     drop
   then
 ;
 
 only forth also support-functions also builtins definitions
 
 \ ***** boot
 \
 \	Replacement for the builtin boot command. When interpreted, the
 \	arguments are fetched with get_arguments. If an argument is given
 \	and its first character is not `-', the current kernel is unloaded
 \	and kernel and modules are loaded before booting. Otherwise the
 \	already loaded kernel (kernelname set) is booted directly, or kernel
 \	and modules are loaded first when none is loaded.
 \	NOTE(review): the exact stack contract of get_arguments and of the
 \	builtin `boot' is defined elsewhere -- confirm there.
 
 : boot
   0= if ( interpreted ) get_arguments then
 
   \ Unload only if a path was passed
   dup if
     >r over r> swap
     c@ [char] - <> if
       0 1 unload drop
     else
       s" kernelname" getenv? if ( a kernel has been loaded )
         try-menu-unset
         bootmsg 1 boot exit
       then
       load_kernel_and_modules
       ?dup if exit then \ non-zero result: give up
       try-menu-unset
       bootmsg 0 1 boot exit
     then
   else
     s" kernelname" getenv? if ( a kernel has been loaded )
       try-menu-unset
       bootmsg 1 boot exit
     then
     load_kernel_and_modules
     ?dup if exit then \ non-zero result: give up
     try-menu-unset
     bootmsg 0 1 boot exit
   then
   \ Only reached after the unload above (a path was passed)
   load_kernel_and_modules
   ?dup 0= if bootmsg 0 1 boot then
 ;
 
 \ ***** boot-conf
 \
 \	Prepares to boot as specified by loaded configuration files.
 \	Unloads whatever is currently loaded, runs load_kernel_and_modules
 \	and, when that leaves zero, invokes autoboot.
 
 : boot-conf
   0= if ( interpreted ) get_arguments then
   0 1 unload drop
   load_kernel_and_modules
   ?dup 0= if 0 1 autoboot then
 ;
 
 also forth definitions previous
 
 builtin: boot
 builtin: boot-conf
 
 only forth definitions also support-functions
 
 \ ***** start
 \
 \       Initializes support.4th global variables, sets loader_conf_files,
 \       processes conf files, and, if any one such file was successfully
 \       read to the end, loads kernel and modules.
 \       When loader_delay is set, the loading is deferred through
 \       delay_execute instead of being done immediately.
 
 : start  ( -- ) ( throws: abort & user-defined )
   s" /boot/defaults/loader.conf" initialize
   include_conf_files
   include_nextboot_file
   \ If the user defined a post-initialize hook, call it now
   s" post-initialize" sfind if execute else drop then
   \ Will *NOT* try to load kernel and modules if no configuration file
   \ was successfully loaded!
   any_conf_read? if
     s" loader_delay" getenv -1 = if
       \ loader_delay is unset: load right away
       load_xen_throw
       load_kernel
       load_modules
     else
       drop \ discard the address left over from getenv
       ." Loading Kernel and Modules (Ctrl-C to Abort)" cr
       s" also support-functions" evaluate
       s" set delay_command='load_xen_throw load_kernel load_modules'" evaluate
       s" set delay_showdots" evaluate
       delay_execute
     then
   then
 ;
 
 \ ***** initialize
 \
 \	Overrides support.4th initialization word with one that does
 \	everything start one does, short of loading the kernel and
 \	modules. Returns a flag.
 \	The flag is the result of any_conf_read?. The `initialize' used
 \	inside the body is the earlier definition, since the new one is
 \	not yet visible while it is being compiled.
 
 : initialize ( -- flag )
   s" /boot/defaults/loader.conf" initialize
   include_conf_files
   include_nextboot_file
   \ If the user defined a post-initialize hook, call it now
   s" post-initialize" sfind if execute else drop then
   any_conf_read?
 ;
 
 \ ***** read-conf
 \
 \	Read a configuration file, whose name was specified on the command
 \	line, if interpreted, or given on the stack, if compiled in.
 
 \ Run-time part: stores the given name with `conf_files string=' and
 \ then processes it with include_conf_files.
 : (read-conf)  ( addr len -- )
   conf_files string=
   include_conf_files \ Will recurse on new loader_conf_files definitions
 ;
 
 \ Immediate front end: when compiling it compiles a call to (read-conf);
 \ when interpreting it parses the next blank-delimited word as the file
 \ name and runs (read-conf) on it.
 : read-conf  ( <filename> | addr len -- ) ( throws: abort & user-defined )
   state @ if
     \ Compiling
     postpone (read-conf)
   else
     \ Interpreting
     bl parse (read-conf)
   then
 ; immediate
 
 \ show, enable, disable, toggle module loading. They all take module from
 \ the next word
 
 \ Stores val in the module's flag field, then prints the module name
 \ followed by " will be loaded" or " will not be loaded" according to the
 \ flag just stored.
 : set-module-flag ( module_addr val -- ) \ set and print flag
   over module.flag !
   dup module.name strtype
   module.flag @ if ."  will be loaded" else ."  will not be loaded" then cr
 ;
 
 \ Sets the load flag of the module located by find-module to true;
 \ does nothing when find-module leaves zero.
 : enable-module find-module ?dup if true set-module-flag then ;
 
 \ Sets the load flag of the module located by find-module to false;
 \ does nothing when find-module leaves zero.
 : disable-module find-module ?dup if false set-module-flag then ;
 
 \ Inverts the load flag of the module located by find-module (the new
 \ value is `flag 0='); does nothing when find-module leaves zero.
 : toggle-module find-module ?dup if dup module.flag @ 0= set-module-flag then ;
 
 \ ***** show-module
 \
 \	Show loading information about a module.
 \	The module is located with find-module; when that leaves zero,
 \	nothing is shown.
 
 : show-module ( <module> -- ) find-module ?dup if show-one-module then ;
 
 \ Words to be used inside configuration files
 
 \ Both words simply push a flag: retry pushes false, ignore pushes true.
 : retry false ;         \ For use in load error commands
 : ignore true ;         \ For use in load error commands
 
 \ Return to strict forth vocabulary
 
 \ #type ( addr len n -- )
 \ Types the string addr/len, then emits n - len spaces so that the
 \ output occupies n columns.
 : #type
   over - >r
   type
   r> spaces
 ;
 
 \ .? ( name-addr name-len desc-addr desc-len -- )
 \ Prints one help line: two spaces, the name padded to 15 columns with
 \ #type, two spaces, the description, and a newline.
 : .? 2 spaces 2swap 15 #type 2 spaces type cr ;
 
 \ Execute the ? command to print all the commands defined in
 \ C, then list the ones we support here. Please note that this
 \ doesn't use the pager_* routines that the C implementation of ?
 \ does, so these lines will always appear, even if you stop early
 \ there. They may also cause the commands to scroll off the
 \ screen if the number of commands modulo LINES is close
 \ to LINES.
 : ?
   ['] ? execute \ the earlier definition of ?; this one is not yet visible
   s" boot-conf" s" load kernel and modules, then autoboot" .?
   s" read-conf" s" read a configuration file" .?
   s" enable-module" s" enable loading of a module" .?
   s" disable-module" s" disable loading of a module" .?
   s" toggle-module" s" toggle loading of a module" .?
   s" show-module" s" show module load data" .?
   s" try-include" s" try to load/interpret files" .?
 ;
 
 \ Runs `include' under `catch'. If an exception is reported, the rest of
 \ the current input line is parsed and discarded; no error is propagated.
 : try-include ( -- ) \ see loader.4th(8)
   ['] include ( -- xt ) \ get the execution token of `include'
   catch ( xt -- exception# | 0 ) if \ failed
     LF parse ( c -- s-addr/u ) 2drop \ advance >in to EOL (drop data)
     \ ... prevents words unused by `include' from being interpreted
   then
 ; immediate \ interpret immediately for access to `source' (aka tib)
 
 only forth definitions
Index: user/alc/PQ_LAUNDRY/sys/contrib/rdma/krping/krping.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/contrib/rdma/krping/krping.c	(revision 307895)
+++ user/alc/PQ_LAUNDRY/sys/contrib/rdma/krping/krping.c	(revision 307896)
@@ -1,3347 +1,3347 @@
 /*
  * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
  * Copyright (c) 2006-2009 Open Grid Computing, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
  * General Public License (GPL) Version 2, available from the file
  * COPYING in the main directory of this source tree, or the
  * OpenIB.org BSD license below:
  *
  *     Redistribution and use in source and binary forms, with or
  *     without modification, are permitted provided that the following
  *     conditions are met:
  *
  *      - Redistributions of source code must retain the above
  *        copyright notice, this list of conditions and the following
  *        disclaimer.
  *
  *      - Redistributions in binary form must reproduce the above
  *        copyright notice, this list of conditions and the following
  *        disclaimer in the documentation and/or other materials
  *        provided with the distribution.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/slab.h>
 #include <linux/err.h>
 #include <linux/string.h>
 #include <linux/list.h>
 #include <linux/in.h>
 #include <linux/device.h>
 #include <linux/pci.h>
 #include <linux/sched.h>
 
 #include <asm/atomic.h>
 
 #include <rdma/ib_verbs.h>
 #include <rdma/rdma_cm.h>
 
 #include "krping.h"
 #include "getopt.h"
 
 extern int krping_debug;
 #define DEBUG_LOG(cb, x...) if (krping_debug) log(LOG_INFO, x)
 #define PRINTF(cb, x...) log(LOG_INFO, x)
 #define BIND_INFO 1
 
 MODULE_AUTHOR("Steve Wise");
 MODULE_DESCRIPTION("RDMA ping client/server");
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_VERSION(krping, 1);
 MODULE_DEPEND(krping, linuxkpi, 1, 1, 1);
 
 /*
  * Read the processor time-stamp counter with the rdtsc instruction and
  * return it as a single 64-bit value (EDX supplies the upper 32 bits,
  * EAX the lower 32 bits).
  */
 static __inline uint64_t
 get_cycles(void)
 {
 	uint32_t low, high;
 	__asm __volatile("rdtsc" : "=a" (low), "=d" (high));
 	return (low | ((u_int64_t)high << 32));
 }
 
 typedef uint64_t cycles_t;
 
 /*
  * Memory registration mode stored in krping_cb.mem; it selects which
  * lkeys and work-request setup krping_setup_wr() uses.
  */
 enum mem_type {
 	DMA = 1,
 	FASTREG = 2,
 	MW = 3,
 	MR = 4
 };
 
 /*
  * Option table: each entry is { name, argument type, single-character
  * id }.  The list is terminated by the all-zero { NULL, 0, 0 } entry.
  */
 static const struct krping_option krping_opts[] = {
 	{"count", OPT_INT, 'C'},
 	{"size", OPT_INT, 'S'},
 	{"addr", OPT_STRING, 'a'},
 	{"port", OPT_INT, 'p'},
 	{"verbose", OPT_NOPARAM, 'v'},
 	{"validate", OPT_NOPARAM, 'V'},
 	{"server", OPT_NOPARAM, 's'},
 	{"client", OPT_NOPARAM, 'c'},
 	{"mem_mode", OPT_STRING, 'm'},
 	{"server_inv", OPT_NOPARAM, 'I'},
  	{"wlat", OPT_NOPARAM, 'l'},
  	{"rlat", OPT_NOPARAM, 'L'},
  	{"bw", OPT_NOPARAM, 'B'},
  	{"duplex", OPT_NOPARAM, 'd'},
  	{"txdepth", OPT_INT, 'T'},
  	{"poll", OPT_NOPARAM, 'P'},
  	{"local_dma_lkey", OPT_NOPARAM, 'Z'},
  	{"read_inv", OPT_NOPARAM, 'R'},
  	{"fr", OPT_INT, 'f'},
 	{NULL, 0, 0}
 };
 
 /*
  * 64-bit host/network byte-order helpers.
  * NOTE(review): ntohll expands to cpu_to_be64, the same as htonll; the
  * conventional spelling would be be64_to_cpu -- presumably equivalent
  * because the swap is symmetric, but confirm.
  */
 #define htonll(x) cpu_to_be64((x))
 #define ntohll(x) cpu_to_be64((x))
 
 static struct mutex krping_mutex;
 
 /*
  * List of running krping threads.
  */
 static LIST_HEAD(krping_cbs);
 
 /*
  * krping "ping/pong" loop:
  * 	client sends source rkey/addr/len
  *	server receives source rkey/addr/len
  *	server rdma reads "ping" data from source
  * 	server sends "go ahead" on rdma read completion
  *	client sends sink rkey/addr/len
  * 	server receives sink rkey/addr/len
  * 	server rdma writes "pong" data to sink
  * 	server sends "go ahead" on rdma write completion
  * 	<repeat loop>
  */
 
 /*
  * These states are used to signal events between the completion handler
  * and the main client or server thread.
  *
  * Once CONNECTED, they cycle through RDMA_READ_ADV, RDMA_WRITE_ADV,
  * and RDMA_WRITE_COMPLETE for each ping.
  *
  * The numeric order matters: code in this file compares states with
  * <= and >= (e.g. "state >= CONNECTED"), and ERROR is the largest value.
  */
 enum test_state {
 	IDLE = 1,
 	CONNECT_REQUEST,
 	ADDR_RESOLVED,
 	ROUTE_RESOLVED,
 	CONNECTED,
 	RDMA_READ_ADV,
 	RDMA_READ_COMPLETE,
 	RDMA_WRITE_ADV,
 	RDMA_WRITE_COMPLETE,
 	ERROR
 };
 
 /*
  * Buffer advertisement exchanged between the peers.  server_recv()
  * converts the fields with ntohll()/ntohl() on receipt, so they are
  * carried in network byte order.
  */
 struct krping_rdma_info {
 	uint64_t buf;
 	uint32_t rkey;
 	uint32_t size;
 };
 
 /*
  * Default max buffer size for IO...
  *
  * NOTE(review): RPING_BUFSIZE expands to an unparenthesized 128*1024,
  * so it can combine unexpectedly with neighbouring operators (e.g.
  * x % RPING_BUFSIZE) -- check the uses.
  */
 #define RPING_BUFSIZE 128*1024
 #define RPING_SQ_DEPTH 64
 
 /*
  * Control block struct.
  *
  * One instance holds the state of a krping run: verbs objects, the
  * receive/send/rdma work requests with their buffers and DMA addresses,
  * the parsed options, and the connection-manager ids.
  */
 struct krping_cb {
 	void *cookie;
 	int server;			/* 0 iff client */
 	struct ib_cq *cq;
 	struct ib_pd *pd;
 	struct ib_qp *qp;
 
 	enum mem_type mem;
 	struct ib_mr *dma_mr;
 
 	/* fast-register (FASTREG) mode state */
 	struct ib_fast_reg_page_list *page_list;
 	int page_list_len;
 	struct ib_send_wr fastreg_wr;
 	struct ib_send_wr invalidate_wr;
 	struct ib_mr *fastreg_mr;
 	int server_invalidate;
 	int read_inv;
 	u8 key;
 
 	/* memory-window (MW) mode state */
 	struct ib_mw *mw;
 	struct ib_mw_bind bind_attr;
 
 	struct ib_recv_wr rq_wr;	/* recv work request record */
 	struct ib_sge recv_sgl;		/* recv single SGE */
 	struct krping_rdma_info recv_buf;/* malloc'd buffer */
 	u64 recv_dma_addr;
 	DECLARE_PCI_UNMAP_ADDR(recv_mapping)
 	struct ib_mr *recv_mr;
 
 	struct ib_send_wr sq_wr;	/* send work request record */
 	struct ib_sge send_sgl;
 	struct krping_rdma_info send_buf;/* single send buf */
 	u64 send_dma_addr;
 	DECLARE_PCI_UNMAP_ADDR(send_mapping)
 	struct ib_mr *send_mr;
 
 	struct ib_send_wr rdma_sq_wr;	/* rdma work request record */
 	struct ib_sge rdma_sgl;		/* rdma single SGE */
 	char *rdma_buf;			/* used as rdma sink */
 	u64  rdma_dma_addr;
 	DECLARE_PCI_UNMAP_ADDR(rdma_mapping)
 	struct ib_mr *rdma_mr;
 
 	uint32_t remote_rkey;		/* remote guys RKEY */
 	uint64_t remote_addr;		/* remote guys TO */
 	uint32_t remote_len;		/* remote guys LEN */
 
 	char *start_buf;		/* rdma read src */
 	u64  start_dma_addr;
 	DECLARE_PCI_UNMAP_ADDR(start_mapping)
 	struct ib_mr *start_mr;
 
 	enum test_state state;		/* used for cond/signalling */
 	wait_queue_head_t sem;
 	struct krping_stats stats;
 
 	uint16_t port;			/* dst port in NBO */
 	struct in_addr addr;		/* dst addr in NBO */
 	char *addr_str;			/* dst addr string */
 	int verbose;			/* verbose logging */
 	int count;			/* ping count */
 	int size;			/* ping data size */
 	int validate;			/* validate ping data */
 	int wlat;			/* run wlat test */
 	int rlat;			/* run rlat test */
 	int bw;				/* run bw test */
 	int duplex;			/* run bw full duplex test */
 	int poll;			/* poll or block for rlat test */
 	int txdepth;			/* SQ depth */
 	int local_dma_lkey;		/* use 0 for lkey */
 	int frtest;			/* fastreg test */
 	int testnum;
 
 	/* CM stuff */
 	struct rdma_cm_id *cm_id;	/* connection on client side,*/
 					/* listener on server side. */
 	struct rdma_cm_id *child_cm_id;	/* connection on server side */
 	struct list_head list;
 };
 
 /*
  * RDMA connection-manager event callback.
  *
  * Translates CM events into cb->state transitions and wakes any thread
  * sleeping on cb->sem.  cb is taken from cma_id->context.  The function
  * always returns 0, whatever the event.
  */
 static int krping_cma_event_handler(struct rdma_cm_id *cma_id,
 				   struct rdma_cm_event *event)
 {
 	int ret;
 	struct krping_cb *cb = cma_id->context;
 
 	DEBUG_LOG(cb, "cma_event type %d cma_id %p (%s)\n", event->event,
 	    cma_id, (cma_id == cb->cm_id) ? "parent" : "child");
 
 	switch (event->event) {
 	case RDMA_CM_EVENT_ADDR_RESOLVED:
 		cb->state = ADDR_RESOLVED;
 		ret = rdma_resolve_route(cma_id, 2000);
 		if (ret) {
 			/*
 			 * NOTE(review): the waiter is woken but the state
 			 * stays ADDR_RESOLVED rather than ERROR -- confirm
 			 * that the waiter copes with this.
 			 */
 			PRINTF(cb, "rdma_resolve_route error %d\n", ret);
 			wake_up_interruptible(&cb->sem);
 		}
 		break;
 
 	case RDMA_CM_EVENT_ROUTE_RESOLVED:
 		cb->state = ROUTE_RESOLVED;
 		cb->child_cm_id = cma_id;
 		wake_up_interruptible(&cb->sem);
 		break;
 
 	case RDMA_CM_EVENT_CONNECT_REQUEST:
 		/* Only accepted from IDLE; the waiter is woken either way. */
 		if (cb->state == IDLE) {
 			cb->state = CONNECT_REQUEST;
 			cb->child_cm_id = cma_id;
 		} else {
 			PRINTF(cb, "Received connection request in wrong state"
 			    " (%d)\n", cb->state);
 		}
 		DEBUG_LOG(cb, "child cma %p\n", cb->child_cm_id);
 		wake_up_interruptible(&cb->sem);
 		break;
 
 	case RDMA_CM_EVENT_ESTABLISHED:
 		DEBUG_LOG(cb, "ESTABLISHED\n");
 		/* Only the client side moves to CONNECTED here. */
 		if (!cb->server) {
 			cb->state = CONNECTED;
 		}
 		wake_up_interruptible(&cb->sem);
 		break;
 
 	case RDMA_CM_EVENT_ADDR_ERROR:
 	case RDMA_CM_EVENT_ROUTE_ERROR:
 	case RDMA_CM_EVENT_CONNECT_ERROR:
 	case RDMA_CM_EVENT_UNREACHABLE:
 	case RDMA_CM_EVENT_REJECTED:
 		PRINTF(cb, "cma event %d, error %d\n", event->event,
 		       event->status);
 		cb->state = ERROR;
 		wake_up_interruptible(&cb->sem);
 		break;
 
 	case RDMA_CM_EVENT_DISCONNECTED:
 		PRINTF(cb, "DISCONNECT EVENT...\n");
 		cb->state = ERROR;
 		wake_up_interruptible(&cb->sem);
 		break;
 
 	case RDMA_CM_EVENT_DEVICE_REMOVAL:
 		/* Logged only: no state change and no wakeup. */
 		PRINTF(cb, "cma detected device removal!!!!\n");
 		break;
 
 	default:
 		PRINTF(cb, "oof bad type!\n");
 		wake_up_interruptible(&cb->sem);
 		break;
 	}
 	return 0;
 }
 
 /*
  * Handle a receive completion carrying the peer's buffer advertisement.
  *
  * Returns -1 if the completion length differs from sizeof(cb->recv_buf);
  * otherwise stores the byte-swapped rkey/addr/len in cb, advances
  * cb->state and returns 0.
  */
 static int server_recv(struct krping_cb *cb, struct ib_wc *wc)
 {
 	if (wc->byte_len != sizeof(cb->recv_buf)) {
 		PRINTF(cb, "Received bogus data, size %d\n", 
 		       wc->byte_len);
 		return -1;
 	}
 
 	cb->remote_rkey = ntohl(cb->recv_buf.rkey);
 	cb->remote_addr = ntohll(cb->recv_buf.buf);
 	cb->remote_len  = ntohl(cb->recv_buf.size);
 	DEBUG_LOG(cb, "Received rkey %x addr %llx len %d from peer\n",
 		  cb->remote_rkey, (unsigned long long)cb->remote_addr, 
 		  cb->remote_len);
 
 	/*
 	 * First advertisement of a ping (or the one after a completed
 	 * write) is the read source; any other one is the write sink.
 	 */
 	if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE)
 		cb->state = RDMA_READ_ADV;
 	else
 		cb->state = RDMA_WRITE_ADV;
 
 	return 0;
 }
 
 /*
  * Handle a receive completion on the client side.
  *
  * Returns -1 if the completion length differs from sizeof(cb->recv_buf).
  * Otherwise only the state is advanced (RDMA_READ_ADV becomes
  * RDMA_WRITE_ADV, anything else becomes RDMA_WRITE_COMPLETE); the
  * received payload itself is not examined.  Returns 0 in that case.
  */
 static int client_recv(struct krping_cb *cb, struct ib_wc *wc)
 {
 	if (wc->byte_len != sizeof(cb->recv_buf)) {
 		PRINTF(cb, "Received bogus data, size %d\n", 
 		       wc->byte_len);
 		return -1;
 	}
 
 	if (cb->state == RDMA_READ_ADV)
 		cb->state = RDMA_WRITE_ADV;
 	else
 		cb->state = RDMA_WRITE_COMPLETE;
 
 	return 0;
 }
 
 /*
  * Completion-queue callback.
  *
  * Drains cb->cq one work completion at a time, updates the statistics in
  * cb->stats, advances cb->state and wakes the thread sleeping on cb->sem.
  * Any failure other than a flushed work request sets cb->state to ERROR
  * and wakes the waiter.  Nothing is processed when the state is already
  * ERROR on entry.
  */
 static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
 {
 	struct krping_cb *cb = ctx;
 	struct ib_wc wc;
 	struct ib_recv_wr *bad_wr;
 	int ret;
 
 	BUG_ON(cb->cq != cq);
 	if (cb->state == ERROR) {
 		PRINTF(cb, "cq completion in ERROR state\n");
 		return;
 	}
 	/* Re-arm notification only for the plain ping/pong test. */
 	if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest)
 		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) {
 		if (wc.status) {
 			if (wc.status == IB_WC_WR_FLUSH_ERR) {
 				/* Flushed work requests are skipped, not fatal. */
 				DEBUG_LOG(cb, "cq flushed\n");
 				continue;
 			} else {
 				PRINTF(cb, "cq completion failed with "
 				       "wr_id %jx status %d opcode %d vender_err %x\n",
 					(uintmax_t)wc.wr_id, wc.status, wc.opcode, wc.vendor_err);
 				goto error;
 			}
 		}
 
 		switch (wc.opcode) {
 		case IB_WC_SEND:
 			DEBUG_LOG(cb, "send completion\n");
 			cb->stats.send_bytes += cb->send_sgl.length;
 			cb->stats.send_msgs++;
 			break;
 
 		case IB_WC_RDMA_WRITE:
 			DEBUG_LOG(cb, "rdma write completion\n");
 			cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length;
 			cb->stats.write_msgs++;
 			cb->state = RDMA_WRITE_COMPLETE;
 			wake_up_interruptible(&cb->sem);
 			break;
 
 		case IB_WC_RDMA_READ:
 			DEBUG_LOG(cb, "rdma read completion\n");
 			cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length;
 			cb->stats.read_msgs++;
 			cb->state = RDMA_READ_COMPLETE;
 			wake_up_interruptible(&cb->sem);
 			break;
 
 		case IB_WC_RECV:
 			DEBUG_LOG(cb, "recv completion\n");
 			cb->stats.recv_bytes += sizeof(cb->recv_buf);
 			cb->stats.recv_msgs++;
 			/* The wlat/rlat/bw/fr tests use server_recv on both sides. */
 			if (cb->wlat || cb->rlat || cb->bw || cb->frtest)
 				ret = server_recv(cb, &wc);
 			else
 				ret = cb->server ? server_recv(cb, &wc) :
 						   client_recv(cb, &wc);
 			if (ret) {
 				PRINTF(cb, "recv wc error: %d\n", ret);
 				goto error;
 			}
 
 			/* Repost the receive so the next message can land. */
 			ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
 			if (ret) {
 				PRINTF(cb, "post recv error: %d\n", 
 				       ret);
 				goto error;
 			}
 			wake_up_interruptible(&cb->sem);
 			break;
 
 		default:
 			PRINTF(cb, 
 			       "%s:%d Unexpected opcode %d, Shutting down\n",
 			       __func__, __LINE__, wc.opcode);
 			goto error;
 		}
 	}
 	/* The loop ends when ib_poll_cq() returns something other than 1. */
 	if (ret) {
 		PRINTF(cb, "poll error %d\n", ret);
 		goto error;
 	}
 	return;
 error:
 	cb->state = ERROR;
 	wake_up_interruptible(&cb->sem);
 }
 
 /*
  * Accept the pending connection request on cb->child_cm_id.
  *
  * Returns the rdma_accept() error code on failure.  For the plain
  * ping/pong test it then sleeps until cb->state reaches at least
  * CONNECTED and returns -1 if that state is ERROR.  Returns 0 otherwise.
  */
 static int krping_accept(struct krping_cb *cb)
 {
 	struct rdma_conn_param conn_param;
 	int ret;
 
 	DEBUG_LOG(cb, "accepting client connection request\n");
 
 	memset(&conn_param, 0, sizeof conn_param);
 	conn_param.responder_resources = 1;
 	conn_param.initiator_depth = 1;
 
 	ret = rdma_accept(cb->child_cm_id, &conn_param);
 	if (ret) {
 		PRINTF(cb, "rdma_accept error: %d\n", ret);
 		return ret;
 	}
 
 	if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) {
 		/* ERROR is the largest state, so it also ends this wait. */
 		wait_event_interruptible(cb->sem, cb->state >= CONNECTED);
 		if (cb->state == ERROR) {
 			PRINTF(cb, "wait for CONNECTED state %d\n", 
 				cb->state);
 			return -1;
 		}
 	}
 	return 0;
 }
 
 /*
  * Fill in the receive, send and rdma work requests and their
  * scatter/gather entries in cb, using the DMA addresses already stored
  * in cb.  The lkey for the recv/send SGEs is chosen from the device's
  * local_dma_lkey, the DMA MR, or the per-buffer MR, in that priority.
  * Finally performs the extra setup needed for FASTREG and MW modes.
  */
 static void krping_setup_wr(struct krping_cb *cb)
 {
 	cb->recv_sgl.addr = cb->recv_dma_addr;
 	cb->recv_sgl.length = sizeof cb->recv_buf;
 	if (cb->local_dma_lkey)
 		cb->recv_sgl.lkey = cb->qp->device->local_dma_lkey;
 	else if (cb->mem == DMA)
 		cb->recv_sgl.lkey = cb->dma_mr->lkey;
 	else
 		cb->recv_sgl.lkey = cb->recv_mr->lkey;
 	cb->rq_wr.sg_list = &cb->recv_sgl;
 	cb->rq_wr.num_sge = 1;
 
 	cb->send_sgl.addr = cb->send_dma_addr;
 	cb->send_sgl.length = sizeof cb->send_buf;
 	if (cb->local_dma_lkey)
 		cb->send_sgl.lkey = cb->qp->device->local_dma_lkey;
 	else if (cb->mem == DMA)
 		cb->send_sgl.lkey = cb->dma_mr->lkey;
 	else
 		cb->send_sgl.lkey = cb->send_mr->lkey;
 
 	cb->sq_wr.opcode = IB_WR_SEND;
 	cb->sq_wr.send_flags = IB_SEND_SIGNALED;
 	cb->sq_wr.sg_list = &cb->send_sgl;
 	cb->sq_wr.num_sge = 1;
 
 	if (cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) {
 		cb->rdma_sgl.addr = cb->rdma_dma_addr;
 		/* For modes other than MR the rdma lkey is not set here. */
 		if (cb->mem == MR)
 			cb->rdma_sgl.lkey = cb->rdma_mr->lkey;
 		cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED;
 		cb->rdma_sq_wr.sg_list = &cb->rdma_sgl;
 		cb->rdma_sq_wr.num_sge = 1;
 	}
 
 	switch(cb->mem) {
 	case FASTREG:
 
 		/* 
 		 * A chain of 2 WRs, INVALIDATE_MR + FAST_REG_MR.
 		 * both unsignaled.  The client uses them to reregister
 		 * the rdma buffers with a new key each iteration.
 		 */
 		cb->fastreg_wr.opcode = IB_WR_FAST_REG_MR;
 		cb->fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
 		cb->fastreg_wr.wr.fast_reg.length = cb->size;
 		cb->fastreg_wr.wr.fast_reg.page_list = cb->page_list;
 		cb->fastreg_wr.wr.fast_reg.page_list_len = cb->page_list_len;
 
 		cb->invalidate_wr.next = &cb->fastreg_wr;
 		cb->invalidate_wr.opcode = IB_WR_LOCAL_INV;
 		break;
 	case MW:
 		cb->bind_attr.wr_id = 0xabbaabba;
 		cb->bind_attr.send_flags = 0; /* unsignaled */
 		/* BIND_INFO is defined to 1 near the top of this file. */
 #ifdef BIND_INFO
 		cb->bind_attr.bind_info.length = cb->size;
 #else
 		cb->bind_attr.length = cb->size;
 #endif
 		break;
 	default:
 		break;
 	}
 }
 
 /*
  * Allocate, DMA-map and register every buffer the test needs, then
  * build the work requests via krping_setup_wr().
  *
  * The receive and send message buffers embedded in cb are always mapped.
  * In DMA mode a single DMA MR is obtained; otherwise, unless
  * cb->local_dma_lkey is set, each message buffer gets its own physical
  * MR.  The rdma buffer (cb->size bytes) is then allocated, mapped and
  * prepared according to cb->mem (FASTREG, MW or MR).  Clients, and any
  * side running the wlat/rlat/bw/frtest tests, additionally get a start
  * buffer of the same size.
  *
  * Returns 0 on success or a negative errno value on failure.
  *
  * NOTE(review): the DMA mapping results are not checked for errors, and
  * the bail path neither unmaps the DMA mappings nor clears the pointers
  * it frees -- confirm callers do not call krping_free_buffers() after a
  * failure here.
  */
 static int krping_setup_buffers(struct krping_cb *cb)
 {
 	int ret;
 	struct ib_phys_buf buf;
 	u64 iovbase;
 
 	DEBUG_LOG(cb, "krping_setup_buffers called on cb %p\n", cb);
 
 	/* Map the fixed-size receive and send message buffers. */
-	cb->recv_dma_addr = dma_map_single(cb->pd->device->dma_device, 
+	cb->recv_dma_addr = ib_dma_map_single(cb->pd->device, 
 				   &cb->recv_buf, 
 				   sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
 	pci_unmap_addr_set(cb, recv_mapping, cb->recv_dma_addr);
-	cb->send_dma_addr = dma_map_single(cb->pd->device->dma_device, 
+	cb->send_dma_addr = ib_dma_map_single(cb->pd->device, 
 					   &cb->send_buf, sizeof(cb->send_buf),
 					   DMA_BIDIRECTIONAL);
 	pci_unmap_addr_set(cb, send_mapping, cb->send_dma_addr);
 
 	if (cb->mem == DMA) {
 		/* One DMA MR covers every buffer in this mode. */
 		cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE|
 					   IB_ACCESS_REMOTE_READ|
 				           IB_ACCESS_REMOTE_WRITE);
 		if (IS_ERR(cb->dma_mr)) {
 			DEBUG_LOG(cb, "reg_dmamr failed\n");
 			ret = PTR_ERR(cb->dma_mr);
 			goto bail;
 		}
 	} else {
 		/*
 		 * Register the message buffers individually unless the
 		 * local DMA lkey is going to be used for them.
 		 */
 		if (!cb->local_dma_lkey) {
 			buf.addr = cb->recv_dma_addr;
 			buf.size = sizeof cb->recv_buf;
 			DEBUG_LOG(cb, "recv buf dma_addr %jx size %d\n",
 			    (uintmax_t)buf.addr, (int)buf.size);
 			iovbase = cb->recv_dma_addr;
 			cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
 						     IB_ACCESS_LOCAL_WRITE, 
 						     &iovbase);
 
 			if (IS_ERR(cb->recv_mr)) {
 				DEBUG_LOG(cb, "recv_buf reg_mr failed\n");
 				ret = PTR_ERR(cb->recv_mr);
 				goto bail;
 			}
 
 			buf.addr = cb->send_dma_addr;
 			buf.size = sizeof cb->send_buf;
 			DEBUG_LOG(cb, "send buf dma_addr %jx size %d\n",
 			    (uintmax_t)buf.addr, (int)buf.size);
 			iovbase = cb->send_dma_addr;
 			cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
 						     0, &iovbase);
 
 			if (IS_ERR(cb->send_mr)) {
 				DEBUG_LOG(cb, "send_buf reg_mr failed\n");
 				ret = PTR_ERR(cb->send_mr);
 				goto bail;
 			}
 		}
 	}
 
 	/* The rdma buffer is the local end of RDMA reads and writes. */
 	cb->rdma_buf = kmalloc(cb->size, GFP_KERNEL);
 	if (!cb->rdma_buf) {
 		DEBUG_LOG(cb, "rdma_buf malloc failed\n");
 		ret = -ENOMEM;
 		goto bail;
 	}
 
-	cb->rdma_dma_addr = dma_map_single(cb->pd->device->dma_device, 
+	cb->rdma_dma_addr = ib_dma_map_single(cb->pd->device, 
 			       cb->rdma_buf, cb->size, 
 			       DMA_BIDIRECTIONAL);
 	pci_unmap_addr_set(cb, rdma_mapping, cb->rdma_dma_addr);
 	if (cb->mem != DMA) {
 		switch (cb->mem) {
 		case FASTREG:
 			/* Page count is derived from cb->size. */
 			cb->page_list_len = (((cb->size - 1) & PAGE_MASK) +
 				PAGE_SIZE) >> PAGE_SHIFT;
 			cb->page_list = ib_alloc_fast_reg_page_list(
 						cb->pd->device, 
 						cb->page_list_len);
 			if (IS_ERR(cb->page_list)) {
 				/* NOTE(review): message text names recv_buf, but this is the page list. */
 				DEBUG_LOG(cb, "recv_buf reg_mr failed\n");
 				ret = PTR_ERR(cb->page_list);
 				goto bail;
 			}
 			cb->fastreg_mr = ib_alloc_fast_reg_mr(cb->pd, 
 					cb->page_list->max_page_list_len);
 			if (IS_ERR(cb->fastreg_mr)) {
 				/* NOTE(review): message text names recv_buf, but this is the fastreg MR. */
 				DEBUG_LOG(cb, "recv_buf reg_mr failed\n");
 				ret = PTR_ERR(cb->fastreg_mr);
 				goto bail;
 			}
 			DEBUG_LOG(cb, "fastreg rkey 0x%x page_list %p"
 				" page_list_len %u\n", cb->fastreg_mr->rkey, 
 				cb->page_list, cb->page_list_len);
 			break;
 		case MW:
 			cb->mw = ib_alloc_mw(cb->pd,IB_MW_TYPE_1);
 			if (IS_ERR(cb->mw)) {
 				DEBUG_LOG(cb, "recv_buf alloc_mw failed\n");
 				ret = PTR_ERR(cb->mw);
 				goto bail;
 			}
 			DEBUG_LOG(cb, "mw rkey 0x%x\n", cb->mw->rkey);
 			/* MW mode also needs the rdma buffer MR registered below. */
 			/*FALLTHROUGH*/
 		case MR:
 			buf.addr = cb->rdma_dma_addr;
 			buf.size = cb->size;
 			iovbase = cb->rdma_dma_addr;
 			cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
 						IB_ACCESS_LOCAL_WRITE|
 					     IB_ACCESS_REMOTE_READ| 
 					     IB_ACCESS_REMOTE_WRITE, 
 					     &iovbase);
 			if (IS_ERR(cb->rdma_mr)) {
 				DEBUG_LOG(cb, "rdma_buf reg_mr failed\n");
 				ret = PTR_ERR(cb->rdma_mr);
 				goto bail;
 			}
 			DEBUG_LOG(cb, "rdma buf dma_addr %jx size %d mr rkey 0x%x\n",
 				(uintmax_t)buf.addr, (int)buf.size, cb->rdma_mr->rkey);
 			break;
 		default:
 			ret = -EINVAL;
 			goto bail;
 			break;
 		}
 	}
 
 	/* Start buffer: needed by clients and by both sides of the perf tests. */
 	if (!cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) {
 
 		cb->start_buf = kmalloc(cb->size, GFP_KERNEL);
 		if (!cb->start_buf) {
 			DEBUG_LOG(cb, "start_buf malloc failed\n");
 			ret = -ENOMEM;
 			goto bail;
 		}
 
-		cb->start_dma_addr = dma_map_single(cb->pd->device->dma_device, 
+		cb->start_dma_addr = ib_dma_map_single(cb->pd->device, 
 						   cb->start_buf, cb->size, 
 						   DMA_BIDIRECTIONAL);
 		pci_unmap_addr_set(cb, start_mapping, cb->start_dma_addr);
 
 		if (cb->mem == MR || cb->mem == MW) {
 			unsigned flags = IB_ACCESS_REMOTE_READ;
 
 			/* The perf tests also write into the start buffer. */
 			if (cb->wlat || cb->rlat || cb->bw || cb->frtest) {
 				flags |= IB_ACCESS_LOCAL_WRITE |
 					IB_ACCESS_REMOTE_WRITE;
 			}
 
 			buf.addr = cb->start_dma_addr;
 			buf.size = cb->size;
 			DEBUG_LOG(cb, "start buf dma_addr %jx size %d\n",
 				(uintmax_t)buf.addr, (int)buf.size);
 			iovbase = cb->start_dma_addr;
 			cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
 					     flags,
 					     &iovbase);
 
 			if (IS_ERR(cb->start_mr)) {
 				DEBUG_LOG(cb, "start_buf reg_mr failed\n");
 				ret = PTR_ERR(cb->start_mr);
 				goto bail;
 			}
 		}
 	}
 
 	krping_setup_wr(cb);
 	DEBUG_LOG(cb, "allocated & registered buffers...\n");
 	return 0;
 bail:
 	/*
 	 * Release whatever was successfully created; each pointer is
 	 * skipped when it is NULL or holds an ERR_PTR value.
 	 */
 	if (cb->fastreg_mr && !IS_ERR(cb->fastreg_mr))
 		ib_dereg_mr(cb->fastreg_mr);
 	if (cb->mw && !IS_ERR(cb->mw))
 		ib_dealloc_mw(cb->mw);
 	if (cb->rdma_mr && !IS_ERR(cb->rdma_mr))
 		ib_dereg_mr(cb->rdma_mr);
 	if (cb->page_list && !IS_ERR(cb->page_list))
 		ib_free_fast_reg_page_list(cb->page_list);
 	if (cb->dma_mr && !IS_ERR(cb->dma_mr))
 		ib_dereg_mr(cb->dma_mr);
 	if (cb->recv_mr && !IS_ERR(cb->recv_mr))
 		ib_dereg_mr(cb->recv_mr);
 	if (cb->send_mr && !IS_ERR(cb->send_mr))
 		ib_dereg_mr(cb->send_mr);
 	if (cb->rdma_buf)
 		kfree(cb->rdma_buf);
 	if (cb->start_buf)
 		kfree(cb->start_buf);
 	return ret;
 }
 
 /*
  * Undo krping_setup_buffers(): deregister every memory region and the
  * memory window that is set, unmap the receive, send and rdma buffers,
  * and free the rdma and start buffers.
  *
  * The buffers are mapped with ib_dma_map_single(), so they are unmapped
  * with the matching ib_dma_unmap_single() rather than by calling
  * dma_unmap_single() on the underlying dma_device directly.
  *
  * NOTE(review): cb->page_list (FASTREG mode) is not released here --
  * confirm it is freed elsewhere.
  */
 static void krping_free_buffers(struct krping_cb *cb)
 {
 	DEBUG_LOG(cb, "krping_free_buffers called on cb %p\n", cb);
 
 	if (cb->dma_mr)
 		ib_dereg_mr(cb->dma_mr);
 	if (cb->send_mr)
 		ib_dereg_mr(cb->send_mr);
 	if (cb->recv_mr)
 		ib_dereg_mr(cb->recv_mr);
 	if (cb->rdma_mr)
 		ib_dereg_mr(cb->rdma_mr);
 	if (cb->start_mr)
 		ib_dereg_mr(cb->start_mr);
 	if (cb->fastreg_mr)
 		ib_dereg_mr(cb->fastreg_mr);
 	if (cb->mw)
 		ib_dealloc_mw(cb->mw);
 
 	ib_dma_unmap_single(cb->pd->device,
 			 pci_unmap_addr(cb, recv_mapping),
 			 sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
 	ib_dma_unmap_single(cb->pd->device,
 			 pci_unmap_addr(cb, send_mapping),
 			 sizeof(cb->send_buf), DMA_BIDIRECTIONAL);
 	ib_dma_unmap_single(cb->pd->device,
 			 pci_unmap_addr(cb, rdma_mapping),
 			 cb->size, DMA_BIDIRECTIONAL);
 	kfree(cb->rdma_buf);
 	/* The start buffer only exists for clients and the perf tests. */
 	if (cb->start_buf) {
 		ib_dma_unmap_single(cb->pd->device,
 			 pci_unmap_addr(cb, start_mapping),
 			 cb->size, DMA_BIDIRECTIONAL);
 		kfree(cb->start_buf);
 	}
 }
 
 static int krping_create_qp(struct krping_cb *cb)
 {
 	struct ib_qp_init_attr init_attr;
 	int ret;
 
 	memset(&init_attr, 0, sizeof(init_attr));
 	init_attr.cap.max_send_wr = cb->txdepth;
 	init_attr.cap.max_recv_wr = 2;
 	init_attr.cap.max_recv_sge = 1;
 	init_attr.cap.max_send_sge = 1;
 	init_attr.qp_type = IB_QPT_RC;
 	init_attr.send_cq = cb->cq;
 	init_attr.recv_cq = cb->cq;
 	init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
 
 	if (cb->server) {
 		ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr);
 		if (!ret)
 			cb->qp = cb->child_cm_id->qp;
 	} else {
 		ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr);
 		if (!ret)
 			cb->qp = cb->cm_id->qp;
 	}
 
 	return ret;
 }
 
 /*
  * Tear down the verbs objects held by cb: the queue pair first, then
  * the completion queue, then the protection domain.
  */
 static void krping_free_qp(struct krping_cb *cb)
 {
 	ib_destroy_qp(cb->qp);
 	ib_destroy_cq(cb->cq);
 	ib_dealloc_pd(cb->pd);
 }
 
 /*
  * Allocate the protection domain, completion queue and queue pair for
  * a connection on cm_id's device.
  *
  * The CQ is sized at twice cb->txdepth.  Completion notification is
  * armed up front except for the wlat/rlat/bw/frtest modes.  The device
  * name is copied into cb->stats.name.
  *
  * Returns 0 on success or a negative errno value; on failure the CQ
  * and PD created so far are destroyed again.
  */
 static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id)
 {
 	int ret;
 	cb->pd = ib_alloc_pd(cm_id->device);
 	if (IS_ERR(cb->pd)) {
 		PRINTF(cb, "ib_alloc_pd failed\n");
 		return PTR_ERR(cb->pd);
 	}
 	DEBUG_LOG(cb, "created pd %p\n", cb->pd);
 
 	strlcpy(cb->stats.name, cb->pd->device->name, sizeof(cb->stats.name));
 
 	cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL,
 			      cb, cb->txdepth * 2, 0);
 	if (IS_ERR(cb->cq)) {
 		PRINTF(cb, "ib_create_cq failed\n");
 		ret = PTR_ERR(cb->cq);
 		goto err1;
 	}
 	DEBUG_LOG(cb, "created cq %p\n", cb->cq);
 
 	if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) {
 		ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
 		if (ret) {
 			/* Report the call that actually failed. */
 			PRINTF(cb, "ib_req_notify_cq failed\n");
 			goto err2;
 		}
 	}
 
 	ret = krping_create_qp(cb);
 	if (ret) {
 		PRINTF(cb, "krping_create_qp failed: %d\n", ret);
 		goto err2;
 	}
 	DEBUG_LOG(cb, "created qp %p\n", cb->qp);
 	return 0;
 err2:
 	ib_destroy_cq(cb->cq);
 err1:
 	ib_dealloc_pd(cb->pd);
 	return ret;
 }
 
 /*
  * Return the (possibly rebound) rkey for the rdma buffer at DMA
  * address buf.
  *
  * FASTREG mode: bump the fastreg key, refill the page list starting at
  *   the page containing buf and post either the LOCAL_INV + FAST_REG
  *   chain (post_inv != 0) or the FAST_REG WR alone.
  * MW mode: rebind the memory window to buf.
  * MR mode: return the rkey of the start or rdma MR, depending on buf.
  * DMA mode: return the DMA MR rkey.
  *
  * On a bind failure or an unknown mode cb->state is set to ERROR and
  * 0xffffffff is returned.  A failed post in FASTREG mode also sets
  * cb->state to ERROR but still returns the fastreg MR's rkey.
  */
 static u32 krping_rdma_rkey(struct krping_cb *cb, u64 buf, int post_inv)
 {
 	u32 rkey = 0xffffffff;
 	u64 p;
 	struct ib_send_wr *bad_wr;
 	struct ib_mr *bound_mr;
 	int i;
 	int ret;
 
 	switch (cb->mem) {
 	case FASTREG:
 		/* The invalidate WR must name the key that is currently live. */
 		cb->invalidate_wr.ex.invalidate_rkey = cb->fastreg_mr->rkey;
 
 		/*
 		 * Update the fastreg key.
 		 */
 		ib_update_fast_reg_key(cb->fastreg_mr, ++cb->key);
 		cb->fastreg_wr.wr.fast_reg.rkey = cb->fastreg_mr->rkey;
 
 		/*
 		 * Update the fastreg WR with new buf info.
 		 */
 		if (buf == (u64)cb->start_dma_addr)
 			cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_READ;
 		else
 			cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
 		cb->fastreg_wr.wr.fast_reg.iova_start = buf;
 		p = (u64)(buf & PAGE_MASK);
 		for (i=0; i < cb->fastreg_wr.wr.fast_reg.page_list_len; 
 		     i++, p += PAGE_SIZE) {
 			cb->page_list->page_list[i] = p;
 			DEBUG_LOG(cb, "page_list[%d] 0x%jx\n", i, (uintmax_t)p);
 		}
 
 		DEBUG_LOG(cb, "post_inv = %d, fastreg new rkey 0x%x shift %u len %u"
 			" iova_start %jx page_list_len %u\n",
 			post_inv,
 			cb->fastreg_wr.wr.fast_reg.rkey,
 			cb->fastreg_wr.wr.fast_reg.page_shift,
 			(unsigned)cb->fastreg_wr.wr.fast_reg.length,
 			(uintmax_t)cb->fastreg_wr.wr.fast_reg.iova_start,
 			cb->fastreg_wr.wr.fast_reg.page_list_len);
 
 		if (post_inv)
 			ret = ib_post_send(cb->qp, &cb->invalidate_wr, &bad_wr);
 		else
 			ret = ib_post_send(cb->qp, &cb->fastreg_wr, &bad_wr);
 		if (ret) {
 			PRINTF(cb, "post send error %d\n", ret);
 			cb->state = ERROR;
 		}
 		rkey = cb->fastreg_mr->rkey;
 		break;
 	case MW:
 		/*
 		 * Update the MW with new buf info.
 		 */
 		if (buf == (u64)cb->start_dma_addr) {
 #ifdef BIND_INFO
 			cb->bind_attr.bind_info.mw_access_flags = IB_ACCESS_REMOTE_READ;
 			cb->bind_attr.bind_info.mr = cb->start_mr;
 #else
 			cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_READ;
 			cb->bind_attr.mr = cb->start_mr;
 #endif
 		} else {
 #ifdef BIND_INFO
 			cb->bind_attr.bind_info.mw_access_flags = IB_ACCESS_REMOTE_WRITE;
 			cb->bind_attr.bind_info.mr = cb->rdma_mr;
 #else
 			cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_WRITE;
 			cb->bind_attr.mr = cb->rdma_mr;
 #endif
 		}
 		/*
 		 * Pick up the MR outside of the DEBUG_LOG() call:
 		 * preprocessor directives may not appear inside a macro's
 		 * argument list.
 		 */
 #ifdef BIND_INFO
 		cb->bind_attr.bind_info.addr = buf;
 		bound_mr = cb->bind_attr.bind_info.mr;
 #else
 		cb->bind_attr.addr = buf;
 		bound_mr = cb->bind_attr.mr;
 #endif
 		DEBUG_LOG(cb, "binding mw rkey 0x%x to buf %jx mr rkey 0x%x\n",
 			cb->mw->rkey, (uintmax_t)buf, bound_mr->rkey);
 		ret = ib_bind_mw(cb->qp, cb->mw, &cb->bind_attr);
 		if (ret) {
 			PRINTF(cb, "bind mw error %d\n", ret);
 			cb->state = ERROR;
 		} else
 			rkey = cb->mw->rkey;
 		break;
 	case MR:
 		if (buf == (u64)cb->start_dma_addr)
 			rkey = cb->start_mr->rkey;
 		else
 			rkey = cb->rdma_mr->rkey;
 		break;
 	case DMA:
 		rkey = cb->dma_mr->rkey;
 		break;
 	default:
 		PRINTF(cb, "%s:%d case ERROR\n", __func__, __LINE__);
 		cb->state = ERROR;
 		break;
 	}
 	return rkey;
 }
 
 /*
  * Fill the send buffer with the advertisement (address, rkey, size in
  * network byte order) for the buffer at DMA address buf.
  *
  * Only the client, or either side of the wlat/rlat/bw/frtest tests,
  * advertises a buffer; plain server sends carry no data, so nothing is
  * written for them.  The rkey is obtained (and the fastreg/mw rebind
  * performed) through krping_rdma_rkey().
  */
 static void krping_format_send(struct krping_cb *cb, u64 buf)
 {
 	struct krping_rdma_info *info = &cb->send_buf;
 	u32 rkey;
 
 	/* Plain server side: nothing to advertise. */
 	if (cb->server && !cb->wlat && !cb->rlat && !cb->bw && !cb->frtest)
 		return;
 
 	rkey = krping_rdma_rkey(cb, buf, !cb->server_invalidate);
 
 	info->buf = htonll(buf);
 	info->rkey = htonl(rkey);
 	info->size = htonl(cb->size);
 
 	DEBUG_LOG(cb, "RDMA addr %llx rkey %x len %d\n",
 		  (unsigned long long)buf, rkey, cb->size);
 }
 
 /*
  * Server side of the default ping test.  Each iteration:
  *   1. wait for the client's source advertisement (RDMA_READ_ADV),
  *   2. RDMA-read the client's data into cb->rdma_buf,
  *   3. send a "go ahead" message,
  *   4. wait for the client's sink advertisement (RDMA_WRITE_ADV),
  *   5. RDMA-write the data back and send another "go ahead".
  *
  * The loop ends as soon as a wait finishes in an unexpected state or a
  * post fails.
  */
 static void krping_test_server(struct krping_cb *cb)
 {
 	struct ib_send_wr *bad_wr, inv;
 	int ret;
 
 	while (1) {
 		/* Wait for client's Start STAG/TO/Len */
 		wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_ADV);
 		if (cb->state != RDMA_READ_ADV) {
 			PRINTF(cb, "wait for RDMA_READ_ADV state %d\n",
 				cb->state);
 			break;
 		}
 
 		DEBUG_LOG(cb, "server received sink adv\n");
 
 		cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
 		cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
 		cb->rdma_sq_wr.sg_list->length = cb->remote_len;
 		cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 1);
 
 		/* Issue RDMA Read. */
 		if (cb->read_inv)
 			cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
 		else {
 
 			cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
 			if (cb->mem == FASTREG) {
 				/* 
 				 * Immediately follow the read with a 
 				 * fenced LOCAL_INV.
 				 */
 				cb->rdma_sq_wr.next = &inv;
 				memset(&inv, 0, sizeof inv);
 				inv.opcode = IB_WR_LOCAL_INV;
 				inv.ex.invalidate_rkey = cb->fastreg_mr->rkey;
 				inv.send_flags = IB_SEND_FENCE;
 			}
 		}
 
 		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
 		/*
 		 * Unlink the on-stack LOCAL_INV before looking at ret so
 		 * that cb->rdma_sq_wr never keeps a pointer to this stack
 		 * frame when the post fails and the function returns.
 		 */
 		cb->rdma_sq_wr.next = NULL;
 		if (ret) {
 			PRINTF(cb, "post send error %d\n", ret);
 			break;
 		}
 
 		DEBUG_LOG(cb, "server posted rdma read req \n");
 
 		/* Wait for read completion */
 		wait_event_interruptible(cb->sem, 
 					 cb->state >= RDMA_READ_COMPLETE);
 		if (cb->state != RDMA_READ_COMPLETE) {
 			PRINTF(cb, 
 			       "wait for RDMA_READ_COMPLETE state %d\n",
 			       cb->state);
 			break;
 		}
 		DEBUG_LOG(cb, "server received read complete\n");
 
 		/* Display data in recv buf */
 		/* NOTE(review): strlen() assumes the data read from the peer is NUL-terminated. */
 		if (cb->verbose) {
 			if (strlen(cb->rdma_buf) > 128) {
 				char msgbuf[128];
 
 				strlcpy(msgbuf, cb->rdma_buf, sizeof(msgbuf));
 				PRINTF(cb, "server ping data stripped: %s\n",
 				       msgbuf);
 			} else
 				PRINTF(cb, "server ping data: %s\n",
 				       cb->rdma_buf);
 		}
 
 		/* Tell client to continue */
 		if (cb->server && cb->server_invalidate) {
 			cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey;
 			cb->sq_wr.opcode = IB_WR_SEND_WITH_INV;
 			DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey);
 		}
 		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
 		if (ret) {
 			PRINTF(cb, "post send error %d\n", ret);
 			break;
 		}
 		DEBUG_LOG(cb, "server posted go ahead\n");
 
 		/* Wait for client's RDMA STAG/TO/Len */
 		wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV);
 		if (cb->state != RDMA_WRITE_ADV) {
 			PRINTF(cb, 
 			       "wait for RDMA_WRITE_ADV state %d\n",
 			       cb->state);
 			break;
 		}
 		DEBUG_LOG(cb, "server received sink adv\n");
 
 		/* RDMA Write echo data */
 		cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
 		cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
 		cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
 		cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1;
 		if (cb->local_dma_lkey)
 			cb->rdma_sgl.lkey = cb->qp->device->local_dma_lkey;
 		else
 			cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 0);
 
 		DEBUG_LOG(cb, "rdma write from lkey %x laddr %llx len %d\n",
 			  cb->rdma_sq_wr.sg_list->lkey,
 			  (unsigned long long)cb->rdma_sq_wr.sg_list->addr,
 			  cb->rdma_sq_wr.sg_list->length);
 
 		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
 		if (ret) {
 			PRINTF(cb, "post send error %d\n", ret);
 			break;
 		}
 
 		/* Wait for completion */
 		ret = wait_event_interruptible(cb->sem, cb->state >= 
 							 RDMA_WRITE_COMPLETE);
 		if (cb->state != RDMA_WRITE_COMPLETE) {
 			PRINTF(cb, 
 			       "wait for RDMA_WRITE_COMPLETE state %d\n",
 			       cb->state);
 			break;
 		}
 		DEBUG_LOG(cb, "server rdma write complete \n");
 
 		/* Rearm the state machine for the next iteration. */
 		cb->state = CONNECTED;
 
 		/* Tell client to begin again */
 		if (cb->server && cb->server_invalidate) {
 			cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey;
 			cb->sq_wr.opcode = IB_WR_SEND_WITH_INV;
 			DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey);
 		}
 		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
 		if (ret) {
 			PRINTF(cb, "post send error %d\n", ret);
 			break;
 		}
 		DEBUG_LOG(cb, "server posted go ahead\n");
 	}
 }
 
 /*
  * Read-latency test: issue cb->count RDMA reads of cb->size bytes, one
  * at a time, waiting for each completion before posting the next, and
  * print the elapsed wall-clock time.
  *
  * Completions are detected either by polling the CQ directly
  * (cb->poll set) or by sleeping until cb->state leaves RDMA_READ_ADV.
  * The test stops early, without printing the summary, on any post,
  * poll or completion error.
  */
 static void rlat_test(struct krping_cb *cb)
 {
 	struct timeval start_tv, stop_tv;
 	struct ib_send_wr *bad_wr;
 	struct ib_wc wc;
 	int iters = cb->count;
 	int scnt;
 	int ret;
 	int ne;
 
 	cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
 	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
 	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
 	cb->rdma_sq_wr.sg_list->length = cb->size;
 
 	microtime(&start_tv);
 	if (!cb->poll) {
 		/* Event driven mode: arm the CQ before the first read. */
 		cb->state = RDMA_READ_ADV;
 		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
 	}
 
 	for (scnt = 0; scnt < iters; scnt++) {
 		cb->state = RDMA_READ_ADV;
 		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
 		if (ret) {
 			PRINTF(cb, 
 				"Couldn't post send: ret=%d scnt %d\n",
 				ret, scnt);
 			return;
 		}
 
 		/* Wait for exactly one completion. */
 		do {
 			if (cb->poll) {
 				ne = ib_poll_cq(cb->cq, 1, &wc);
 			} else {
 				wait_event_interruptible(cb->sem, 
 					cb->state != RDMA_READ_ADV);
 				if (cb->state == RDMA_READ_COMPLETE) {
 					ne = 1;
 					ib_req_notify_cq(cb->cq, 
 						IB_CQ_NEXT_COMP);
 				} else {
 					ne = -1;
 				}
 			}
 			if (cb->state == ERROR) {
 				PRINTF(cb, 
 					"state == ERROR...bailing scnt %d\n", 
 					scnt);
 				return;
 			}
 		} while (ne == 0);
 
 		if (ne < 0) {
 			PRINTF(cb, "poll CQ failed %d\n", ne);
 			return;
 		}
 		/* wc is only filled in when the CQ was polled directly. */
 		if (cb->poll && wc.status != IB_WC_SUCCESS) {
 			PRINTF(cb, "Completion wth error at %s:\n",
 				cb->server ? "server" : "client");
 			PRINTF(cb, "Failed status %d: wr_id %d\n",
 				wc.status, (int) wc.wr_id);
 			return;
 		}
 	}
 	microtime(&stop_tv);
 
 	/* Borrow a second so the usec difference cannot go negative. */
 	if (stop_tv.tv_usec < start_tv.tv_usec) {
 		stop_tv.tv_sec  -= 1;
 		stop_tv.tv_usec += 1000000;
 	}
 
 	PRINTF(cb, "delta sec %lu delta usec %lu iter %d size %d\n",
 		(unsigned long)(stop_tv.tv_sec - start_tv.tv_sec),
 		(unsigned long)(stop_tv.tv_usec - start_tv.tv_usec),
 		scnt, cb->size);
 }
 
 static void wlat_test(struct krping_cb *cb)
 {
 	int ccnt, scnt, rcnt;
 	int iters=cb->count;
 	volatile char *poll_buf = (char *) cb->start_buf;
 	char *buf = (char *)cb->rdma_buf;
 	struct timeval start_tv, stop_tv;
 	cycles_t *post_cycles_start, *post_cycles_stop;
 	cycles_t *poll_cycles_start, *poll_cycles_stop;
 	cycles_t *last_poll_cycles_start;
 	cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
 	int i;
 	int cycle_iters = 1000;
 
 	ccnt = 0;
 	scnt = 0;
 	rcnt = 0;
 
 	post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
 	if (!post_cycles_start) {
 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
 		return;
 	}
 	post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
 	if (!post_cycles_stop) {
 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
 		return;
 	}
 	poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
 	if (!poll_cycles_start) {
 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
 		return;
 	}
 	poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
 	if (!poll_cycles_stop) {
 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
 		return;
 	}
 	last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), 
 		GFP_KERNEL);
 	if (!last_poll_cycles_start) {
 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
 		return;
 	}
 	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
 	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
 	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
 	cb->rdma_sq_wr.sg_list->length = cb->size;
 
 	if (cycle_iters > iters)
 		cycle_iters = iters;
 	microtime(&start_tv);
 	while (scnt < iters || ccnt < iters || rcnt < iters) {
 
 		/* Wait till buffer changes. */
 		if (rcnt < iters && !(scnt < 1 && !cb->server)) {
 			++rcnt;
 			while (*poll_buf != (char)rcnt) {
 				if (cb->state == ERROR) {
 					PRINTF(cb, 
 						"state = ERROR, bailing\n");
 					return;
 				}
 			}
 		}
 
 		if (scnt < iters) {
 			struct ib_send_wr *bad_wr;
 
 			*buf = (char)scnt+1;
 			if (scnt < cycle_iters)
 				post_cycles_start[scnt] = get_cycles();
 			if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
 				PRINTF(cb, 
 					"Couldn't post send: scnt=%d\n",
 					scnt);
 				return;
 			}
 			if (scnt < cycle_iters)
 				post_cycles_stop[scnt] = get_cycles();
 			scnt++;
 		}
 
 		if (ccnt < iters) {
 			struct ib_wc wc;
 			int ne;
 
 			if (ccnt < cycle_iters)
 				poll_cycles_start[ccnt] = get_cycles();
 			do {
 				if (ccnt < cycle_iters)
 					last_poll_cycles_start[ccnt] = 
 						get_cycles();
 				ne = ib_poll_cq(cb->cq, 1, &wc);
 			} while (ne == 0);
 			if (ccnt < cycle_iters)
 				poll_cycles_stop[ccnt] = get_cycles();
 			++ccnt;
 
 			if (ne < 0) {
 				PRINTF(cb, "poll CQ failed %d\n", ne);
 				return;
 			}
 			if (wc.status != IB_WC_SUCCESS) {
 				PRINTF(cb, 
 					"Completion wth error at %s:\n",
 					cb->server ? "server" : "client");
 				PRINTF(cb, 
 					"Failed status %d: wr_id %d\n",
 					wc.status, (int) wc.wr_id);
 				PRINTF(cb, 
 					"scnt=%d, rcnt=%d, ccnt=%d\n",
 					scnt, rcnt, ccnt);
 				return;
 			}
 		}
 	}
 	microtime(&stop_tv);
 
         if (stop_tv.tv_usec < start_tv.tv_usec) {
                 stop_tv.tv_usec += 1000000;
                 stop_tv.tv_sec  -= 1;
         }
 
 	for (i=0; i < cycle_iters; i++) {
 		sum_post += post_cycles_stop[i] - post_cycles_start[i];
 		sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
 		sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i];
 	}
 	PRINTF(cb,
 		"delta sec %lu delta usec %lu iter %d size %d cycle_iters %d"
 		" sum_post %llu sum_poll %llu sum_last_poll %llu\n",
 		(unsigned long)(stop_tv.tv_sec - start_tv.tv_sec),
 		(unsigned long)(stop_tv.tv_usec - start_tv.tv_usec),
 		scnt, cb->size, cycle_iters,
 		(unsigned long long)sum_post, (unsigned long long)sum_poll, 
 		(unsigned long long)sum_last_poll);
 	kfree(post_cycles_start);
 	kfree(post_cycles_stop);
 	kfree(poll_cycles_start);
 	kfree(poll_cycles_stop);
 	kfree(last_poll_cycles_start);
 }
 
 static void bw_test(struct krping_cb *cb)
 {
 	int ccnt, scnt, rcnt;
 	int iters=cb->count;
 	struct timeval start_tv, stop_tv;
 	cycles_t *post_cycles_start, *post_cycles_stop;
 	cycles_t *poll_cycles_start, *poll_cycles_stop;
 	cycles_t *last_poll_cycles_start;
 	cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
 	int i;
 	int cycle_iters = 1000;
 
 	ccnt = 0;
 	scnt = 0;
 	rcnt = 0;
 
 	post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
 	if (!post_cycles_start) {
 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
 		return;
 	}
 	post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
 	if (!post_cycles_stop) {
 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
 		return;
 	}
 	poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
 	if (!poll_cycles_start) {
 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
 		return;
 	}
 	poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
 	if (!poll_cycles_stop) {
 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
 		return;
 	}
 	last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), 
 		GFP_KERNEL);
 	if (!last_poll_cycles_start) {
 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
 		return;
 	}
 	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
 	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
 	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
 	cb->rdma_sq_wr.sg_list->length = cb->size;
 
 	if (cycle_iters > iters)
 		cycle_iters = iters;
 	microtime(&start_tv);
 	while (scnt < iters || ccnt < iters) {
 
 		while (scnt < iters && scnt - ccnt < cb->txdepth) {
 			struct ib_send_wr *bad_wr;
 
 			if (scnt < cycle_iters)
 				post_cycles_start[scnt] = get_cycles();
 			if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
 				PRINTF(cb, 
 					"Couldn't post send: scnt=%d\n",
 					scnt);
 				return;
 			}
 			if (scnt < cycle_iters)
 				post_cycles_stop[scnt] = get_cycles();
 			++scnt;
 		}
 
 		if (ccnt < iters) {
 			int ne;
 			struct ib_wc wc;
 
 			if (ccnt < cycle_iters)
 				poll_cycles_start[ccnt] = get_cycles();
 			do {
 				if (ccnt < cycle_iters)
 					last_poll_cycles_start[ccnt] = 
 						get_cycles();
 				ne = ib_poll_cq(cb->cq, 1, &wc);
 			} while (ne == 0);
 			if (ccnt < cycle_iters)
 				poll_cycles_stop[ccnt] = get_cycles();
 			ccnt += 1;
 
 			if (ne < 0) {
 				PRINTF(cb, "poll CQ failed %d\n", ne);
 				return;
 			}
 			if (wc.status != IB_WC_SUCCESS) {
 				PRINTF(cb, 
 					"Completion wth error at %s:\n",
 					cb->server ? "server" : "client");
 				PRINTF(cb, 
 					"Failed status %d: wr_id %d\n",
 					wc.status, (int) wc.wr_id);
 				return;
 			}
 		}
 	}
 	microtime(&stop_tv);
 
         if (stop_tv.tv_usec < start_tv.tv_usec) {
                 stop_tv.tv_usec += 1000000;
                 stop_tv.tv_sec  -= 1;
         }
 
 	for (i=0; i < cycle_iters; i++) {
 		sum_post += post_cycles_stop[i] - post_cycles_start[i];
 		sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
 		sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i];
 	}
 	PRINTF(cb,
 		"delta sec %lu delta usec %lu iter %d size %d cycle_iters %d"
 		" sum_post %llu sum_poll %llu sum_last_poll %llu\n",
 		(unsigned long)(stop_tv.tv_sec - start_tv.tv_sec),
 		(unsigned long)(stop_tv.tv_usec - start_tv.tv_usec),
 		scnt, cb->size, cycle_iters, 
 		(unsigned long long)sum_post, (unsigned long long)sum_poll, 
 		(unsigned long long)sum_last_poll);
 	kfree(post_cycles_start);
 	kfree(post_cycles_stop);
 	kfree(poll_cycles_start);
 	kfree(poll_cycles_stop);
 	kfree(last_poll_cycles_start);
 }
 
 /*
  * Server side of the read-latency test: wait for the client's
  * advertisement, advertise the start buffer in return, wait for that
  * send to complete and then sleep until the connection reaches the
  * ERROR state while the client performs its reads.
  */
 static void krping_rlat_test_server(struct krping_cb *cb)
 {
 	struct ib_send_wr *bad_wr;
 	struct ib_wc wc;
 	int ret;
 
 	/* Spin waiting for client's Start STAG/TO/Len */
 	while (cb->state < RDMA_READ_ADV) {
 		krping_cq_event_handler(cb->cq, cb);
 	}
 
 	/* Send STAG/TO/Len to client */
 	krping_format_send(cb, cb->start_dma_addr);
 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
 	if (ret) {
 		PRINTF(cb, "post send error %d\n", ret);
 		return;
 	}
 
 	/*
 	 * Spin waiting for send completion.  The assignment is
 	 * parenthesised so that ret holds ib_poll_cq()'s return value
 	 * (not the result of the comparison) and the error check below
 	 * can actually fire.
 	 */
 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0)
 		;
 	if (ret < 0) {
 		PRINTF(cb, "poll error %d\n", ret);
 		return;
 	}
 	if (wc.status) {
 		PRINTF(cb, "send completiong error %d\n", wc.status);
 		return;
 	}
 	wait_event_interruptible(cb->sem, cb->state == ERROR);
 }
 
 /*
  * Server side of the write-latency test: wait for the client's
  * advertisement, advertise the start buffer in return, wait for that
  * send to complete, run wlat_test() and then sleep until the
  * connection reaches the ERROR state.
  */
 static void krping_wlat_test_server(struct krping_cb *cb)
 {
 	struct ib_send_wr *bad_wr;
 	struct ib_wc wc;
 	int ret;
 
 	/* Spin waiting for client's Start STAG/TO/Len */
 	while (cb->state < RDMA_READ_ADV) {
 		krping_cq_event_handler(cb->cq, cb);
 	}
 
 	/* Send STAG/TO/Len to client */
 	krping_format_send(cb, cb->start_dma_addr);
 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
 	if (ret) {
 		PRINTF(cb, "post send error %d\n", ret);
 		return;
 	}
 
 	/*
 	 * Spin waiting for send completion.  The assignment is
 	 * parenthesised so that ret holds ib_poll_cq()'s return value
 	 * (not the result of the comparison) and the error check below
 	 * can actually fire.
 	 */
 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0)
 		;
 	if (ret < 0) {
 		PRINTF(cb, "poll error %d\n", ret);
 		return;
 	}
 	if (wc.status) {
 		PRINTF(cb, "send completiong error %d\n", wc.status);
 		return;
 	}
 
 	wlat_test(cb);
 	wait_event_interruptible(cb->sem, cb->state == ERROR);
 }
 
 /*
  * Server side of the bandwidth test: wait for the client's
  * advertisement, advertise the start buffer in return, wait for that
  * send to complete, run bw_test() as well when cb->duplex is set, and
  * then sleep until the connection reaches the ERROR state.
  */
 static void krping_bw_test_server(struct krping_cb *cb)
 {
 	struct ib_send_wr *bad_wr;
 	struct ib_wc wc;
 	int ret;
 
 	/* Spin waiting for client's Start STAG/TO/Len */
 	while (cb->state < RDMA_READ_ADV) {
 		krping_cq_event_handler(cb->cq, cb);
 	}
 
 	/* Send STAG/TO/Len to client */
 	krping_format_send(cb, cb->start_dma_addr);
 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
 	if (ret) {
 		PRINTF(cb, "post send error %d\n", ret);
 		return;
 	}
 
 	/*
 	 * Spin waiting for send completion.  The assignment is
 	 * parenthesised so that ret holds ib_poll_cq()'s return value
 	 * (not the result of the comparison) and the error check below
 	 * can actually fire.
 	 */
 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0)
 		;
 	if (ret < 0) {
 		PRINTF(cb, "poll error %d\n", ret);
 		return;
 	}
 	if (wc.status) {
 		PRINTF(cb, "send completiong error %d\n", wc.status);
 		return;
 	}
 
 	if (cb->duplex)
 		bw_test(cb);
 	wait_event_interruptible(cb->sem, cb->state == ERROR);
 }
 
 /*
  * Report whether the device behind the connection advertises
  * IB_DEVICE_MEM_MGT_EXTENSIONS, which the FASTREG mode relies on.
  *
  * server selects which cm_id's device is queried: the child cm_id when
  * non-zero, the primary cm_id otherwise.
  *
  * Returns 1 when the capability is present, 0 when it is missing or
  * the device query fails.
  */
 static int fastreg_supported(struct krping_cb *cb, int server)
 {
 	struct ib_device_attr attr;
 	struct ib_device *dev;
 	int ret;
 
 	if (server)
 		dev = cb->child_cm_id->device;
 	else
 		dev = cb->cm_id->device;
 
 	ret = ib_query_device(dev, &attr);
 	if (ret) {
 		PRINTF(cb, "ib_query_device failed ret %d\n", ret);
 		return 0;
 	}
 
 	if (attr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
 		DEBUG_LOG(cb, "Fastreg supported - device_cap_flags 0x%jx\n",
 			(uintmax_t)attr.device_cap_flags);
 		return 1;
 	}
 
 	PRINTF(cb, "Fastreg not supported - device_cap_flags 0x%llx\n",
 	    (unsigned long long)attr.device_cap_flags);
 	return 0;
 }
 
 /*
  * Bind the listening cm_id to cb->addr / cb->port, start listening
  * (backlog 3) and sleep until a connection request arrives.
  *
  * Returns 0 once a connection request is pending, the error from
  * rdma_bind_addr()/rdma_listen() if either fails, -1 if the wait ends
  * in any state other than CONNECT_REQUEST, or -EINVAL when FASTREG
  * mode was requested but the device does not support it.
  */
 static int krping_bind_server(struct krping_cb *cb)
 {
 	struct sockaddr_in sin;
 	int ret;
 
 	memset(&sin, 0, sizeof(sin));
 	sin.sin_family = AF_INET;
 	sin.sin_len = sizeof sin;
 	sin.sin_port = cb->port;
 	sin.sin_addr.s_addr = cb->addr.s_addr;
 
 	ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin);
 	if (ret != 0) {
 		PRINTF(cb, "rdma_bind_addr error %d\n", ret);
 		return ret;
 	}
 	DEBUG_LOG(cb, "rdma_bind_addr successful\n");
 
 	DEBUG_LOG(cb, "rdma_listen\n");
 	ret = rdma_listen(cb->cm_id, 3);
 	if (ret != 0) {
 		PRINTF(cb, "rdma_listen failed: %d\n", ret);
 		return ret;
 	}
 
 	/* Any state past CONNECT_REQUEST here means the wait went wrong. */
 	wait_event_interruptible(cb->sem, cb->state >= CONNECT_REQUEST);
 	if (cb->state != CONNECT_REQUEST) {
 		PRINTF(cb, "wait for CONNECT_REQUEST state %d\n",
 			cb->state);
 		return -1;
 	}
 
 	/* The capability is checked on the accepted (child) connection. */
 	if (cb->mem == FASTREG) {
 		if (!fastreg_supported(cb, 1))
 			return -EINVAL;
 	}
 
 	return 0;
 }
 
 /*
  * sq-depth worth of fastreg + 0B read-inv pairs, reposting them as the reads
  * complete.
  * NOTE: every 9 seconds we sleep for 1 tick to keep the kernel happy.
  */
 static void krping_fr_test5(struct krping_cb *cb)
 {
 	struct ib_fast_reg_page_list **pl;
 	struct ib_send_wr *fr, *read, *bad;
 	struct ib_wc wc;
 	struct ib_sge *sgl;
 	u8 key = 0;
 	struct ib_mr **mr;
 	u8 **buf;
 	dma_addr_t *dma_addr;
 	int i;
 	int ret;
 	int plen = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
 	time_t start;
 	int count = 0;
 	int scnt;
 	int depth = cb->txdepth >> 1;
 
 	if (!depth) {
 		PRINTF(cb, "txdepth must be > 1 for this test!\n");
 		return;
 	}
 
 	pl = kzalloc(sizeof *pl * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s pl %p size %zu\n", __func__, pl, sizeof *pl * depth);
 	mr = kzalloc(sizeof *mr * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s mr %p size %zu\n", __func__, mr, sizeof *mr * depth);
 	fr = kzalloc(sizeof *fr * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s fr %p size %zu\n", __func__, fr, sizeof *fr * depth);
 	sgl = kzalloc(sizeof *sgl * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s sgl %p size %zu\n", __func__, sgl, sizeof *sgl * depth);
 	read = kzalloc(sizeof *read * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s read %p size %zu\n", __func__, read, sizeof *read * depth);
 	buf = kzalloc(sizeof *buf * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s buf %p size %zu\n", __func__, buf, sizeof *buf * depth);
 	dma_addr = kzalloc(sizeof *dma_addr * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s dma_addr %p size %zu\n", __func__, dma_addr, sizeof *dma_addr * depth);
 	if (!pl || !mr || !fr || !read || !sgl || !buf || !dma_addr) {
 		PRINTF(cb, "kzalloc failed\n");
 		goto err1;
 	}
 
 	for (scnt = 0; scnt < depth; scnt++) {
 		pl[scnt] = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
 		if (IS_ERR(pl[scnt])) {
 			PRINTF(cb, "alloc_fr_page_list failed %ld\n",
 			       PTR_ERR(pl[scnt]));
 			goto err2;
 		}
 		DEBUG_LOG(cb, "%s pl[%u] %p\n", __func__, scnt, pl[scnt]);
 
 		mr[scnt] = ib_alloc_fast_reg_mr(cb->pd, plen);
 		if (IS_ERR(mr[scnt])) {
 			PRINTF(cb, "alloc_fr failed %ld\n",
 			       PTR_ERR(mr[scnt]));
 			goto err2;
 		}
 		DEBUG_LOG(cb, "%s mr[%u] %p\n", __func__, scnt, mr[scnt]);
 		ib_update_fast_reg_key(mr[scnt], ++key);
 
 		buf[scnt] = kmalloc(cb->size, GFP_KERNEL);
 		if (!buf[scnt]) {
 			PRINTF(cb, "kmalloc failed\n");
 			ret = -ENOMEM;
 			goto err2;
 		}
 		DEBUG_LOG(cb, "%s buf[%u] %p\n", __func__, scnt, buf[scnt]);
-		dma_addr[scnt] = dma_map_single(cb->pd->device->dma_device,
+		dma_addr[scnt] = ib_dma_map_single(cb->pd->device,
 						   buf[scnt], cb->size,
 						   DMA_BIDIRECTIONAL);
 		if (dma_mapping_error(cb->pd->device->dma_device,
 		    dma_addr[scnt])) {
 			PRINTF(cb, "dma_map failed\n");
 			ret = -ENOMEM;
 			goto err2;
 		}
 		DEBUG_LOG(cb, "%s dma_addr[%u] %p\n", __func__, scnt, (void *)dma_addr[scnt]);
 		for (i=0; i<plen; i++) {
 			pl[scnt]->page_list[i] = ((unsigned long)dma_addr[scnt] & PAGE_MASK) + (i * PAGE_SIZE);
 			DEBUG_LOG(cb, "%s pl[%u]->page_list[%u] 0x%jx\n",
 				  __func__, scnt, i,  (uintmax_t)pl[scnt]->page_list[i]);
 		}
 
 		sgl[scnt].lkey = mr[scnt]->rkey;
 		sgl[scnt].length = cb->size;
 		sgl[scnt].addr = (u64)buf[scnt];
 		DEBUG_LOG(cb, "%s sgl[%u].lkey 0x%x length %u addr 0x%jx\n",
 			  __func__, scnt,  sgl[scnt].lkey, sgl[scnt].length,
 			  (uintmax_t)sgl[scnt].addr);
 
 		fr[scnt].opcode = IB_WR_FAST_REG_MR;
 		fr[scnt].wr_id = scnt;
 		fr[scnt].send_flags = 0;
 		fr[scnt].wr.fast_reg.page_shift = PAGE_SHIFT;
 		fr[scnt].wr.fast_reg.length = cb->size;
 		fr[scnt].wr.fast_reg.page_list = pl[scnt];
 		fr[scnt].wr.fast_reg.page_list_len = plen;
 		fr[scnt].wr.fast_reg.iova_start = (u64)buf[scnt];
 		fr[scnt].wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
 		fr[scnt].wr.fast_reg.rkey = mr[scnt]->rkey;
 		fr[scnt].next = &read[scnt];
 		read[scnt].opcode = IB_WR_RDMA_READ_WITH_INV;
 		read[scnt].wr_id = scnt;
 		read[scnt].send_flags = IB_SEND_SIGNALED;
 		read[scnt].wr.rdma.rkey = cb->remote_rkey;
 		read[scnt].wr.rdma.remote_addr = cb->remote_addr;
 		read[scnt].num_sge = 1;
 		read[scnt].sg_list = &sgl[scnt];
 		ret = ib_post_send(cb->qp, &fr[scnt], &bad);
 		if (ret) {
 			PRINTF(cb, "ib_post_send failed %d\n", ret);
 			goto err2;
 		}
 	}
 
 	start = time_uptime;
 	DEBUG_LOG(cb, "%s starting IO.\n", __func__);
 	while (!cb->count || cb->server || count < cb->count) {
 		if ((time_uptime - start) >= 9) {
 			DEBUG_LOG(cb, "%s pausing 1 tick! count %u\n", __func__,
 				  count);
 			wait_event_interruptible_timeout(cb->sem,
 							 cb->state == ERROR,
 							 1);
 			if (cb->state == ERROR)
 				break;
 			start = time_uptime;
 		}
 		do {
 			ret = ib_poll_cq(cb->cq, 1, &wc);
 			if (ret < 0) {
 				PRINTF(cb, "ib_poll_cq failed %d\n",
 				       ret);
 				goto err2;
 			}
 			if (ret == 1) {
 				if (wc.status) {
 					PRINTF(cb,
 					       "completion error %u wr_id %ju "
 					       "opcode %d\n", wc.status,
 					       (uintmax_t)wc.wr_id, wc.opcode);
 					goto err2;
 				}
 				count++;
 				if (count == cb->count)
 					break;
 				ib_update_fast_reg_key(mr[wc.wr_id], ++key);
 				fr[wc.wr_id].wr.fast_reg.rkey =
 					mr[wc.wr_id]->rkey;
 				sgl[wc.wr_id].lkey = mr[wc.wr_id]->rkey;
 				ret = ib_post_send(cb->qp, &fr[wc.wr_id], &bad);
 				if (ret) {
 					PRINTF(cb,
 					       "ib_post_send failed %d\n", ret);
 					goto err2;
 				}
 			} else if (krping_sigpending()) {
 				PRINTF(cb, "signal!\n");
 				goto err2;
 			}
 		} while (ret == 1);
 	}
 	DEBUG_LOG(cb, "%s done!\n", __func__);
 err2:
 	DEBUG_LOG(cb, "sleeping 1 second\n");
 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
 	DEBUG_LOG(cb, "draining the cq...\n");
 	do {
 		ret = ib_poll_cq(cb->cq, 1, &wc);
 		if (ret < 0) {
 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
 			break;
 		}
 		if (ret == 1) {
 			if (wc.status) {
 				PRINTF(cb, "completion error %u "
 				       "opcode %u\n", wc.status, wc.opcode);
 			}
 		}
 	} while (ret == 1);
 
 	DEBUG_LOG(cb, "destroying fr mrs!\n");
 	for (scnt = 0; scnt < depth; scnt++) {
 		if (mr[scnt]) {
 			ib_dereg_mr(mr[scnt]);
 			DEBUG_LOG(cb, "%s dereg mr %p\n", __func__, mr[scnt]);
 		}
 	}
 	DEBUG_LOG(cb, "unmapping/freeing bufs!\n");
 	for (scnt = 0; scnt < depth; scnt++) {
 		if (buf[scnt]) {
 			dma_unmap_single(cb->pd->device->dma_device,
 					 dma_addr[scnt], cb->size,
 					 DMA_BIDIRECTIONAL);
 			kfree(buf[scnt]);
 			DEBUG_LOG(cb, "%s unmap/free buf %p dma_addr %p\n", __func__, buf[scnt], (void *)dma_addr[scnt]);
 		}
 	}
 	DEBUG_LOG(cb, "destroying fr page lists!\n");
 	for (scnt = 0; scnt < depth; scnt++) {
 		if (pl[scnt]) {
 			DEBUG_LOG(cb, "%s free pl %p\n", __func__, pl[scnt]);
 			ib_free_fast_reg_page_list(pl[scnt]);
 		}
 	}
 err1:
 	if (pl)
 		kfree(pl);
 	if (mr)
 		kfree(mr);
 	if (fr)
 		kfree(fr);
 	if (read)
 		kfree(read);
 	if (sgl)
 		kfree(sgl);
 	if (buf)
 		kfree(buf);
 	if (dma_addr)
 		kfree(dma_addr);
 }
 /*
  * Passive server side used for fr tests 1-4 (see the testnum dispatch in
  * krping_run_server()): log, then sleep interruptibly on cb->sem until
  * cb->state becomes ERROR.
  */
 static void krping_fr_test_server(struct krping_cb *cb)
 {
 	DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__);
 	wait_event_interruptible(cb->sem, cb->state == ERROR);
 }
 
 /*
  * Server side of fr test 5.
  *
  * Spins until the client's advertisement has moved cb->state to at least
  * RDMA_READ_ADV, answers with this side's STAG/TO/Len, and, when
  * cb->duplex is set, runs krping_fr_test5() from this side as well.
  * Finally waits (interruptibly) for cb->state to become ERROR.
  */
 static void krping_fr_test5_server(struct krping_cb *cb)
 {
 	struct ib_send_wr *bad_wr;
 	struct ib_wc wc;
 	int ret;
 
 	/* Spin waiting for client's Start STAG/TO/Len */
 	while (cb->state < RDMA_READ_ADV) {
 		krping_cq_event_handler(cb->cq, cb);
 	}
 	DEBUG_LOG(cb, "%s client STAG %x TO 0x%jx\n", __func__,
 		  cb->remote_rkey, (uintmax_t)cb->remote_addr);
 
 	/* Send STAG/TO/Len to client */
 	krping_format_send(cb, cb->start_dma_addr);
 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
 	if (ret) {
 		PRINTF(cb, "post send error %d\n", ret);
 		return;
 	}
 
 	/*
 	 * Spin waiting for send completion.  The assignment is parenthesized
 	 * so that ret holds the ib_poll_cq() return value itself rather than
 	 * the result of the "== 0" comparison; otherwise the negative-return
 	 * check below could never fire and wc would be read uninitialized.
 	 */
 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0)
 		;
 	if (ret < 0) {
 		PRINTF(cb, "poll error %d\n", ret);
 		return;
 	}
 	if (wc.status) {
 		PRINTF(cb, "send completiong error %d\n", wc.status);
 		return;
 	}
 
 	if (cb->duplex)
 		krping_fr_test5(cb);
 	DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__);
 	wait_event_interruptible(cb->sem, cb->state == ERROR);
 }
 
 /*
  * Client side of fr test 5.
  *
  * Advertises this side's STAG/TO/Len to the server, waits for the send
  * completion and for the server's advertisement (cb->state reaching at
  * least RDMA_WRITE_ADV), then runs krping_fr_test5().
  */
 static void krping_fr_test5_client(struct krping_cb *cb)
 {
 	struct ib_send_wr *bad;
 	struct ib_wc wc;
 	int ret;
 
 	cb->state = RDMA_READ_ADV;
 
 	/* Send STAG/TO/Len to server */
 	krping_format_send(cb, cb->start_dma_addr);
 	if (cb->state == ERROR) {
 		PRINTF(cb, "krping_format_send failed\n");
 		return;
 	}
 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad);
 	if (ret) {
 		PRINTF(cb, "post send error %d\n", ret);
 		return;
 	}
 
 	/*
 	 * Spin waiting for send completion.  The assignment is parenthesized
 	 * so that ret holds the ib_poll_cq() return value itself rather than
 	 * the result of the "== 0" comparison; otherwise the negative-return
 	 * check below could never fire and wc would be read uninitialized.
 	 */
 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0)
 		;
 	if (ret < 0) {
 		PRINTF(cb, "poll error %d\n", ret);
 		return;
 	}
 	if (wc.status) {
 		PRINTF(cb, "send completion error %d\n", wc.status);
 		return;
 	}
 
 	/* Spin waiting for server's Start STAG/TO/Len */
 	while (cb->state < RDMA_WRITE_ADV) {
 		krping_cq_event_handler(cb->cq, cb);
 	}
 	DEBUG_LOG(cb, "%s server STAG %x TO 0x%jx\n", __func__, cb->remote_rkey,
 	    (uintmax_t)cb->remote_addr);
 
 	/* A void function may not "return expr;" in ISO C; just call it. */
 	krping_fr_test5(cb);
 }
 
 /*
  * sq-depth worth of write + fastreg + inv, reposting them as the invs
  * complete.
  * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy.
  * If a count is given, then the last IO will have a bogus lkey in the
  * write work request.  This reproduces a fw bug where the connection
  * will get stuck if a fastreg is processed while the ulptx is failing
  * the bad write.
  */
 static void krping_fr_test6(struct krping_cb *cb)
 {
 	struct ib_fast_reg_page_list **pl;
 	struct ib_send_wr *fr, *write, *inv, *bad;
 	struct ib_wc wc;
 	struct ib_sge *sgl;
 	u8 key = 0;
 	struct ib_mr **mr;
 	u8 **buf;
 	dma_addr_t *dma_addr;
 	int i;
 	int ret;
 	int plen = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
 	unsigned long start;
 	int count = 0;
 	int scnt;
 	int depth = cb->txdepth  / 3;
 
 	if (!depth) {
 		PRINTF(cb, "txdepth must be > 3 for this test!\n");
 		return;
 	}
 
 	pl = kzalloc(sizeof *pl * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s pl %p size %zu\n", __func__, pl, sizeof *pl * depth);
 
 	mr = kzalloc(sizeof *mr * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s mr %p size %zu\n", __func__, mr, sizeof *mr * depth);
 
 	fr = kzalloc(sizeof *fr * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s fr %p size %zu\n", __func__, fr, sizeof *fr * depth);
 
 	sgl = kzalloc(sizeof *sgl * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s sgl %p size %zu\n", __func__, sgl, sizeof *sgl * depth);
 
 	write = kzalloc(sizeof *write * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s read %p size %zu\n", __func__, write, sizeof *write * depth);
 
 	inv = kzalloc(sizeof *inv * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s inv %p size %zu\n", __func__, inv, sizeof *inv * depth);
 
 	buf = kzalloc(sizeof *buf * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s buf %p size %zu\n", __func__, buf, sizeof *buf * depth);
 
 	dma_addr = kzalloc(sizeof *dma_addr * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s dma_addr %p size %zu\n", __func__, dma_addr, sizeof *dma_addr * depth);
 
 	if (!pl || !mr || !fr || !write || !sgl || !buf || !dma_addr) {
 		PRINTF(cb, "kzalloc failed\n");
 		goto err1;
 	}
 
 	for (scnt = 0; scnt < depth; scnt++) {
 		pl[scnt] = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
 		if (IS_ERR(pl[scnt])) {
 			PRINTF(cb, "alloc_fr_page_list failed %ld\n",
 			       PTR_ERR(pl[scnt]));
 			goto err2;
 		}
 		DEBUG_LOG(cb, "%s pl[%u] %p\n", __func__, scnt, pl[scnt]);
 
 		mr[scnt] = ib_alloc_fast_reg_mr(cb->pd, plen);
 		if (IS_ERR(mr[scnt])) {
 			PRINTF(cb, "alloc_fr failed %ld\n",
 			       PTR_ERR(mr[scnt]));
 			goto err2;
 		}
 		DEBUG_LOG(cb, "%s mr[%u] %p\n", __func__, scnt, mr[scnt]);
 		ib_update_fast_reg_key(mr[scnt], ++key);
 
 		buf[scnt] = kmalloc(cb->size, GFP_KERNEL);
 		if (!buf[scnt]) {
 			PRINTF(cb, "kmalloc failed\n");
 			ret = -ENOMEM;
 			goto err2;
 		}
 		DEBUG_LOG(cb, "%s buf[%u] %p\n", __func__, scnt, buf[scnt]);
-		dma_addr[scnt] = dma_map_single(cb->pd->device->dma_device,
+		dma_addr[scnt] = ib_dma_map_single(cb->pd->device,
 						   buf[scnt], cb->size,
 						   DMA_BIDIRECTIONAL);
 		if (dma_mapping_error(cb->pd->device->dma_device,
 		    dma_addr[scnt])) {
 			PRINTF(cb, "dma_map failed\n");
 			ret = -ENOMEM;
 			goto err2;
 		}
 		DEBUG_LOG(cb, "%s dma_addr[%u] %p\n", __func__, scnt, (void *)dma_addr[scnt]);
 		for (i=0; i<plen; i++) {
 			pl[scnt]->page_list[i] = ((unsigned long)dma_addr[scnt] & PAGE_MASK) + (i * PAGE_SIZE);
 			DEBUG_LOG(cb, "%s pl[%u]->page_list[%u] 0x%jx\n",
 				  __func__, scnt, i,  (uintmax_t)pl[scnt]->page_list[i]);
 		}
 
 		write[scnt].opcode = IB_WR_RDMA_WRITE;
 		write[scnt].wr_id = scnt;
 		write[scnt].wr.rdma.rkey = cb->remote_rkey;
 		write[scnt].wr.rdma.remote_addr = cb->remote_addr;
 		write[scnt].num_sge = 1;
 		write[scnt].sg_list = &cb->rdma_sgl;
 		write[scnt].sg_list->length = cb->size;
 		write[scnt].next = &fr[scnt];
 
 		fr[scnt].opcode = IB_WR_FAST_REG_MR;
 		fr[scnt].wr_id = scnt;
 		fr[scnt].wr.fast_reg.page_shift = PAGE_SHIFT;
 		fr[scnt].wr.fast_reg.length = cb->size;
 		fr[scnt].wr.fast_reg.page_list = pl[scnt];
 		fr[scnt].wr.fast_reg.page_list_len = plen;
 		fr[scnt].wr.fast_reg.iova_start = (u64)buf[scnt];
 		fr[scnt].wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
 		fr[scnt].wr.fast_reg.rkey = mr[scnt]->rkey;
 		fr[scnt].next = &inv[scnt];
 
 		inv[scnt].opcode = IB_WR_LOCAL_INV;
 		inv[scnt].send_flags = IB_SEND_SIGNALED;
 		inv[scnt].ex.invalidate_rkey = mr[scnt]->rkey;
 
 		ret = ib_post_send(cb->qp, &write[scnt], &bad);
 		if (ret) {
 			PRINTF(cb, "ib_post_send failed %d\n", ret);
 			goto err2;
 		}
 	}
 
 	start = time_uptime;
 	DEBUG_LOG(cb, "%s starting IO.\n", __func__);
 	while (!cb->count || cb->server || count < cb->count) {
 		if ((time_uptime - start) >= 9) {
 			DEBUG_LOG(cb, "%s pausing 1 tick! count %u\n", __func__,
 				  count);
 			wait_event_interruptible_timeout(cb->sem,
 							 cb->state == ERROR,
 							 1);
 			if (cb->state == ERROR)
 				break;
 			start = time_uptime;
 		}
 		do {
 			ret = ib_poll_cq(cb->cq, 1, &wc);
 			if (ret < 0) {
 				PRINTF(cb, "ib_poll_cq failed %d\n",
 				       ret);
 				goto err2;
 			}
 			if (ret == 1) {
 				if (wc.status) {
 					PRINTF(cb,
 					       "completion error %u wr_id %ju "
 					       "opcode %d\n", wc.status,
 					       (uintmax_t)wc.wr_id, wc.opcode);
 					goto err2;
 				}
 				count++;
 				if (count == (cb->count -1))
 					cb->rdma_sgl.lkey = 0x00dead;
 				if (count == cb->count)
 					break;
 				ib_update_fast_reg_key(mr[wc.wr_id], ++key);
 				fr[wc.wr_id].wr.fast_reg.rkey =
 					mr[wc.wr_id]->rkey;
 				inv[wc.wr_id].ex.invalidate_rkey =
 					mr[wc.wr_id]->rkey;
 				ret = ib_post_send(cb->qp, &write[wc.wr_id], &bad);
 				if (ret) {
 					PRINTF(cb,
 					       "ib_post_send failed %d\n", ret);
 					goto err2;
 				}
 			} else if (krping_sigpending()){
 				PRINTF(cb, "signal!\n");
 				goto err2;
 			}
 		} while (ret == 1);
 	}
 	DEBUG_LOG(cb, "%s done!\n", __func__);
 err2:
 	DEBUG_LOG(cb, "sleeping 1 second\n");
 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
 	DEBUG_LOG(cb, "draining the cq...\n");
 	do {
 		ret = ib_poll_cq(cb->cq, 1, &wc);
 		if (ret < 0) {
 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
 			break;
 		}
 		if (ret == 1) {
 			if (wc.status) {
 				PRINTF(cb, "completion error %u "
 				       "opcode %u\n", wc.status, wc.opcode);
 			}
 		}
 	} while (ret == 1);
 
 	DEBUG_LOG(cb, "destroying fr mrs!\n");
 	for (scnt = 0; scnt < depth; scnt++) {
 		if (mr[scnt]) {
 			ib_dereg_mr(mr[scnt]);
 			DEBUG_LOG(cb, "%s dereg mr %p\n", __func__, mr[scnt]);
 		}
 	}
 	DEBUG_LOG(cb, "unmapping/freeing bufs!\n");
 	for (scnt = 0; scnt < depth; scnt++) {
 		if (buf[scnt]) {
 			dma_unmap_single(cb->pd->device->dma_device,
 					 dma_addr[scnt], cb->size,
 					 DMA_BIDIRECTIONAL);
 			kfree(buf[scnt]);
 			DEBUG_LOG(cb, "%s unmap/free buf %p dma_addr %p\n", __func__, buf[scnt], (void *)dma_addr[scnt]);
 		}
 	}
 	DEBUG_LOG(cb, "destroying fr page lists!\n");
 	for (scnt = 0; scnt < depth; scnt++) {
 		if (pl[scnt]) {
 			DEBUG_LOG(cb, "%s free pl %p\n", __func__, pl[scnt]);
 			ib_free_fast_reg_page_list(pl[scnt]);
 		}
 	}
 err1:
 	if (pl)
 		kfree(pl);
 	if (mr)
 		kfree(mr);
 	if (fr)
 		kfree(fr);
 	if (write)
 		kfree(write);
 	if (inv)
 		kfree(inv);
 	if (sgl)
 		kfree(sgl);
 	if (buf)
 		kfree(buf);
 	if (dma_addr)
 		kfree(dma_addr);
 }
 
 /*
  * Server side of fr test 6.
  *
  * Spins until the client's advertisement has moved cb->state to at least
  * RDMA_READ_ADV, answers with this side's STAG/TO/Len, and, when
  * cb->duplex is set, runs krping_fr_test6() from this side as well.
  * Finally waits (interruptibly) for cb->state to become ERROR.
  */
 static void krping_fr_test6_server(struct krping_cb *cb)
 {
 	struct ib_send_wr *bad_wr;
 	struct ib_wc wc;
 	int ret;
 
 	/* Spin waiting for client's Start STAG/TO/Len */
 	while (cb->state < RDMA_READ_ADV) {
 		krping_cq_event_handler(cb->cq, cb);
 	}
 	DEBUG_LOG(cb, "%s client STAG %x TO 0x%jx\n", __func__,
 		  cb->remote_rkey, (uintmax_t)cb->remote_addr);
 
 	/* Send STAG/TO/Len to client */
 	krping_format_send(cb, cb->start_dma_addr);
 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
 	if (ret) {
 		PRINTF(cb, "post send error %d\n", ret);
 		return;
 	}
 
 	/*
 	 * Spin waiting for send completion.  The assignment is parenthesized
 	 * so that ret holds the ib_poll_cq() return value itself rather than
 	 * the result of the "== 0" comparison; otherwise the negative-return
 	 * check below could never fire and wc would be read uninitialized.
 	 */
 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0)
 		;
 	if (ret < 0) {
 		PRINTF(cb, "poll error %d\n", ret);
 		return;
 	}
 	if (wc.status) {
 		PRINTF(cb, "send completiong error %d\n", wc.status);
 		return;
 	}
 
 	if (cb->duplex)
 		krping_fr_test6(cb);
 	DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__);
 	wait_event_interruptible(cb->sem, cb->state == ERROR);
 }
 
 /*
  * Client side of fr test 6.
  *
  * Advertises this side's STAG/TO/Len to the server, waits for the send
  * completion and for the server's advertisement (cb->state reaching at
  * least RDMA_WRITE_ADV), then runs krping_fr_test6().
  */
 static void krping_fr_test6_client(struct krping_cb *cb)
 {
 	struct ib_send_wr *bad;
 	struct ib_wc wc;
 	int ret;
 
 	cb->state = RDMA_READ_ADV;
 
 	/* Send STAG/TO/Len to server */
 	krping_format_send(cb, cb->start_dma_addr);
 	if (cb->state == ERROR) {
 		PRINTF(cb, "krping_format_send failed\n");
 		return;
 	}
 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad);
 	if (ret) {
 		PRINTF(cb, "post send error %d\n", ret);
 		return;
 	}
 
 	/*
 	 * Spin waiting for send completion.  The assignment is parenthesized
 	 * so that ret holds the ib_poll_cq() return value itself rather than
 	 * the result of the "== 0" comparison; otherwise the negative-return
 	 * check below could never fire and wc would be read uninitialized.
 	 */
 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0)
 		;
 	if (ret < 0) {
 		PRINTF(cb, "poll error %d\n", ret);
 		return;
 	}
 	if (wc.status) {
 		PRINTF(cb, "send completion error %d\n", wc.status);
 		return;
 	}
 
 	/* Spin waiting for server's Start STAG/TO/Len */
 	while (cb->state < RDMA_WRITE_ADV) {
 		krping_cq_event_handler(cb->cq, cb);
 	}
 	DEBUG_LOG(cb, "%s server STAG %x TO 0x%jx\n", __func__, cb->remote_rkey,
 	    (uintmax_t)cb->remote_addr);
 
 	/* A void function may not "return expr;" in ISO C; just call it. */
 	krping_fr_test6(cb);
 }
 
 /*
  * Server entry point: bind, set up the QP and buffers, post the first
  * receive, accept the connection and then run whichever test the cb flags
  * select.  Teardown happens in reverse order of setup; each failure jumps
  * to the label that undoes only what has been set up so far.
  */
 static void krping_run_server(struct krping_cb *cb)
 {
 	struct ib_recv_wr *bad_recv;
 	int rc;
 
 	if (krping_bind_server(cb))
 		return;
 
 	rc = krping_setup_qp(cb, cb->child_cm_id);
 	if (rc) {
 		PRINTF(cb, "setup_qp failed: %d\n", rc);
 		goto out_id;
 	}
 
 	rc = krping_setup_buffers(cb);
 	if (rc) {
 		PRINTF(cb, "krping_setup_buffers failed: %d\n", rc);
 		goto out_qp;
 	}
 
 	rc = ib_post_recv(cb->qp, &cb->rq_wr, &bad_recv);
 	if (rc) {
 		PRINTF(cb, "ib_post_recv failed: %d\n", rc);
 		goto out_buffers;
 	}
 
 	rc = krping_accept(cb);
 	if (rc) {
 		PRINTF(cb, "connect error %d\n", rc);
 		goto out_buffers;
 	}
 
 	/* Test selection priority: wlat, rlat, bw, fr tests, plain ping. */
 	if (cb->wlat) {
 		krping_wlat_test_server(cb);
 	} else if (cb->rlat) {
 		krping_rlat_test_server(cb);
 	} else if (cb->bw) {
 		krping_bw_test_server(cb);
 	} else if (!cb->frtest) {
 		krping_test_server(cb);
 	} else {
 		switch (cb->testnum) {
 		case 1:
 		case 2:
 		case 3:
 		case 4:
 			/* Tests 1-4 share the passive server. */
 			krping_fr_test_server(cb);
 			break;
 		case 5:
 			krping_fr_test5_server(cb);
 			break;
 		case 6:
 			krping_fr_test6_server(cb);
 			break;
 		default:
 			/* Unknown test: skip the disconnect, just tear down. */
 			PRINTF(cb, "unknown fr test %d\n", cb->testnum);
 			goto out_buffers;
 		}
 	}
 	rdma_disconnect(cb->child_cm_id);
 out_buffers:
 	krping_free_buffers(cb);
 out_qp:
 	krping_free_qp(cb);
 out_id:
 	rdma_destroy_id(cb->child_cm_id);
 }
 
 /*
  * Default ping client loop.
  *
  * Runs cb->count iterations (or forever when cb->count is 0).  Each
  * iteration fills cb->start_buf with a text pattern, advertises the start
  * buffer, waits for the state to reach RDMA_WRITE_ADV, advertises the rdma
  * buffer, waits for RDMA_WRITE_COMPLETE, and then optionally validates
  * and/or prints the data.  Any failure ends the loop.
  */
 static void krping_test_client(struct krping_cb *cb)
 {
 	int ping, start, cc, i, ret;
 	struct ib_send_wr *bad_wr;
 	unsigned char c;
 
 	start = 65;
 	for (ping = 0; !cb->count || ping < cb->count; ping++) {
 		cb->state = RDMA_READ_ADV;
 
 		/*
 		 * Put some ascii text in the buffer.  The header is written
 		 * with snprintf() so that it cannot run past the cb->size
 		 * bytes this code treats as the buffer length.  If the
 		 * header is truncated cc >= cb->size and the fill loop
 		 * below simply does not execute.
 		 */
 		cc = snprintf(cb->start_buf, cb->size, "rdma-ping-%d: ", ping);
 		for (i = cc, c = start; i < cb->size; i++) {
 			cb->start_buf[i] = c;
 			c++;
 			if (c > 122)
 				c = 65;
 		}
 		/* Rotate the first fill character through 65..122. */
 		start++;
 		if (start > 122)
 			start = 65;
 		cb->start_buf[cb->size - 1] = 0;
 
 		krping_format_send(cb, cb->start_dma_addr);
 		if (cb->state == ERROR) {
 			PRINTF(cb, "krping_format_send failed\n");
 			break;
 		}
 		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
 		if (ret) {
 			PRINTF(cb, "post send error %d\n", ret);
 			break;
 		}
 
 		/* Wait for server to ACK */
 		wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV);
 		if (cb->state != RDMA_WRITE_ADV) {
 			PRINTF(cb, 
 			       "wait for RDMA_WRITE_ADV state %d\n",
 			       cb->state);
 			break;
 		}
 
 		krping_format_send(cb, cb->rdma_dma_addr);
 		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
 		if (ret) {
 			PRINTF(cb, "post send error %d\n", ret);
 			break;
 		}
 
 		/* Wait for the server to say the RDMA Write is complete. */
 		wait_event_interruptible(cb->sem, 
 					 cb->state >= RDMA_WRITE_COMPLETE);
 		if (cb->state != RDMA_WRITE_COMPLETE) {
 			PRINTF(cb, 
 			       "wait for RDMA_WRITE_COMPLETE state %d\n",
 			       cb->state);
 			break;
 		}
 
 		if (cb->validate) {
 			if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) {
 				PRINTF(cb, "data mismatch!\n");
 				break;
 			}
 		}
 
 		if (cb->verbose) {
 			if (strlen(cb->rdma_buf) > 128) {
 				char msgbuf[128];
 
 				/* Print only what fits in msgbuf. */
 				strlcpy(msgbuf, cb->rdma_buf, sizeof(msgbuf));
 				PRINTF(cb, "ping data stripped: %s\n",
 				       msgbuf);
 			} else
 				PRINTF(cb, "ping data: %s\n", cb->rdma_buf);
 		}
 #ifdef SLOW_KRPING
 		wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
 #endif
 	}
 }
 
 /*
  * Client side of the read-latency test.
  *
  * Advertises this side's STAG/TO/Len, waits for the send completion and
  * for the server's advertisement (cb->state reaching at least
  * RDMA_WRITE_ADV), then runs rlat_test().
  */
 static void krping_rlat_test_client(struct krping_cb *cb)
 {
 	struct ib_send_wr *bad_wr;
 	struct ib_wc wc;
 	int ret;
 
 	cb->state = RDMA_READ_ADV;
 
 	/* Send STAG/TO/Len to server */
 	krping_format_send(cb, cb->start_dma_addr);
 	if (cb->state == ERROR) {
 		PRINTF(cb, "krping_format_send failed\n");
 		return;
 	}
 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
 	if (ret) {
 		PRINTF(cb, "post send error %d\n", ret);
 		return;
 	}
 
 	/*
 	 * Spin waiting for send completion.  The assignment is parenthesized
 	 * so that ret holds the ib_poll_cq() return value itself rather than
 	 * the result of the "== 0" comparison; otherwise the negative-return
 	 * check below could never fire and wc would be read uninitialized.
 	 */
 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0)
 		;
 	if (ret < 0) {
 		PRINTF(cb, "poll error %d\n", ret);
 		return;
 	}
 	if (wc.status) {
 		PRINTF(cb, "send completion error %d\n", wc.status);
 		return;
 	}
 
 	/* Spin waiting for server's Start STAG/TO/Len */
 	while (cb->state < RDMA_WRITE_ADV) {
 		krping_cq_event_handler(cb->cq, cb);
 	}
 
 #if 0
 {
 	int i;
 	struct timeval start, stop;
 	time_t sec;
 	suseconds_t usec;
 	unsigned long long elapsed;
 	struct ib_wc wc;
 	struct ib_send_wr *bad_wr;
 	int ne;
 	
 	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
 	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
 	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
 	cb->rdma_sq_wr.sg_list->length = 0;
 	cb->rdma_sq_wr.num_sge = 0;
 
 	microtime(&start);
 	for (i=0; i < 100000; i++) {
 		if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
 			PRINTF(cb, "Couldn't post send\n");
 			return;
 		}
 		do {
 			ne = ib_poll_cq(cb->cq, 1, &wc);
 		} while (ne == 0);
 		if (ne < 0) {
 			PRINTF(cb, "poll CQ failed %d\n", ne);
 			return;
 		}
 		if (wc.status != IB_WC_SUCCESS) {
 			PRINTF(cb, "Completion wth error at %s:\n",
 				cb->server ? "server" : "client");
 			PRINTF(cb, "Failed status %d: wr_id %d\n",
 				wc.status, (int) wc.wr_id);
 			return;
 		}
 	}
 	microtime(&stop);
 	
 	if (stop.tv_usec < start.tv_usec) {
 		stop.tv_usec += 1000000;
 		stop.tv_sec  -= 1;
 	}
 	sec     = stop.tv_sec - start.tv_sec;
 	usec    = stop.tv_usec - start.tv_usec;
 	elapsed = sec * 1000000 + usec;
 	PRINTF(cb, "0B-write-lat iters 100000 usec %llu\n", elapsed);
 }
 #endif
 
 	rlat_test(cb);
 }
 
 /*
  * Client side of the write-latency test.
  *
  * Advertises this side's STAG/TO/Len, waits for the send completion and
  * for the server's advertisement (cb->state reaching at least
  * RDMA_WRITE_ADV), then runs wlat_test().
  */
 static void krping_wlat_test_client(struct krping_cb *cb)
 {
 	struct ib_send_wr *bad_wr;
 	struct ib_wc wc;
 	int ret;
 
 	cb->state = RDMA_READ_ADV;
 
 	/* Send STAG/TO/Len to server */
 	krping_format_send(cb, cb->start_dma_addr);
 	if (cb->state == ERROR) {
 		PRINTF(cb, "krping_format_send failed\n");
 		return;
 	}
 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
 	if (ret) {
 		PRINTF(cb, "post send error %d\n", ret);
 		return;
 	}
 
 	/*
 	 * Spin waiting for send completion.  The assignment is parenthesized
 	 * so that ret holds the ib_poll_cq() return value itself rather than
 	 * the result of the "== 0" comparison; otherwise the negative-return
 	 * check below could never fire and wc would be read uninitialized.
 	 */
 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0)
 		;
 	if (ret < 0) {
 		PRINTF(cb, "poll error %d\n", ret);
 		return;
 	}
 	if (wc.status) {
 		PRINTF(cb, "send completion error %d\n", wc.status);
 		return;
 	}
 
 	/* Spin waiting for server's Start STAG/TO/Len */
 	while (cb->state < RDMA_WRITE_ADV) {
 		krping_cq_event_handler(cb->cq, cb);
 	}
 
 	wlat_test(cb);
 }
 
 /*
  * Client side of the bandwidth test.
  *
  * Advertises this side's STAG/TO/Len, waits for the send completion and
  * for the server's advertisement (cb->state reaching at least
  * RDMA_WRITE_ADV), then runs bw_test().
  */
 static void krping_bw_test_client(struct krping_cb *cb)
 {
 	struct ib_send_wr *bad_wr;
 	struct ib_wc wc;
 	int ret;
 
 	cb->state = RDMA_READ_ADV;
 
 	/* Send STAG/TO/Len to server */
 	krping_format_send(cb, cb->start_dma_addr);
 	if (cb->state == ERROR) {
 		PRINTF(cb, "krping_format_send failed\n");
 		return;
 	}
 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
 	if (ret) {
 		PRINTF(cb, "post send error %d\n", ret);
 		return;
 	}
 
 	/*
 	 * Spin waiting for send completion.  The assignment is parenthesized
 	 * so that ret holds the ib_poll_cq() return value itself rather than
 	 * the result of the "== 0" comparison; otherwise the negative-return
 	 * check below could never fire and wc would be read uninitialized.
 	 */
 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0)
 		;
 	if (ret < 0) {
 		PRINTF(cb, "poll error %d\n", ret);
 		return;
 	}
 	if (wc.status) {
 		PRINTF(cb, "send completion error %d\n", wc.status);
 		return;
 	}
 
 	/* Spin waiting for server's Start STAG/TO/Len */
 	while (cb->state < RDMA_WRITE_ADV) {
 		krping_cq_event_handler(cb->cq, cb);
 	}
 
 	bw_test(cb);
 }
 
 
 /*
  * fastreg 2 valid different mrs and verify the completions.
  *
  * One page list is shared by both registrations.  After posting, the CQ is
  * polled (with a 1 second wait between polls) until two completions have
  * been seen, a signal is pending, or polling fails; then the CQ is drained
  * and the MRs and page list are released.
  */
 static void krping_fr_test1(struct krping_cb *cb)
 {
 	struct ib_fast_reg_page_list *pl;
 	struct ib_send_wr fr, *bad;
 	struct ib_wc wc;
 	struct ib_mr *mr1, *mr2;
 	int i;
 	int ret;
 	int size = cb->size;
 	int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
 	int count = 0;
 
 	pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
 	if (IS_ERR(pl)) {
 		PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl));
 		return;
 	}
 
 	mr1 = ib_alloc_fast_reg_mr(cb->pd, plen);
 	if (IS_ERR(mr1)) {
 		/* Report the error carried by mr1, not the (valid) pl. */
 		PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(mr1));
 		goto err1;
 	}
 	mr2 = ib_alloc_fast_reg_mr(cb->pd, plen);
 	if (IS_ERR(mr2)) {
 		/* Report the error carried by mr2, not the (valid) pl. */
 		PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(mr2));
 		goto err2;
 	}
 
 
 	for (i=0; i<plen; i++)
 		pl->page_list[i] = i * PAGE_SIZE;
 
 	memset(&fr, 0, sizeof fr);
 	fr.opcode = IB_WR_FAST_REG_MR;
 	fr.wr_id = 1;
 	fr.wr.fast_reg.page_shift = PAGE_SHIFT;
 	fr.wr.fast_reg.length = size;
 	fr.wr.fast_reg.page_list = pl;
 	fr.wr.fast_reg.page_list_len = plen;
 	fr.wr.fast_reg.iova_start = 0;
 	fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
 	fr.send_flags = IB_SEND_SIGNALED;
 	fr.wr.fast_reg.rkey = mr1->rkey;
 	DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth);
 	ret = ib_post_send(cb->qp, &fr, &bad);
 	if (ret) {
 		PRINTF(cb, "ib_post_send failed %d\n", ret);
 		goto err3;
 	}
 	/* Same work request again, now targeting the second MR. */
 	fr.wr.fast_reg.rkey = mr2->rkey;
 	DEBUG_LOG(cb, "%s fr2: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth);
 	ret = ib_post_send(cb->qp, &fr, &bad);
 	if (ret) {
 		PRINTF(cb, "ib_post_send failed %d\n", ret);
 		goto err3;
 	}
 
 	DEBUG_LOG(cb, "sleeping 1 second\n");
 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
 	do {
 		ret = ib_poll_cq(cb->cq, 1, &wc);
 		if (ret < 0) {
 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
 			goto err3;
 		}
 		if (ret == 1) {
 			DEBUG_LOG(cb, "completion status %u wr %s\n",
 				  wc.status, wc.wr_id == 1 ? "fr" : "inv");
 			count++;
 		} else if (krping_sigpending()) {
 			PRINTF(cb, "signal!\n");
 			goto err3;
 		}
 
 		wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
 	} while (count != 2);
 err3:
 	DEBUG_LOG(cb, "sleeping 1 second\n");
 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
 	DEBUG_LOG(cb, "draining the cq...\n");
 	do {
 		ret = ib_poll_cq(cb->cq, 1, &wc);
 		if (ret < 0) {
 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
 			break;
 		}
 		if (ret == 1) {
 			PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode);
 		}
 	} while (ret == 1);
 	DEBUG_LOG(cb, "destroying fr mr2!\n");
 
 	ib_dereg_mr(mr2);
 err2:
 	DEBUG_LOG(cb, "destroying fr mr1!\n");
 	ib_dereg_mr(mr1);
 err1:
 	DEBUG_LOG(cb, "destroying fr page list!\n");
 	ib_free_fast_reg_page_list(pl);
 	DEBUG_LOG(cb, "%s done!\n", __func__);
 }
 
 /*
  * fastreg the same mr twice, 2nd one should produce error cqe.
  *
  * The identical work request is posted twice.  The CQ is then polled (with
  * a 1 second wait between polls) until two completions have been seen, a
  * signal is pending, or polling fails; afterwards the CQ is drained and
  * the MR and page list are released.
  */
 static void krping_fr_test2(struct krping_cb *cb)
 {
 	struct ib_fast_reg_page_list *pl;
 	struct ib_send_wr fr, *bad;
 	struct ib_wc wc;
 	struct ib_mr *mr1;
 	int i;
 	int ret;
 	int size = cb->size;
 	int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
 	int count = 0;
 
 	pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
 	if (IS_ERR(pl)) {
 		PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl));
 		return;
 	}
 
 	mr1 = ib_alloc_fast_reg_mr(cb->pd, plen);
 	if (IS_ERR(mr1)) {
 		/* Report the error carried by mr1, not the (valid) pl. */
 		PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(mr1));
 		goto err1;
 	}
 
 	for (i=0; i<plen; i++)
 		pl->page_list[i] = i * PAGE_SIZE;
 
 	memset(&fr, 0, sizeof fr);
 	fr.opcode = IB_WR_FAST_REG_MR;
 	fr.wr_id = 1;
 	fr.wr.fast_reg.page_shift = PAGE_SHIFT;
 	fr.wr.fast_reg.length = size;
 	fr.wr.fast_reg.page_list = pl;
 	fr.wr.fast_reg.page_list_len = plen;
 	fr.wr.fast_reg.iova_start = 0;
 	fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
 	fr.send_flags = IB_SEND_SIGNALED;
 	fr.wr.fast_reg.rkey = mr1->rkey;
 	DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth);
 	ret = ib_post_send(cb->qp, &fr, &bad);
 	if (ret) {
 		PRINTF(cb, "ib_post_send failed %d\n", ret);
 		goto err3;
 	}
 	/* Deliberately repost the unchanged request (same rkey). */
 	DEBUG_LOG(cb, "%s fr2: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth);
 	ret = ib_post_send(cb->qp, &fr, &bad);
 	if (ret) {
 		PRINTF(cb, "ib_post_send failed %d\n", ret);
 		goto err3;
 	}
 
 	DEBUG_LOG(cb, "sleeping 1 second\n");
 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
 	do {
 		ret = ib_poll_cq(cb->cq, 1, &wc);
 		if (ret < 0) {
 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
 			goto err3;
 		}
 		if (ret == 1) {
 			DEBUG_LOG(cb, "completion status %u wr %s\n",
 				  wc.status, wc.wr_id == 1 ? "fr" : "inv");
 			count++;
 		} else if (krping_sigpending()) {
 			PRINTF(cb, "signal!\n");
 			goto err3;
 		}
 		wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
 	} while (count != 2);
 err3:
 	DEBUG_LOG(cb, "sleeping 1 second\n");
 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
 	DEBUG_LOG(cb, "draining the cq...\n");
 	do {
 		ret = ib_poll_cq(cb->cq, 1, &wc);
 		if (ret < 0) {
 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
 			break;
 		}
 		if (ret == 1) {
 			PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode);
 		}
 	} while (ret == 1);
 	DEBUG_LOG(cb, "destroying fr mr1!\n");
 	ib_dereg_mr(mr1);
 err1:
 	DEBUG_LOG(cb, "destroying fr page list!\n");
 	ib_free_fast_reg_page_list(pl);
 	DEBUG_LOG(cb, "%s done!\n", __func__);
 }
 
 /*
  * fastreg pipelined in a loop as fast as we can until the user interrupts.
  * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy.
  *
  * Each posted chain is a fastreg followed by a local invalidate, both
  * signaled, so scnt is advanced by 2 per post and decremented once per
  * completion.  The registered length is randomized on every post.
  */
 static void krping_fr_test3(struct krping_cb *cb)
 {
 	struct ib_fast_reg_page_list *pl;
 	struct ib_send_wr fr, inv, *bad;
 	struct ib_wc wc;
 	u8 key = 0;
 	struct ib_mr *mr;
 	int i;
 	int ret;
 	int size = cb->size;
 	int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
 	unsigned long start;
 	int count = 0;
 	int scnt = 0;
 
 
 	pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
 	if (IS_ERR(pl)) {
 		PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl));
 		return;
 	}
 
 	mr = ib_alloc_fast_reg_mr(cb->pd, plen);
 	if (IS_ERR(mr)) {
 		/* Report the error carried by mr, not the (valid) pl. */
 		PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(mr));
 		goto err1;
 	}
 
 	for (i=0; i<plen; i++)
 		pl->page_list[i] = i * PAGE_SIZE;
 
 	memset(&fr, 0, sizeof fr);
 	fr.opcode = IB_WR_FAST_REG_MR;
 	fr.wr.fast_reg.page_shift = PAGE_SHIFT;
 	fr.wr.fast_reg.length = size;
 	fr.wr.fast_reg.page_list = pl;
 	fr.wr.fast_reg.page_list_len = plen;
 	fr.wr.fast_reg.iova_start = 0;
 	fr.send_flags = IB_SEND_SIGNALED;
 	fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
 	fr.next = &inv;
 	memset(&inv, 0, sizeof inv);
 	inv.opcode = IB_WR_LOCAL_INV;
 	inv.send_flags = IB_SEND_SIGNALED;
 
 	DEBUG_LOG(cb, "fr_test: stag index 0x%x plen %u size %u depth %u\n", mr->rkey >> 8, plen, cb->size, cb->txdepth);
 	start = time_uptime;
 	while (1) {
 		if ((time_uptime - start) >= 9) {
 			DEBUG_LOG(cb, "fr_test: pausing 1 second! count %u latest size %u plen %u\n", count, size, plen);
 			wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
 			if (cb->state == ERROR)
 				break;
 			start = time_uptime;
 		}
 		/* Keep up to txdepth/2 work requests outstanding. */
 		while (scnt < (cb->txdepth>>1)) {
 			ib_update_fast_reg_key(mr, ++key);
 			fr.wr.fast_reg.rkey = mr->rkey;
 			inv.ex.invalidate_rkey = mr->rkey;
 			/* Random length in 1..cb->size (0 maps to cb->size). */
 			size = arc4random() % cb->size;
 			if (size == 0)
 				size = cb->size;
 			plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
 			fr.wr.fast_reg.length = size;
 			fr.wr.fast_reg.page_list_len = plen;
 			ret = ib_post_send(cb->qp, &fr, &bad);
 			if (ret) {
 				PRINTF(cb, "ib_post_send failed %d\n", ret);
 				goto err2;
 			}
 			scnt+=2;
 		}
 
 		do {
 			ret = ib_poll_cq(cb->cq, 1, &wc);
 			if (ret < 0) {
 				PRINTF(cb, "ib_poll_cq failed %d\n", ret);
 				goto err2;
 			}
 			if (ret == 1) {
 				if (wc.status) {
 					PRINTF(cb, "completion error %u\n", wc.status);
 					goto err2;
 				}
 				count++;
 				scnt--;
 			}
 			else if (krping_sigpending()) {
 				PRINTF(cb, "signal!\n");
 				goto err2;
 			}
 		} while (ret == 1);
 	}
 err2:
 	DEBUG_LOG(cb, "sleeping 1 second\n");
 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
 	DEBUG_LOG(cb, "draining the cq...\n");
 	do {
 		ret = ib_poll_cq(cb->cq, 1, &wc);
 		if (ret < 0) {
 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
 			break;
 		}
 		if (ret == 1) {
 			if (wc.status) {
 				PRINTF(cb, "completion error %u opcode %u\n", wc.status, wc.opcode);
 			}
 		}
 	} while (ret == 1);
 	DEBUG_LOG(cb, "fr_test: done!\n");
 	ib_dereg_mr(mr);
 err1:
 	DEBUG_LOG(cb, "destroying fr page list!\n");
 	ib_free_fast_reg_page_list(pl);
 	DEBUG_LOG(cb, "%s done!\n", __func__);
 }
 
 /*
  * fastreg 1 and invalidate 1 mr and verify completion.
  *
  * Allocates a fast-reg page list and MR sized for cb->size, posts a signaled
  * IB_WR_FAST_REG_MR chained to an IB_WR_LOCAL_INV (posted with no send flags)
  * for the same rkey, then polls cb->cq until one completion has been seen or
  * a signal is pending.  Whatever is left on the CQ is drained and reported
  * before the MR and page list are released.
  */
 static void krping_fr_test4(struct krping_cb *cb)
 {
 	struct ib_fast_reg_page_list *pl;
 	struct ib_send_wr fr, inv, *bad;
 	struct ib_wc wc;
 	struct ib_mr *mr1;
 	int i;
 	int ret;
 	int size = cb->size;
 	/* Number of pages needed to cover 'size' bytes. */
 	int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
 	int count = 0;
 
 	pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
 	if (IS_ERR(pl)) {
 		PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl));
 		return;
 	}
 
 	mr1 = ib_alloc_fast_reg_mr(cb->pd, plen);
 	if (IS_ERR(mr1)) {
 		/* Report the error carried by mr1, not by the (valid) page list. */
 		PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(mr1));
 		goto err1;
 	}
 
 	/* Fill the page list with consecutive page-aligned addresses. */
 	for (i=0; i<plen; i++)
 		pl->page_list[i] = i * PAGE_SIZE;
 
 	/* Fast-register work request; wr_id 1 identifies it in the completion. */
 	memset(&fr, 0, sizeof fr);
 	fr.opcode = IB_WR_FAST_REG_MR;
 	fr.wr_id = 1;
 	fr.wr.fast_reg.page_shift = PAGE_SHIFT;
 	fr.wr.fast_reg.length = size;
 	fr.wr.fast_reg.page_list = pl;
 	fr.wr.fast_reg.page_list_len = plen;
 	fr.wr.fast_reg.iova_start = 0;
 	fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
 	fr.send_flags = IB_SEND_SIGNALED;
 	fr.wr.fast_reg.rkey = mr1->rkey;
 	fr.next = &inv;
 
 	/* Local invalidate of the same rkey, chained behind the fast-reg WR. */
 	memset(&inv, 0, sizeof inv);
 	inv.opcode = IB_WR_LOCAL_INV;
 	inv.ex.invalidate_rkey = mr1->rkey;
 
 	DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth);
 	ret = ib_post_send(cb->qp, &fr, &bad);
 	if (ret) {
 		PRINTF(cb, "ib_post_send failed %d\n", ret);
 		goto err3;
 	}
 	DEBUG_LOG(cb, "sleeping 1 second\n");
 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
 
 	/* Poll once per (up to) second until the first completion shows up. */
 	do {
 		ret = ib_poll_cq(cb->cq, 1, &wc);
 		if (ret < 0) {
 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
 			goto err3;
 		}
 		if (ret == 1) {
 			DEBUG_LOG(cb, "completion status %u wr %s\n",
 				  wc.status, wc.wr_id == 1 ? "fr" : "inv");
 			count++;
 		} else if (krping_sigpending()) {
 			PRINTF(cb, "signal!\n");
 			goto err3;
 		}
 		wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
 	} while (count != 1);
 err3:
 	DEBUG_LOG(cb, "sleeping 1 second\n");
 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
 	DEBUG_LOG(cb, "draining the cq...\n");
 	/* Report and discard any remaining completions. */
 	do {
 		ret = ib_poll_cq(cb->cq, 1, &wc);
 		if (ret < 0) {
 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
 			break;
 		}
 		if (ret == 1) {
 			PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode);
 		}
 	} while (ret == 1);
 	DEBUG_LOG(cb, "destroying fr mr1!\n");
 	ib_dereg_mr(mr1);
 err1:
 	DEBUG_LOG(cb, "destroying fr page list!\n");
 	ib_free_fast_reg_page_list(pl);
 	DEBUG_LOG(cb, "%s done!\n", __func__);
 }
 
 /*
  * Run the fast-register test selected by cb->testnum (1-6).  An unrecognised
  * test number is reported and otherwise ignored.
  */
 static void krping_fr_test(struct krping_cb *cb)
 {
 	switch (cb->testnum) {
 	case 1:
 		krping_fr_test1(cb);
 		break;
 	case 2:
 		krping_fr_test2(cb);
 		break;
 	case 3:
 		krping_fr_test3(cb);
 		break;
 	case 4:
 		krping_fr_test4(cb);
 		break;
 	case 5:
 		krping_fr_test5_client(cb);
 		break;
 	case 6:
 		krping_fr_test6_client(cb);
 		break;
 	default:
 		PRINTF(cb, "Unknown frtest num %u\n", cb->testnum);
 		break;
 	}
 }
 
 static int krping_connect_client(struct krping_cb *cb)
 {
 	struct rdma_conn_param conn_param;
 	int ret;
 
 	memset(&conn_param, 0, sizeof conn_param);
 	conn_param.responder_resources = 1;
 	conn_param.initiator_depth = 1;
 	conn_param.retry_count = 10;
 
 	ret = rdma_connect(cb->cm_id, &conn_param);
 	if (ret) {
 		PRINTF(cb, "rdma_connect error %d\n", ret);
 		return ret;
 	}
 
 	wait_event_interruptible(cb->sem, cb->state >= CONNECTED);
 	if (cb->state == ERROR) {
 		PRINTF(cb, "wait for CONNECTED state %d\n", cb->state);
 		return -1;
 	}
 
 	DEBUG_LOG(cb, "rdma_connect successful\n");
 	return 0;
 }
 
 static int krping_bind_client(struct krping_cb *cb)
 {
 	struct sockaddr_in sin;
 	int ret;
 
 	memset(&sin, 0, sizeof(sin));
 	sin.sin_len = sizeof sin;
 	sin.sin_family = AF_INET;
 	sin.sin_addr.s_addr = cb->addr.s_addr;
 	sin.sin_port = cb->port;
 
 	ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin,
 				2000);
 	if (ret) {
 		PRINTF(cb, "rdma_resolve_addr error %d\n", ret);
 		return ret;
 	}
 
 	wait_event_interruptible(cb->sem, cb->state >= ROUTE_RESOLVED);
 	if (cb->state != ROUTE_RESOLVED) {
 		PRINTF(cb, 
 		       "addr/route resolution did not resolve: state %d\n",
 		       cb->state);
 		return -EINTR;
 	}
 
 	if (cb->mem == FASTREG && !fastreg_supported(cb, 0))
 		return -EINVAL;
 
 	DEBUG_LOG(cb, "rdma_resolve_addr - rdma_resolve_route successful\n");
 	return 0;
 }
 
 /*
  * Client-side driver: resolve the server, set up the QP and buffers, post
  * the initial receive, connect, run the test selected by the cb flags
  * (wlat, rlat, bw, frtest, or the default ping test), disconnect, and tear
  * everything down again in reverse order.
  */
 static void krping_run_client(struct krping_cb *cb)
 {
 	struct ib_recv_wr *bad_recv;
 	int rc;
 
 	rc = krping_bind_client(cb);
 	if (rc != 0)
 		return;
 
 	rc = krping_setup_qp(cb, cb->cm_id);
 	if (rc != 0) {
 		PRINTF(cb, "setup_qp failed: %d\n", rc);
 		return;
 	}
 
 	rc = krping_setup_buffers(cb);
 	if (rc != 0) {
 		PRINTF(cb, "krping_setup_buffers failed: %d\n", rc);
 		goto free_qp;
 	}
 
 	rc = ib_post_recv(cb->qp, &cb->rq_wr, &bad_recv);
 	if (rc != 0) {
 		PRINTF(cb, "ib_post_recv failed: %d\n", rc);
 		goto free_buffers;
 	}
 
 	rc = krping_connect_client(cb);
 	if (rc != 0) {
 		PRINTF(cb, "connect error %d\n", rc);
 		goto free_buffers;
 	}
 
 	/* The first flag set, in this order, picks the test to run. */
 	if (cb->wlat) {
 		krping_wlat_test_client(cb);
 	} else if (cb->rlat) {
 		krping_rlat_test_client(cb);
 	} else if (cb->bw) {
 		krping_bw_test_client(cb);
 	} else if (cb->frtest) {
 		krping_fr_test(cb);
 	} else {
 		krping_test_client(cb);
 	}
 	rdma_disconnect(cb->cm_id);
 
 free_buffers:
 	krping_free_buffers(cb);
 free_qp:
 	krping_free_qp(cb);
 }
 
 /*
  * Parse a krping command string, validate the option combination, create
  * the RDMA CM id and run either the server or the client side of the test.
  *
  * cmd    - option string handed to krping_getopt().
  * cookie - opaque value stored in cb->cookie.
  *
  * The control block is linked on krping_cbs for the duration of the call
  * and freed before returning.  Returns 0 on success or a negative errno
  * value (-ENOMEM, -EINVAL, or the rdma_create_id() error) on failure.
  */
 int krping_doit(char *cmd, void *cookie)
 {
 	struct krping_cb *cb;
 	int op;
 	int ret = 0;
 	char *optarg;
 	unsigned long optint;
 
 	cb = kzalloc(sizeof(*cb), GFP_KERNEL);
 	if (!cb)
 		return -ENOMEM;
 
 	mutex_lock(&krping_mutex);
 	list_add_tail(&cb->list, &krping_cbs);
 	mutex_unlock(&krping_mutex);
 
 	/* Defaults; cb->server == -1 means neither -s nor -c was given. */
 	cb->cookie = cookie;
 	cb->server = -1;
 	cb->state = IDLE;
 	cb->size = 64;
 	cb->txdepth = RPING_SQ_DEPTH;
 	cb->mem = DMA;
 	init_waitqueue_head(&cb->sem);
 
 	/*
 	 * Keep parsing after a bad option so that every problem is reported;
 	 * 'ret' is checked once the whole string has been consumed.
 	 */
 	while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg,
 			      &optint)) != 0) {
 		switch (op) {
 		case 'a':
 			cb->addr_str = optarg;
 			DEBUG_LOG(cb, "ipaddr (%s)\n", optarg);
 			if (!inet_aton(optarg, &cb->addr)) {
 				PRINTF(cb, "bad addr string %s\n",
 				    optarg);
 				ret = -EINVAL;
 			}
 			break;
 		case 'p':
 			cb->port = htons(optint);
 			DEBUG_LOG(cb, "port %d\n", (int)optint);
 			break;
 		case 'P':
 			cb->poll = 1;
 			DEBUG_LOG(cb, "poll\n");
 			break;
 		case 's':
 			cb->server = 1;
 			DEBUG_LOG(cb, "server\n");
 			break;
 		case 'c':
 			cb->server = 0;
 			DEBUG_LOG(cb, "client\n");
 			break;
 		case 'S':
 			cb->size = optint;
 			if ((cb->size < 1) ||
 			    (cb->size > RPING_BUFSIZE)) {
 				PRINTF(cb, "Invalid size %d "
 				       "(valid range is 1 to %d)\n",
 				       cb->size, RPING_BUFSIZE);
 				ret = -EINVAL;
 			} else
 				DEBUG_LOG(cb, "size %d\n", (int)optint);
 			break;
 		case 'C':
 			cb->count = optint;
 			if (cb->count < 0) {
 				PRINTF(cb, "Invalid count %d\n",
 					cb->count);
 				ret = -EINVAL;
 			} else
 				DEBUG_LOG(cb, "count %d\n", (int) cb->count);
 			break;
 		case 'v':
 			cb->verbose++;
 			DEBUG_LOG(cb, "verbose\n");
 			break;
 		case 'V':
 			cb->validate++;
 			DEBUG_LOG(cb, "validate data\n");
 			break;
 		case 'l':
 			cb->wlat++;
 			break;
 		case 'L':
 			cb->rlat++;
 			break;
 		case 'B':
 			cb->bw++;
 			break;
 		case 'd':
 			cb->duplex++;
 			break;
 		case 'm':
 			if (!strncmp(optarg, "dma", 3))
 				cb->mem = DMA;
 			else if (!strncmp(optarg, "fastreg", 7))
 				cb->mem = FASTREG;
 			else if (!strncmp(optarg, "mw", 2))
 				cb->mem = MW;
 			else if (!strncmp(optarg, "mr", 2))
 				cb->mem = MR;
 			else {
 				PRINTF(cb, "unknown mem mode %s.  "
 					"Must be dma, fastreg, mw, or mr\n",
 					optarg);
 				ret = -EINVAL;
 			}
 			break;
 		case 'I':
 			cb->server_invalidate = 1;
 			break;
 		case 'T':
 			cb->txdepth = optint;
 			DEBUG_LOG(cb, "txdepth %d\n", (int) cb->txdepth);
 			break;
 		case 'Z':
 			cb->local_dma_lkey = 1;
 			DEBUG_LOG(cb, "using local dma lkey\n");
 			break;
 		case 'R':
 			cb->read_inv = 1;
 			DEBUG_LOG(cb, "using read-with-inv\n");
 			break;
 		case 'f':
 			cb->frtest = 1;
 			cb->testnum = optint;
 			DEBUG_LOG(cb, "fast-reg test!\n");
 			break;
 		default:
 			PRINTF(cb, "unknown opt %s\n", optarg);
 			ret = -EINVAL;
 			break;
 		}
 	}
 	if (ret)
 		goto out;
 
 	if (cb->server == -1) {
 		PRINTF(cb, "must be either client or server\n");
 		ret = -EINVAL;
 		goto out;
 	}
 
 	/* The special tests are mutually exclusive. */
 	if ((cb->frtest + cb->bw + cb->rlat + cb->wlat) > 1) {
 		PRINTF(cb, "Pick only one test: fr, bw, rlat, wlat\n");
 		ret = -EINVAL;
 		goto out;
 	}
 	if (cb->server_invalidate && cb->mem != FASTREG) {
 		PRINTF(cb, "server_invalidate only valid with fastreg mem_mode\n");
 		ret = -EINVAL;
 		goto out;
 	}
 
 	if (cb->read_inv && cb->mem != FASTREG) {
 		PRINTF(cb, "read_inv only valid with fastreg mem_mode\n");
 		ret = -EINVAL;
 		goto out;
 	}
 
 	if (cb->mem != MR && (cb->wlat || cb->rlat || cb->bw || cb->frtest)) {
 		PRINTF(cb, "wlat, rlat, and bw tests only support mem_mode MR\n");
 		ret = -EINVAL;
 		goto out;
 	}
 
 	cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP, IB_QPT_RC);
 	if (IS_ERR(cb->cm_id)) {
 		ret = PTR_ERR(cb->cm_id);
 		PRINTF(cb, "rdma_create_id error %d\n", ret);
 		goto out;
 	}
 	DEBUG_LOG(cb, "created cm_id %p\n", cb->cm_id);
 
 	if (cb->server)
 		krping_run_server(cb);
 	else
 		krping_run_client(cb);
 
 	DEBUG_LOG(cb, "destroy cm_id %p\n", cb->cm_id);
 	rdma_destroy_id(cb->cm_id);
 out:
 	/* Unlink under the list lock before freeing the control block. */
 	mutex_lock(&krping_mutex);
 	list_del(&cb->list);
 	mutex_unlock(&krping_mutex);
 	kfree(cb);
 	return ret;
 }
 
 void
 krping_walk_cb_list(void (*f)(struct krping_stats *, void *), void *arg)
 {
 	struct krping_cb *cb;
 
 	mutex_lock(&krping_mutex);
 	list_for_each_entry(cb, &krping_cbs, list)
 	    (*f)(cb->pd ? &cb->stats : NULL, arg);
 	mutex_unlock(&krping_mutex);
 }
 
 /*
  * One-time module setup: initialise krping_mutex, the lock taken around
  * every access to the krping_cbs list in this file.
  */
 void krping_init(void)
 {
 
 	mutex_init(&krping_mutex);
 }
Index: user/alc/PQ_LAUNDRY/sys/dev/cxgbe/t4_sge.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/dev/cxgbe/t4_sge.c	(revision 307895)
+++ user/alc/PQ_LAUNDRY/sys/dev/cxgbe/t4_sge.c	(revision 307896)
@@ -1,5259 +1,5242 @@
 /*-
  * Copyright (c) 2011 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/types.h>
 #include <sys/eventhandler.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
 #include <sys/sbuf.h>
 #include <sys/taskqueue.h>
 #include <sys/time.h>
 #include <sys/sglist.h>
 #include <sys/sysctl.h>
 #include <sys/smp.h>
 #include <sys/counter.h>
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_vlan_var.h>
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/tcp.h>
 #include <machine/in_cksum.h>
 #include <machine/md_var.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #ifdef DEV_NETMAP
 #include <machine/bus.h>
 #include <sys/selinfo.h>
 #include <net/if_var.h>
 #include <net/netmap.h>
 #include <dev/netmap/netmap_kern.h>
 #endif
 
 #include "common/common.h"
 #include "common/t4_regs.h"
 #include "common/t4_regs_values.h"
 #include "common/t4_msg.h"
 #include "t4_l2t.h"
 #include "t4_mp_ring.h"
 
 /*
  * RX_COPY_THRESHOLD is MINCLSIZE, reduced by 8 bytes when T4_PKT_TIMESTAMP
  * is defined (presumably to leave room for the timestamp -- confirm).  In
  * this file it bounds the INDICATESIZE value programmed into the chip.
  */
 #ifdef T4_PKT_TIMESTAMP
 #define RX_COPY_THRESHOLD (MINCLSIZE - 8)
 #else
 #define RX_COPY_THRESHOLD MINCLSIZE
 #endif
 
 /*
  * Ethernet frames are DMA'd at this byte offset into the freelist buffer.
  * 0-7 are valid values.
  */
 static int fl_pktshift = 2;
 TUNABLE_INT("hw.cxgbe.fl_pktshift", &fl_pktshift);
 
 /*
  * Pad ethernet payload up to this boundary.
  * -1: driver should figure out a good value.
  *  0: disable padding.
  *  Any power of 2 from 32 to 4096 (both inclusive) is also a valid value.
  */
 int fl_pad = -1;
 TUNABLE_INT("hw.cxgbe.fl_pad", &fl_pad);
 
 /*
  * Status page length.
  * -1: driver should figure out a good value.
  *  64 or 128 are the only other valid values.
  */
 static int spg_len = -1;
 TUNABLE_INT("hw.cxgbe.spg_len", &spg_len);
 
 /*
  * Congestion drops.
  * -1: no congestion feedback (not recommended).
  *  0: backpressure the channel instead of dropping packets right away.
  *  1: no backpressure, drop packets for the congested queue immediately.
  */
 static int cong_drop = 0;
 TUNABLE_INT("hw.cxgbe.cong_drop", &cong_drop);
 
 /*
  * Deliver multiple frames in the same free list buffer if they fit.
  * -1: let the driver decide whether to enable buffer packing or not.
  *  0: disable buffer packing.
  *  1: enable buffer packing.
  */
 static int buffer_packing = -1;
 TUNABLE_INT("hw.cxgbe.buffer_packing", &buffer_packing);
 
 /*
  * Start next frame in a packed buffer at this boundary.
  * -1: driver should figure out a good value.
  * T4: driver will ignore this and use the same value as fl_pad above.
  * T5: 16, or a power of 2 from 64 to 4096 (both inclusive) is a valid value.
  */
 static int fl_pack = -1;
 TUNABLE_INT("hw.cxgbe.fl_pack", &fl_pack);
 
 /*
  * Allow the driver to create mbuf(s) in a cluster allocated for rx.
  * 0: never; always allocate mbufs from the zone_mbuf UMA zone.
  * 1: ok to create mbuf(s) within a cluster if there is room.
  */
 static int allow_mbufs_in_cluster = 1;
 TUNABLE_INT("hw.cxgbe.allow_mbufs_in_cluster", &allow_mbufs_in_cluster);
 
 /*
  * Largest rx cluster size that the driver is allowed to allocate.
  */
 static int largest_rx_cluster = MJUM16BYTES;
 TUNABLE_INT("hw.cxgbe.largest_rx_cluster", &largest_rx_cluster);
 
 /*
  * Size of cluster allocation that's most likely to succeed.  The driver will
  * fall back to this size if it fails to allocate clusters larger than this.
  */
 static int safest_rx_cluster = PAGE_SIZE;
 TUNABLE_INT("hw.cxgbe.safest_rx_cluster", &safest_rx_cluster);
 
 /*
  * Running totals for a multi-packet tx work request; see try_txpkts(),
  * add_to_txpkts() and write_txpkts_wr(), which take a struct txpkts.
  */
 struct txpkts {
 	u_int wr_type;		/* type 0 or type 1 */
 	u_int npkt;		/* # of packets in this work request */
 	u_int plen;		/* total payload (sum of all packets) */
 	u_int len16;		/* # of 16B pieces used by this work request */
 };
 
 /*
  * A packet's SGL.  This + m_pkthdr has all info needed for tx.
  *
  * The sglist header is immediately followed by room for TX_SGL_SEGS
  * segments (presumably the storage that sg refers to -- confirm against the
  * code that initialises it).
  */
 struct sgl {
 	struct sglist sg;
 	struct sglist_seg seg[TX_SGL_SEGS];
 };
 
 static int service_iq(struct sge_iq *, int);
 static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t);
 static int t4_eth_rx(struct sge_iq *, const struct rss_header *, struct mbuf *);
 static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int);
 static inline void init_fl(struct adapter *, struct sge_fl *, int, int, char *);
 static inline void init_eq(struct adapter *, struct sge_eq *, int, int, uint8_t,
     uint16_t, char *);
 static int alloc_ring(struct adapter *, size_t, bus_dma_tag_t *, bus_dmamap_t *,
     bus_addr_t *, void **);
 static int free_ring(struct adapter *, bus_dma_tag_t, bus_dmamap_t, bus_addr_t,
     void *);
 static int alloc_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *,
     int, int);
 static int free_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *);
 static void add_fl_sysctls(struct adapter *, struct sysctl_ctx_list *,
     struct sysctl_oid *, struct sge_fl *);
 static int alloc_fwq(struct adapter *);
 static int free_fwq(struct adapter *);
 static int alloc_mgmtq(struct adapter *);
 static int free_mgmtq(struct adapter *);
 static int alloc_rxq(struct vi_info *, struct sge_rxq *, int, int,
     struct sysctl_oid *);
 static int free_rxq(struct vi_info *, struct sge_rxq *);
 #ifdef TCP_OFFLOAD
 static int alloc_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *, int, int,
     struct sysctl_oid *);
 static int free_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *);
 #endif
 #ifdef DEV_NETMAP
 static int alloc_nm_rxq(struct vi_info *, struct sge_nm_rxq *, int, int,
     struct sysctl_oid *);
 static int free_nm_rxq(struct vi_info *, struct sge_nm_rxq *);
 static int alloc_nm_txq(struct vi_info *, struct sge_nm_txq *, int, int,
     struct sysctl_oid *);
 static int free_nm_txq(struct vi_info *, struct sge_nm_txq *);
 #endif
 static int ctrl_eq_alloc(struct adapter *, struct sge_eq *);
 static int eth_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *);
 #ifdef TCP_OFFLOAD
 static int ofld_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *);
 #endif
 static int alloc_eq(struct adapter *, struct vi_info *, struct sge_eq *);
 static int free_eq(struct adapter *, struct sge_eq *);
 static int alloc_wrq(struct adapter *, struct vi_info *, struct sge_wrq *,
     struct sysctl_oid *);
 static int free_wrq(struct adapter *, struct sge_wrq *);
 static int alloc_txq(struct vi_info *, struct sge_txq *, int,
     struct sysctl_oid *);
 static int free_txq(struct vi_info *, struct sge_txq *);
 static void oneseg_dma_callback(void *, bus_dma_segment_t *, int, int);
 static inline void ring_fl_db(struct adapter *, struct sge_fl *);
 static int refill_fl(struct adapter *, struct sge_fl *, int);
 static void refill_sfl(void *);
 static int alloc_fl_sdesc(struct sge_fl *);
 static void free_fl_sdesc(struct adapter *, struct sge_fl *);
 static void find_best_refill_source(struct adapter *, struct sge_fl *, int);
 static void find_safe_refill_source(struct adapter *, struct sge_fl *);
 static void add_fl_to_sfl(struct adapter *, struct sge_fl *);
 
 static inline void get_pkt_gl(struct mbuf *, struct sglist *);
 static inline u_int txpkt_len16(u_int, u_int);
 static inline u_int txpkt_vm_len16(u_int, u_int);
 static inline u_int txpkts0_len16(u_int);
 static inline u_int txpkts1_len16(void);
 static u_int write_txpkt_wr(struct sge_txq *, struct fw_eth_tx_pkt_wr *,
     struct mbuf *, u_int);
 static u_int write_txpkt_vm_wr(struct adapter *, struct sge_txq *,
     struct fw_eth_tx_pkt_vm_wr *, struct mbuf *, u_int);
 static int try_txpkts(struct mbuf *, struct mbuf *, struct txpkts *, u_int);
 static int add_to_txpkts(struct mbuf *, struct txpkts *, u_int);
 static u_int write_txpkts_wr(struct sge_txq *, struct fw_eth_tx_pkts_wr *,
     struct mbuf *, const struct txpkts *, u_int);
 static void write_gl_to_txd(struct sge_txq *, struct mbuf *, caddr_t *, int);
 static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int);
 static inline void ring_eq_db(struct adapter *, struct sge_eq *, u_int);
 static inline uint16_t read_hw_cidx(struct sge_eq *);
 static inline u_int reclaimable_tx_desc(struct sge_eq *);
 static inline u_int total_available_tx_desc(struct sge_eq *);
 static u_int reclaim_tx_descs(struct sge_txq *, u_int);
 static void tx_reclaim(void *, int);
 static __be64 get_flit(struct sglist_seg *, int, int);
 static int handle_sge_egr_update(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 static int handle_fw_msg(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 static int t4_handle_wrerr_rpl(struct adapter *, const __be64 *);
 static void wrq_tx_drain(void *, int);
 static void drain_wrq_wr_list(struct adapter *, struct sge_wrq *);
 
 static int sysctl_uint16(SYSCTL_HANDLER_ARGS);
 static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS);
 static int sysctl_tc(SYSCTL_HANDLER_ARGS);
 
 /*
  * Counters allocated in t4_sge_modload() and freed in t4_sge_modunload();
  * t4_sge_extfree_refs() reports their difference (refs - rels).
  */
 static counter_u64_t extfree_refs;
 static counter_u64_t extfree_rels;
 
 /*
  * Handler dispatch slots.  t4_sge_modload() points every slot at the
  * matching *_not_handled() default; the t4_register_*_handler() routines
  * replace individual entries with atomic_store_rel_ptr().
  */
 an_handler_t t4_an_handler;
 fw_msg_handler_t t4_fw_msg_handler[NUM_FW6_TYPES];
 cpl_handler_t t4_cpl_handler[NUM_CPL_CMDS];
 
 
 /*
  * Default async-notification handler, installed when no real handler is
  * registered.  Panics on INVARIANTS kernels; otherwise logs the event and
  * returns EDOOFUS.
  */
 static int
 an_not_handled(struct sge_iq *iq, const struct rsp_ctrl *ctrl)
 {
 
 #ifdef INVARIANTS
 	panic("%s: async notification on iq %p (ctrl %p)", __func__, iq, ctrl);
 #else
 	log(LOG_ERR, "%s: async notification on iq %p (ctrl %p)\n",
 	    __func__, iq, ctrl);
 #endif
 	return (EDOOFUS);
 }
 
 /*
  * Install 'h' as the async-notification handler.  A NULL 'h' restores the
  * default an_not_handled().  Always returns 0.
  */
 int
 t4_register_an_handler(an_handler_t h)
 {
 	uintptr_t *slot = (uintptr_t *)&t4_an_handler;
 	uintptr_t handler;
 
 	if (h)
 		handler = (uintptr_t)h;
 	else
 		handler = (uintptr_t)an_not_handled;
 
 	/* Publish the new handler with release semantics. */
 	atomic_store_rel_ptr(slot, handler);
 
 	return (0);
 }
 
 /*
  * Default firmware-message handler.  'rpl' points at data[0] of a struct
  * cpl_fw6_msg; the enclosing message is recovered so its type can be
  * reported.  Panics on INVARIANTS kernels; otherwise logs the type and
  * returns EDOOFUS.
  */
 static int
 fw_msg_not_handled(struct adapter *sc, const __be64 *rpl)
 {
 	const struct cpl_fw6_msg *cpl =
 	    __containerof(rpl, struct cpl_fw6_msg, data[0]);
 
 #ifdef INVARIANTS
 	panic("%s: fw_msg type %d", __func__, cpl->type);
 #else
 	log(LOG_ERR, "%s: fw_msg type %d\n", __func__, cpl->type);
 #endif
 	return (EDOOFUS);
 }
 
 /*
  * Install 'h' as the handler for firmware messages of the given type.  A
  * NULL 'h' restores the default fw_msg_not_handled().
  *
  * Returns 0 on success, or EINVAL if 'type' is outside the handler table or
  * names one of the RSSCPL subtypes.
  */
 int
 t4_register_fw_msg_handler(int type, fw_msg_handler_t h)
 {
 	uintptr_t *slot;
 	uintptr_t handler;
 
 	if (type >= nitems(t4_fw_msg_handler))
 		return (EINVAL);
 
 	/*
 	 * The RSSCPL subtypes are dispatched by the FW{4|6}_CPL_MSG handler
 	 * through the CPL handler table, so no handler may be installed for
 	 * them here.
 	 */
 	if (type == FW_TYPE_RSSCPL || type == FW6_TYPE_RSSCPL)
 		return (EINVAL);
 
 	if (h)
 		handler = (uintptr_t)h;
 	else
 		handler = (uintptr_t)fw_msg_not_handled;
 
 	slot = (uintptr_t *)&t4_fw_msg_handler[type];
 	atomic_store_rel_ptr(slot, handler);
 
 	return (0);
 }
 
 /*
  * Default CPL handler for opcodes without a registered handler.  Panics on
  * INVARIANTS kernels; otherwise logs the opcode, frees the payload mbuf and
  * returns EDOOFUS.
  */
 static int
 cpl_not_handled(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 
 #ifdef INVARIANTS
 	panic("%s: opcode 0x%02x on iq %p with payload %p",
 	    __func__, rss->opcode, iq, m);
 #else
 	log(LOG_ERR, "%s: opcode 0x%02x on iq %p with payload %p\n",
 	    __func__, rss->opcode, iq, m);
 	m_freem(m);
 #endif
 	return (EDOOFUS);
 }
 
 /*
  * Install 'h' as the handler for the given CPL opcode.  A NULL 'h' restores
  * the default cpl_not_handled().  Returns 0 on success, or EINVAL if the
  * opcode is outside the handler table.
  */
 int
 t4_register_cpl_handler(int opcode, cpl_handler_t h)
 {
 	uintptr_t *slot;
 	uintptr_t handler;
 
 	if (opcode >= nitems(t4_cpl_handler))
 		return (EINVAL);
 
 	if (h)
 		handler = (uintptr_t)h;
 	else
 		handler = (uintptr_t)cpl_not_handled;
 
 	slot = (uintptr_t *)&t4_cpl_handler[opcode];
 	atomic_store_rel_ptr(slot, handler);
 
 	return (0);
 }
 
 /*
  * MOD_LOAD hook.  Sanitises the fl_pktshift, spg_len and cong_drop tunables,
  * allocates the extfree counters, and fills the handler tables with their
  * defaults before registering the handlers implemented in this file.
  */
 void
 t4_sge_modload(void)
 {
 	int idx;
 
 	if (fl_pktshift < 0 || fl_pktshift > 7) {
 		printf("Invalid hw.cxgbe.fl_pktshift value (%d),"
 		    " using 2 instead.\n", fl_pktshift);
 		fl_pktshift = 2;
 	}
 
 	if (spg_len != 64 && spg_len != 128) {
 		int fallback = 64;
 
 #if defined(__i386__) || defined(__amd64__)
 		if (cpu_clflush_line_size > 64)
 			fallback = 128;
 #endif
 		/* -1 means "pick for me"; only complain about other values. */
 		if (spg_len != -1) {
 			printf("Invalid hw.cxgbe.spg_len value (%d),"
 			    " using %d instead.\n", spg_len, fallback);
 		}
 		spg_len = fallback;
 	}
 
 	if (cong_drop < -1 || cong_drop > 1) {
 		printf("Invalid hw.cxgbe.cong_drop value (%d),"
 		    " using 0 instead.\n", cong_drop);
 		cong_drop = 0;
 	}
 
 	extfree_refs = counter_u64_alloc(M_WAITOK);
 	counter_u64_zero(extfree_refs);
 	extfree_rels = counter_u64_alloc(M_WAITOK);
 	counter_u64_zero(extfree_rels);
 
 	/* Every slot starts out at its "not handled" default. */
 	t4_an_handler = an_not_handled;
 	for (idx = 0; idx < nitems(t4_fw_msg_handler); idx++) {
 		t4_fw_msg_handler[idx] = fw_msg_not_handled;
 	}
 	for (idx = 0; idx < nitems(t4_cpl_handler); idx++) {
 		t4_cpl_handler[idx] = cpl_not_handled;
 	}
 
 	/* Handlers provided by this file. */
 	t4_register_cpl_handler(CPL_FW4_MSG, handle_fw_msg);
 	t4_register_cpl_handler(CPL_FW6_MSG, handle_fw_msg);
 	t4_register_cpl_handler(CPL_SGE_EGR_UPDATE, handle_sge_egr_update);
 	t4_register_cpl_handler(CPL_RX_PKT, t4_eth_rx);
 	t4_register_fw_msg_handler(FW6_TYPE_CMD_RPL, t4_handle_fw_rpl);
 	t4_register_fw_msg_handler(FW6_TYPE_WRERR_RPL, t4_handle_wrerr_rpl);
 }
 
 /*
  * Counterpart of t4_sge_modload(): releases the two extfree counters that
  * were allocated there.
  */
 void
 t4_sge_modunload(void)
 {
 
 	counter_u64_free(extfree_refs);
 	counter_u64_free(extfree_rels);
 }
 
 /*
  * Return the difference between the extfree_refs and extfree_rels counters.
  * The release counter is sampled before the reference counter.
  */
 uint64_t
 t4_sge_extfree_refs(void)
 {
 	const uint64_t released = counter_u64_fetch(extfree_rels);
 	const uint64_t referenced = counter_u64_fetch(extfree_refs);
 
 	return (referenced - released);
 }
 
 /*
  * Program the ingress pad boundary in SGE_CONTROL from the fl_pad tunable
  * and, on chips other than T4, the ingress pack boundary in SGE_CONTROL2
  * from the fl_pack tunable.  Out-of-range tunable values are replaced with
  * a driver-chosen value (with a console message where the user explicitly
  * asked for something invalid).
  */
 static inline void
 setup_pad_and_pack_boundaries(struct adapter *sc)
 {
 	uint32_t v, m;
 	int pad, pack, pad_shift;
 
 	/* Chips newer than T5 use a different base shift for the pad field. */
 	pad_shift = chip_id(sc) > CHELSIO_T5 ? X_T6_INGPADBOUNDARY_SHIFT :
 	    X_INGPADBOUNDARY_SHIFT;
 	pad = fl_pad;
 	if (fl_pad < (1 << pad_shift) ||
 	    fl_pad > (1 << (pad_shift + M_INGPADBOUNDARY)) ||
 	    !powerof2(fl_pad)) {
 		/*
 		 * If there is any chance that we might use buffer packing and
 		 * the chip is a T4, then pick 64 as the pad/pack boundary.  Set
 		 * it to the minimum allowed in all other cases.
 		 */
 		pad = is_t4(sc) && buffer_packing ? 64 : 1 << pad_shift;
 
 		/*
 		 * For fl_pad = 0 we'll still write a reasonable value to the
 		 * register but all the freelists will opt out of padding.
 		 * We'll complain here only if the user tried to set it to a
 		 * value greater than 0 that was invalid.
 		 */
 		if (fl_pad > 0) {
 			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pad value"
 			    " (%d), using %d instead.\n", fl_pad, pad);
 		}
 	}
 	/* The register field holds log2(pad) relative to pad_shift. */
 	m = V_INGPADBOUNDARY(M_INGPADBOUNDARY);
 	v = V_INGPADBOUNDARY(ilog2(pad) - pad_shift);
 	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);
 
 	/* T4 has no separate pack boundary; nothing more to program. */
 	if (is_t4(sc)) {
 		if (fl_pack != -1 && fl_pack != pad) {
 			/* Complain but carry on. */
 			device_printf(sc->dev, "hw.cxgbe.fl_pack (%d) ignored,"
 			    " using %d instead.\n", fl_pack, pad);
 		}
 		return;
 	}
 
 	pack = fl_pack;
 	if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 ||
 	    !powerof2(fl_pack)) {
 		/*
 		 * Start from the larger of the PCI MPS value and the cache
 		 * line size, then force the result into the accepted set
 		 * (16, or 64..4096).
 		 */
 		pack = max(sc->params.pci.mps, CACHE_LINE_SIZE);
 		MPASS(powerof2(pack));
 		if (pack < 16)
 			pack = 16;
 		if (pack == 32)
 			pack = 64;
 		if (pack > 4096)
 			pack = 4096;
 		if (fl_pack != -1) {
 			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pack value"
 			    " (%d), using %d instead.\n", fl_pack, pack);
 		}
 	}
 	/* 16 is encoded as 0; every other size as log2(pack) - 5. */
 	m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY);
 	if (pack == 16)
 		v = V_INGPACKBOUNDARY(0);
 	else
 		v = V_INGPACKBOUNDARY(ilog2(pack) - 5);
 
 	MPASS(!is_t4(sc));	/* T4 doesn't have SGE_CONTROL2 */
 	t4_set_reg_field(sc, A_SGE_CONTROL2, m, v);
 }
 
 /*
  * Write the driver's preferred SGE, ULP_RX and TP settings to the chip:
  * packet shift and status page size, pad/pack boundaries, host page size,
  * freelist buffer sizes, interrupt holdoff packet counts and timers, DDP
  * page sizes, and the indicate size.  Asserts that the caller is the
  * master PF.
  *
  * adap->params.vpd.cclk must be set up before this is called.
  */
 void
 t4_tweak_chip_settings(struct adapter *sc)
 {
 	int i;
 	uint32_t v, m;
 	int intr_timer[SGE_NTIMERS] = {1, 5, 10, 50, 100, 200};
 	/* Largest holdoff timer value that can be programmed, from cclk. */
 	int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk;
 	int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */
 	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
 	/* Values written, in order, to the SGE_FL_BUFFER_SIZEn registers. */
 	static int sge_flbuf_sizes[] = {
 		MCLBYTES,
 #if MJUMPAGESIZE != MCLBYTES
 		MJUMPAGESIZE,
 		MJUMPAGESIZE - CL_METADATA_SIZE,
 		MJUMPAGESIZE - 2 * MSIZE - CL_METADATA_SIZE,
 #endif
 		MJUM9BYTES,
 		MJUM16BYTES,
 		MCLBYTES - MSIZE - CL_METADATA_SIZE,
 		MJUM9BYTES - CL_METADATA_SIZE,
 		MJUM16BYTES - CL_METADATA_SIZE,
 	};
 
 	KASSERT(sc->flags & MASTER_PF,
 	    ("%s: trying to change chip settings when not master.", __func__));
 
 	/* Packet shift, rx CPL mode, and egress status page size. */
 	m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE;
 	v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE |
 	    V_EGRSTATUSPAGESIZE(spg_len == 128);
 	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);
 
 	setup_pad_and_pack_boundaries(sc);
 
 	/* Same host page size (PAGE_SHIFT - 10) for all eight PFs. */
 	v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF3(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF4(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF5(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF6(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10);
 	t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v);
 
 	KASSERT(nitems(sge_flbuf_sizes) <= SGE_FLBUF_SIZES,
 	    ("%s: hw buffer size table too big", __func__));
 	for (i = 0; i < min(nitems(sge_flbuf_sizes), SGE_FLBUF_SIZES); i++) {
 		t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i),
 		    sge_flbuf_sizes[i]);
 	}
 
 	/* Interrupt holdoff packet-count thresholds. */
 	v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) |
 	    V_THRESHOLD_2(intr_pktcount[2]) | V_THRESHOLD_3(intr_pktcount[3]);
 	t4_write_reg(sc, A_SGE_INGRESS_RX_THRESHOLD, v);
 
 	KASSERT(intr_timer[0] <= timer_max,
 	    ("%s: not a single usable timer (%d, %d)", __func__, intr_timer[0],
 	    timer_max));
 	/*
 	 * Pull any timer that exceeds timer_max back into range: the last
 	 * entry is clamped to timer_max, earlier entries are repeatedly
 	 * averaged with their predecessor until they fit.
 	 */
 	for (i = 1; i < nitems(intr_timer); i++) {
 		KASSERT(intr_timer[i] >= intr_timer[i - 1],
 		    ("%s: timers not listed in increasing order (%d)",
 		    __func__, i));
 
 		while (intr_timer[i] > timer_max) {
 			if (i == nitems(intr_timer) - 1) {
 				intr_timer[i] = timer_max;
 				break;
 			}
 			intr_timer[i] += intr_timer[i - 1];
 			intr_timer[i] /= 2;
 		}
 	}
 
 	/* Timers are given in microseconds and written as core ticks. */
 	v = V_TIMERVALUE0(us_to_core_ticks(sc, intr_timer[0])) |
 	    V_TIMERVALUE1(us_to_core_ticks(sc, intr_timer[1]));
 	t4_write_reg(sc, A_SGE_TIMER_VALUE_0_AND_1, v);
 	v = V_TIMERVALUE2(us_to_core_ticks(sc, intr_timer[2])) |
 	    V_TIMERVALUE3(us_to_core_ticks(sc, intr_timer[3]));
 	t4_write_reg(sc, A_SGE_TIMER_VALUE_2_AND_3, v);
 	v = V_TIMERVALUE4(us_to_core_ticks(sc, intr_timer[4])) |
 	    V_TIMERVALUE5(us_to_core_ticks(sc, intr_timer[5]));
 	t4_write_reg(sc, A_SGE_TIMER_VALUE_4_AND_5, v);
 
 	/* 4K, 16K, 64K, 256K DDP "page sizes" for TDDP */
 	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
 	t4_write_reg(sc, A_ULP_RX_TDDP_PSZ, v);
 
 	/*
 	 * 4K, 8K, 16K, 64K DDP "page sizes" for iSCSI DDP.  These have been
 	 * chosen with MAXPHYS = 128K in mind.  The largest DDP buffer that we
 	 * may have to deal with is MAXPHYS + 1 page.
 	 */
 	v = V_HPZ0(0) | V_HPZ1(1) | V_HPZ2(2) | V_HPZ3(4);
 	t4_write_reg(sc, A_ULP_RX_ISCSI_PSZ, v);
 
 	/* We use multiple DDP page sizes both in plain-TOE and ISCSI modes. */
 	m = v = F_TDDPTAGTCB | F_ISCSITAGTCB;
 	t4_set_reg_field(sc, A_ULP_RX_CTL, m, v);
 
 	/* Indicate size is capped just below RX_COPY_THRESHOLD (see indsz). */
 	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
 	    F_RESETDDPOFFSET;
 	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
 	t4_set_reg_field(sc, A_TP_PARA_REG5, m, v);
 }
 
 /*
  * Check whether a hardware buffer size is usable.  The SGE requires at
  * least 64 bytes and a multiple of 16; when padding is in use (fl_pad is
  * non-zero) the size must be a multiple of the pad boundary instead.  Only
  * the size is checked here -- aligning the start of the buffer is left to
  * the buffer allocation code.
  *
  * Returns non-zero if hwsz is acceptable, 0 otherwise.
  */
 static inline int
 hwsz_ok(struct adapter *sc, int hwsz)
 {
 	int align_mask;
 
 	if (fl_pad)
 		align_mask = sc->params.sge.pad_boundary - 1;
 	else
 		align_mask = 16 - 1;
 
 	if (hwsz < 64)
 		return (0);
 
 	return ((hwsz & align_mask) == 0);
 }
 
 /*
  * Verify the chip settings that this driver relies on and build the tables
  * that map software cluster zones to usable hardware freelist buffer sizes
  * (sc->sge.hw_buf_info[] and sc->sge.sw_zone_info[]).
  *
  * Every mismatch is reported with device_printf and turns the return value
  * into EINVAL, but checking continues so that all problems are reported.
  * On a VF the function returns 0 after the buffer tables have been built and
  * skips the ULP/TP register checks and the MTU table load.
  *
  * XXX: driver really should be able to deal with unexpected settings.
  */
 int
 t4_read_chip_settings(struct adapter *sc)
 {
 	struct sge *s = &sc->sge;
 	struct sge_params *sp = &sc->params.sge;
 	int i, j, n, rc = 0;
 	uint32_t m, v, r;
 	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
 	static int sw_buf_sizes[] = {	/* Sorted by size */
 		MCLBYTES,
 #if MJUMPAGESIZE != MCLBYTES
 		MJUMPAGESIZE,
 #endif
 		MJUM9BYTES,
 		MJUM16BYTES
 	};
 	struct sw_zone_info *swz, *safe_swz;
 	struct hw_buf_info *hwb;
 
 	m = F_RXPKTCPLMODE;
 	v = F_RXPKTCPLMODE;
 	r = sc->params.sge.sge_control;
 	if ((r & m) != v) {
 		device_printf(sc->dev, "invalid SGE_CONTROL(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	/*
 	 * If this changes then every single use of PAGE_SHIFT in the driver
 	 * needs to be carefully reviewed for PAGE_SHIFT vs sp->page_shift.
 	 *
 	 * Report the page shift that was actually compared.  'r' still holds
 	 * the SGE_CONTROL value at this point and must not be printed here.
 	 */
 	if (sp->page_shift != PAGE_SHIFT) {
 		device_printf(sc->dev,
 		    "invalid SGE_HOST_PAGE_SIZE (page_shift %d, expected %d)\n",
 		    (int)sp->page_shift, (int)PAGE_SHIFT);
 		rc = EINVAL;
 	}
 
 	/* Filter out unusable hw buffer sizes entirely (mark with -2). */
 	hwb = &s->hw_buf_info[0];
 	for (i = 0; i < nitems(s->hw_buf_info); i++, hwb++) {
 		r = sc->params.sge.sge_fl_buffer_size[i];
 		hwb->size = r;
 		hwb->zidx = hwsz_ok(sc, r) ? -1 : -2;
 		hwb->next = -1;
 	}
 
 	/*
 	 * Create a sorted list in decreasing order of hw buffer sizes (and so
 	 * increasing order of spare area) for each software zone.
 	 *
 	 * If padding is enabled then the start and end of the buffer must align
 	 * to the pad boundary; if packing is enabled then they must align with
 	 * the pack boundary as well.  Allocations from the cluster zones are
 	 * aligned to min(size, 4K), so the buffer starts at that alignment and
 	 * ends at hwb->size alignment.  If mbuf inlining is allowed the
 	 * starting alignment will be reduced to MSIZE and the driver will
 	 * exercise appropriate caution when deciding on the best buffer layout
 	 * to use.
 	 */
 	n = 0;	/* no usable buffer size to begin with */
 	swz = &s->sw_zone_info[0];
 	safe_swz = NULL;
 	for (i = 0; i < SW_ZONE_SIZES; i++, swz++) {
 		int8_t head = -1, tail = -1;
 
 		swz->size = sw_buf_sizes[i];
 		swz->zone = m_getzone(swz->size);
 		swz->type = m_gettype(swz->size);
 
 		if (swz->size < PAGE_SIZE) {
 			MPASS(powerof2(swz->size));
 			if (fl_pad && (swz->size % sp->pad_boundary != 0))
 				continue;
 		}
 
 		if (swz->size == safest_rx_cluster)
 			safe_swz = swz;
 
 		/*
 		 * Claim every still-unassigned hw buffer size that fits in
 		 * this zone and link it into the zone's list.
 		 */
 		hwb = &s->hw_buf_info[0];
 		for (j = 0; j < SGE_FLBUF_SIZES; j++, hwb++) {
 			if (hwb->zidx != -1 || hwb->size > swz->size)
 				continue;
 #ifdef INVARIANTS
 			if (fl_pad)
 				MPASS(hwb->size % sp->pad_boundary == 0);
 #endif
 			hwb->zidx = i;
 			if (head == -1)
 				head = tail = j;
 			else if (hwb->size < s->hw_buf_info[tail].size) {
 				/* Smaller than everything so far: append. */
 				s->hw_buf_info[tail].next = j;
 				tail = j;
 			} else {
 				int8_t *cur;
 				struct hw_buf_info *t;
 
 				/*
 				 * Walk the list for the insertion point.  A
 				 * size that is already on the list is a
 				 * duplicate and is marked unusable (-2).
 				 */
 				for (cur = &head; *cur != -1; cur = &t->next) {
 					t = &s->hw_buf_info[*cur];
 					if (hwb->size == t->size) {
 						hwb->zidx = -2;
 						break;
 					}
 					if (hwb->size > t->size) {
 						hwb->next = *cur;
 						*cur = j;
 						break;
 					}
 				}
 			}
 		}
 		swz->head_hwidx = head;
 		swz->tail_hwidx = tail;
 
 		if (tail != -1) {
 			n++;
 			/*
 			 * The smallest hw buffer of this zone leaves enough
 			 * spare room for the cluster metadata.
 			 */
 			if (swz->size - s->hw_buf_info[tail].size >=
 			    CL_METADATA_SIZE)
 				sc->flags |= BUF_PACKING_OK;
 		}
 	}
 	if (n == 0) {
 		device_printf(sc->dev, "no usable SGE FL buffer size.\n");
 		rc = EINVAL;
 	}
 
 	/*
 	 * safe_hwidx1 is the largest hw buffer of the "safest" zone;
 	 * safe_hwidx2 is the largest one that still leaves room for the
 	 * cluster metadata.  Both stay at -1 if no such zone/buffer exists.
 	 */
 	s->safe_hwidx1 = -1;
 	s->safe_hwidx2 = -1;
 	if (safe_swz != NULL) {
 		s->safe_hwidx1 = safe_swz->head_hwidx;
 		for (i = safe_swz->head_hwidx; i != -1; i = hwb->next) {
 			int spare;
 
 			hwb = &s->hw_buf_info[i];
 #ifdef INVARIANTS
 			if (fl_pad)
 				MPASS(hwb->size % sp->pad_boundary == 0);
 #endif
 			spare = safe_swz->size - hwb->size;
 			if (spare >= CL_METADATA_SIZE) {
 				s->safe_hwidx2 = i;
 				break;
 			}
 		}
 	}
 
 	/*
 	 * The remaining checks are not performed on a VF.
 	 * NOTE(review): this returns 0 even if rc was set to EINVAL by one of
 	 * the checks above -- confirm that ignoring those on a VF is intended.
 	 */
 	if (sc->flags & IS_VF)
 		return (0);
 
 	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
 	r = t4_read_reg(sc, A_ULP_RX_TDDP_PSZ);
 	if (r != v) {
 		device_printf(sc->dev, "invalid ULP_RX_TDDP_PSZ(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	m = v = F_TDDPTAGTCB;
 	r = t4_read_reg(sc, A_ULP_RX_CTL);
 	if ((r & m) != v) {
 		device_printf(sc->dev, "invalid ULP_RX_CTL(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
 	    F_RESETDDPOFFSET;
 	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
 	r = t4_read_reg(sc, A_TP_PARA_REG5);
 	if ((r & m) != v) {
 		device_printf(sc->dev, "invalid TP_PARA_REG5(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	t4_init_tp_params(sc);
 
 	t4_read_mtu_tbl(sc, sc->params.mtus, NULL);
 	t4_load_mtus(sc, sc->params.mtus, sc->params.a_wnd, sc->params.b_wnd);
 
 	return (rc);
 }
 
 int
 t4_create_dma_tag(struct adapter *sc)
 {
 	int rc;
 
 	rc = bus_dma_tag_create(bus_get_dma_tag(sc->dev), 1, 0,
 	    BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, BUS_SPACE_MAXSIZE,
 	    BUS_SPACE_UNRESTRICTED, BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL,
 	    NULL, &sc->dmat);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to create main DMA tag: %d\n", rc);
 	}
 
 	return (rc);
 }
 
 /*
  * Add the read-only SGE sysctls to 'children', using sysctl context 'ctx':
  * the freelist buffer size list (a string produced by sysctl_bufsizes) and
  * integer snapshots of the SGE parameters and the cong_drop setting.  The
  * integer values are passed by value at registration time.
  */
 void
 t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
     struct sysctl_oid_list *children)
 {
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "buffer_sizes",
 	    CTLTYPE_STRING | CTLFLAG_RD, &sc->sge, 0, sysctl_bufsizes, "A",
 	    "freelist buffer sizes");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pktshift", CTLFLAG_RD,
 	    NULL, sc->params.sge.fl_pktshift,
 	    "payload DMA offset in rx buffer (bytes)");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pad", CTLFLAG_RD,
 	    NULL, sc->params.sge.pad_boundary, "payload pad boundary (bytes)");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "spg_len", CTLFLAG_RD,
 	    NULL, sc->params.sge.spg_len, "status page size (bytes)");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_drop", CTLFLAG_RD,
 	    NULL, cong_drop, "congestion drop setting");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD,
 	    NULL, sc->params.sge.pack_boundary,
 	    "payload pack boundary (bytes)");
 }
 
 int
 t4_destroy_dma_tag(struct adapter *sc)
 {
 	if (sc->dmat)
 		bus_dma_tag_destroy(sc->dmat);
 
 	return (0);
 }
 
 /*
  * Allocate and initialize the firmware event queue and the management queue.
  *
  * Returns errno on failure.  Resources allocated up to that point may still be
  * allocated.  Caller is responsible for cleanup in case this function fails.
  */
 int
 t4_setup_adapter_queues(struct adapter *sc)
 {
 	int rc;
 
 	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
 
 	sysctl_ctx_init(&sc->ctx);
 	sc->flags |= ADAP_SYSCTL_CTX;
 
 	/*
 	 * Firmware event queue
 	 */
 	rc = alloc_fwq(sc);
 	if (rc != 0)
 		return (rc);
 
 	/*
 	 * Management queue.  This is just a control queue that uses the fwq as
 	 * its associated iq.
 	 */
 	if (!(sc->flags & IS_VF))
 		rc = alloc_mgmtq(sc);
 
 	return (rc);
 }
 
 /*
  * Release the adapter-wide queues (management queue, then firmware event
  * queue) and the adapter's sysctl context.  Idempotent.  Always returns 0.
  */
 int
 t4_teardown_adapter_queues(struct adapter *sc)
 {
 
 	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
 
 	/* The sysctl context has to go before the queues are freed. */
 	if ((sc->flags & ADAP_SYSCTL_CTX) != 0) {
 		sysctl_ctx_free(&sc->ctx);
 		sc->flags &= ~ADAP_SYSCTL_CTX;
 	}
 
 	free_mgmtq(sc);
 	free_fwq(sc);
 
 	return (0);
 }
 
 /*
  * Interrupt vector that a VI's queues start from: 0 if the adapter has just
  * one interrupt, the VI's first_intr otherwise.
  */
 static inline int
 first_vector(struct vi_info *vi)
 {
 	struct adapter *adap = vi->pi->adapter;
 
 	return (adap->intr_count == 1 ? 0 : vi->first_intr);
 }
 
 /*
  * Map an arbitrary "index" to an iq of this VI that other queues can use for
  * interrupt forwarding, SGE egress updates, etc.  The iq returned is one that
  * takes direct interrupts (asserted via IQ_INTR).
  *
  * With a single adapter interrupt the answer is always the firmware event
  * queue.  Otherwise idx is reduced modulo the VI's interrupt count and looked
  * up first among the NIC rx queues (if INTR_RXQ) and then among the offload rx
  * queues (if INTR_OFLD_RXQ, TCP_OFFLOAD kernels only).  Panics if neither group
  * covers the index.
  */
 static struct sge_iq *
 vi_intr_iq(struct vi_info *vi, int idx)
 {
 	struct adapter *sc = vi->pi->adapter;
 	struct sge_iq *iq = NULL;
 	int nintr, slot;
 
 	if (sc->intr_count == 1)
 		return (&sc->sge.fwq);
 
 	nintr = vi->nintr;
 	KASSERT(nintr != 0,
 	    ("%s: vi %p has no exclusive interrupts, total interrupts = %d",
 	    __func__, vi, sc->intr_count));
 	slot = idx % nintr;
 
 	if (vi->flags & INTR_RXQ) {
 		if (slot < vi->nrxq)
 			iq = &sc->sge.rxq[vi->first_rxq + slot].iq;
 		else
 			slot -= vi->nrxq;
 	}
 #ifdef TCP_OFFLOAD
 	if (iq == NULL && (vi->flags & INTR_OFLD_RXQ)) {
 		if (slot < vi->nofldrxq)
 			iq = &sc->sge.ofld_rxq[vi->first_ofld_rxq + slot].iq;
 		else
 			slot -= vi->nofldrxq;
 	}
 #endif
 	if (iq == NULL) {
 		panic("%s: vi %p, intr_flags 0x%lx, idx %d, total intr %d\n",
 		    __func__, vi, vi->flags & INTR_ALL, idx, nintr);
 	}
 
 	MPASS(iq != NULL);
 	KASSERT(iq->flags & IQ_INTR,
 	    ("%s: iq %p (vi %p, intr_flags 0x%lx, idx %d)", __func__, iq, vi,
 	    vi->flags & INTR_ALL, idx));
 	return (iq);
 }
 
 /*
  * Maximum payload that can be delivered with a single iq descriptor.
  *
  * For a TOE queue (toe != 0, honored only in TCP_OFFLOAD kernels) this is the
  * rx coalesce size read from TP_PARA_REG2 when sc->tt.rx_coalesce is set, and
  * the MTU otherwise.  For a NIC queue it is the MTU plus the Ethernet header,
  * a VLAN encapsulation, and the configured fl_pktshift.
  */
 static inline int
 mtu_to_max_payload(struct adapter *sc, int mtu, const int toe)
 {
 
 #ifdef TCP_OFFLOAD
 	if (toe) {
 		if (!sc->tt.rx_coalesce)
 			return (mtu);
 		return (G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2)));
 	}
 #endif
 
 	/* large enough even when hw VLAN extraction is disabled */
 	return (sc->params.sge.fl_pktshift + ETHER_HDR_LEN +
 	    ETHER_VLAN_ENCAP_LEN + mtu);
 }
 
 /*
  * Allocate all the queues of a VI: netmap rx/tx queues (only if the ifnet has
  * IFCAP_NETMAP, DEV_NETMAP kernels), NIC rx queues, offload rx queues
  * (TCP_OFFLOAD kernels), NIC and offload tx queues, and finally the port's
  * control queue (main VI only, and not on a VF).  A sysctl node is added under
  * the VI's device sysctl tree for each group of queues.
  *
  * Returns 0 on success.  If any allocation fails, t4_teardown_vi_queues() is
  * called and the error from the failing routine is returned.
  */
 int
 t4_setup_vi_queues(struct vi_info *vi)
 {
 	int rc = 0, i, j, intr_idx, iqid;
 	struct sge_rxq *rxq;
 	struct sge_txq *txq;
 	struct sge_wrq *ctrlq;
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 	struct sge_wrq *ofld_txq;
 #endif
 #ifdef DEV_NETMAP
 	int saved_idx;
 	struct sge_nm_rxq *nm_rxq;
 	struct sge_nm_txq *nm_txq;
 #endif
 	char name[16];
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct ifnet *ifp = vi->ifp;
 	struct sysctl_oid *oid = device_get_sysctl_tree(vi->dev);
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 	int maxp, mtu = ifp->if_mtu;
 
 	/* Interrupt vector to start from (when using multiple vectors) */
 	intr_idx = first_vector(vi);
 
 #ifdef DEV_NETMAP
 	saved_idx = intr_idx;
 	if (ifp->if_capabilities & IFCAP_NETMAP) {
 
 		/* netmap is supported with direct interrupts only. */
 		MPASS(vi->flags & INTR_RXQ);
 
 		/*
 		 * We don't have buffers to back the netmap rx queues
 		 * right now so we create the queues in a way that
 		 * doesn't set off any congestion signal in the chip.
 		 */
 		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "nm_rxq",
 		    CTLFLAG_RD, NULL, "rx queues");
 		for_each_nm_rxq(vi, i, nm_rxq) {
 			rc = alloc_nm_rxq(vi, nm_rxq, intr_idx, i, oid);
 			if (rc != 0)
 				goto done;
 			intr_idx++;
 		}
 
 		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "nm_txq",
 		    CTLFLAG_RD, NULL, "tx queues");
 		for_each_nm_txq(vi, i, nm_txq) {
 			/* Spread the netmap tx queues over the netmap rx queues. */
 			iqid = vi->first_nm_rxq + (i % vi->nnmrxq);
 			rc = alloc_nm_txq(vi, nm_txq, iqid, i, oid);
 			if (rc != 0)
 				goto done;
 		}
 	}
 
 	/* Normal rx queues and netmap rx queues share the same interrupts. */
 	intr_idx = saved_idx;
 #endif
 
 	/*
 	 * First pass over all NIC and TOE rx queues:
 	 * a) initialize iq and fl
 	 * b) allocate queue iff it will take direct interrupts.
 	 */
 	maxp = mtu_to_max_payload(sc, mtu, 0);
 	if (vi->flags & INTR_RXQ) {
 		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "rxq",
 		    CTLFLAG_RD, NULL, "rx queues");
 	}
 	for_each_rxq(vi, i, rxq) {
 
 		init_iq(&rxq->iq, sc, vi->tmr_idx, vi->pktc_idx, vi->qsize_rxq);
 
 		snprintf(name, sizeof(name), "%s rxq%d-fl",
 		    device_get_nameunit(vi->dev), i);
 		init_fl(sc, &rxq->fl, vi->qsize_rxq / 8, maxp, name);
 
 		if (vi->flags & INTR_RXQ) {
 			rxq->iq.flags |= IQ_INTR;
 			rc = alloc_rxq(vi, rxq, intr_idx, i, oid);
 			if (rc != 0)
 				goto done;
 			intr_idx++;
 		}
 	}
 #ifdef DEV_NETMAP
 	/* Skip past the vectors shared by the NIC and netmap rx queues. */
 	if (ifp->if_capabilities & IFCAP_NETMAP)
 		intr_idx = saved_idx + max(vi->nrxq, vi->nnmrxq);
 #endif
 #ifdef TCP_OFFLOAD
 	maxp = mtu_to_max_payload(sc, mtu, 1);
 	if (vi->flags & INTR_OFLD_RXQ) {
 		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_rxq",
 		    CTLFLAG_RD, NULL,
 		    "rx queues for offloaded TCP connections");
 	}
 	for_each_ofld_rxq(vi, i, ofld_rxq) {
 
 		init_iq(&ofld_rxq->iq, sc, vi->tmr_idx, vi->pktc_idx,
 		    vi->qsize_rxq);
 
 		snprintf(name, sizeof(name), "%s ofld_rxq%d-fl",
 		    device_get_nameunit(vi->dev), i);
 		init_fl(sc, &ofld_rxq->fl, vi->qsize_rxq / 8, maxp, name);
 
 		if (vi->flags & INTR_OFLD_RXQ) {
 			ofld_rxq->iq.flags |= IQ_INTR;
 			rc = alloc_ofld_rxq(vi, ofld_rxq, intr_idx, i, oid);
 			if (rc != 0)
 				goto done;
 			intr_idx++;
 		}
 	}
 #endif
 
 	/*
 	 * Second pass over all NIC and TOE rx queues.  The queues forwarding
 	 * their interrupts are allocated now.
 	 */
 	j = 0;	/* running index handed to vi_intr_iq() */
 	if (!(vi->flags & INTR_RXQ)) {
 		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "rxq",
 		    CTLFLAG_RD, NULL, "rx queues");
 		for_each_rxq(vi, i, rxq) {
 			MPASS(!(rxq->iq.flags & IQ_INTR));
 
 			/* Here intr_idx carries the abs_id of the target iq. */
 			intr_idx = vi_intr_iq(vi, j)->abs_id;
 
 			rc = alloc_rxq(vi, rxq, intr_idx, i, oid);
 			if (rc != 0)
 				goto done;
 			j++;
 		}
 	}
 #ifdef TCP_OFFLOAD
 	if (vi->nofldrxq != 0 && !(vi->flags & INTR_OFLD_RXQ)) {
 		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_rxq",
 		    CTLFLAG_RD, NULL,
 		    "rx queues for offloaded TCP connections");
 		for_each_ofld_rxq(vi, i, ofld_rxq) {
 			MPASS(!(ofld_rxq->iq.flags & IQ_INTR));
 
 			intr_idx = vi_intr_iq(vi, j)->abs_id;
 
 			rc = alloc_ofld_rxq(vi, ofld_rxq, intr_idx, i, oid);
 			if (rc != 0)
 				goto done;
 			j++;
 		}
 	}
 #endif
 
 	/*
 	 * Now the tx queues.  Only one pass needed.
 	 */
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "txq", CTLFLAG_RD,
 	    NULL, "tx queues");
 	j = 0;
 	for_each_txq(vi, i, txq) {
 		/* Each tx queue is tied to the iq picked by vi_intr_iq(). */
 		iqid = vi_intr_iq(vi, j)->cntxt_id;
 		snprintf(name, sizeof(name), "%s txq%d",
 		    device_get_nameunit(vi->dev), i);
 		init_eq(sc, &txq->eq, EQ_ETH, vi->qsize_txq, pi->tx_chan, iqid,
 		    name);
 
 		rc = alloc_txq(vi, txq, i, oid);
 		if (rc != 0)
 			goto done;
 		j++;
 	}
 #ifdef TCP_OFFLOAD
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_txq",
 	    CTLFLAG_RD, NULL, "tx queues for offloaded TCP connections");
 	for_each_ofld_txq(vi, i, ofld_txq) {
 		struct sysctl_oid *oid2;
 
 		/* j continues from where the NIC tx queues left off. */
 		iqid = vi_intr_iq(vi, j)->cntxt_id;
 		snprintf(name, sizeof(name), "%s ofld_txq%d",
 		    device_get_nameunit(vi->dev), i);
 		init_eq(sc, &ofld_txq->eq, EQ_OFLD, vi->qsize_txq, pi->tx_chan,
 		    iqid, name);
 
 		/* Per-queue sysctl node, named after the queue index. */
 		snprintf(name, sizeof(name), "%d", i);
 		oid2 = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 		    name, CTLFLAG_RD, NULL, "offload tx queue");
 
 		rc = alloc_wrq(sc, vi, ofld_txq, oid2);
 		if (rc != 0)
 			goto done;
 		j++;
 	}
 #endif
 
 	/*
 	 * Finally, the control queue.
 	 */
 	if (!IS_MAIN_VI(vi) || sc->flags & IS_VF)
 		goto done;
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ctrlq", CTLFLAG_RD,
 	    NULL, "ctrl queue");
 	ctrlq = &sc->sge.ctrlq[pi->port_id];
 	iqid = vi_intr_iq(vi, 0)->cntxt_id;
 	snprintf(name, sizeof(name), "%s ctrlq", device_get_nameunit(vi->dev));
 	init_eq(sc, &ctrlq->eq, EQ_CTRL, CTRL_EQ_QSIZE, pi->tx_chan, iqid,
 	    name);
 	rc = alloc_wrq(sc, vi, ctrlq, oid);
 
 done:
 	/* Undo everything on any failure. */
 	if (rc)
 		t4_teardown_vi_queues(vi);
 
 	return (rc);
 }
 
 /*
  * Free all the queues of a VI along with the VI's sysctl context.  Queues are
  * released in dependency order: netmap queues, then tx queues, then rx queues
  * that forward their interrupts, then rx queues that take direct interrupts.
  * Always returns 0.
  *
  * Idempotent
  */
 int
 t4_teardown_vi_queues(struct vi_info *vi)
 {
 	int i;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct sge_rxq *rxq;
 	struct sge_txq *txq;
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 	struct sge_wrq *ofld_txq;
 #endif
 #ifdef DEV_NETMAP
 	struct sge_nm_rxq *nm_rxq;
 	struct sge_nm_txq *nm_txq;
 #endif
 
 	/* Do this before freeing the queues */
 	if (vi->flags & VI_SYSCTL_CTX) {
 		sysctl_ctx_free(&vi->ctx);
 		vi->flags &= ~VI_SYSCTL_CTX;
 	}
 
 #ifdef DEV_NETMAP
 	/* netmap tx queues go before the netmap rx queues. */
 	if (vi->ifp->if_capabilities & IFCAP_NETMAP) {
 		for_each_nm_txq(vi, i, nm_txq) {
 			free_nm_txq(vi, nm_txq);
 		}
 
 		for_each_nm_rxq(vi, i, nm_rxq) {
 			free_nm_rxq(vi, nm_rxq);
 		}
 	}
 #endif
 
 	/*
 	 * Take down all the tx queues first, as they reference the rx queues
 	 * (for egress updates, etc.).
 	 */
 
 	/* The control queue exists only for the main VI, and never on a VF. */
 	if (IS_MAIN_VI(vi) && !(sc->flags & IS_VF))
 		free_wrq(sc, &sc->sge.ctrlq[pi->port_id]);
 
 	for_each_txq(vi, i, txq) {
 		free_txq(vi, txq);
 	}
 #ifdef TCP_OFFLOAD
 	for_each_ofld_txq(vi, i, ofld_txq) {
 		free_wrq(sc, ofld_txq);
 	}
 #endif
 
 	/*
 	 * Then take down the rx queues that forward their interrupts, as they
 	 * reference other rx queues.
 	 */
 
 	for_each_rxq(vi, i, rxq) {
 		if ((rxq->iq.flags & IQ_INTR) == 0)
 			free_rxq(vi, rxq);
 	}
 #ifdef TCP_OFFLOAD
 	for_each_ofld_rxq(vi, i, ofld_rxq) {
 		if ((ofld_rxq->iq.flags & IQ_INTR) == 0)
 			free_ofld_rxq(vi, ofld_rxq);
 	}
 #endif
 
 	/*
 	 * Then take down the rx queues that take direct interrupts.
 	 */
 
 	for_each_rxq(vi, i, rxq) {
 		if (rxq->iq.flags & IQ_INTR)
 			free_rxq(vi, rxq);
 	}
 #ifdef TCP_OFFLOAD
 	for_each_ofld_rxq(vi, i, ofld_rxq) {
 		if (ofld_rxq->iq.flags & IQ_INTR)
 			free_ofld_rxq(vi, ofld_rxq);
 	}
 #endif
 
 	return (0);
 }
 
 /*
  * Interrupt handler that deals with errors and the firmware event queue.  All
  * data rx queues forward their interrupt to the firmware event queue.
  *
  * arg is the adapter softc.
  */
 void
 t4_intr_all(void *arg)
 {
 	struct adapter *adap = arg;
 	struct sge_iq *evq = &adap->sge.fwq;
 
 	t4_intr_err(arg);
 
 	/* Service the queue only if we are the one moving it IDLE -> BUSY. */
 	if (!atomic_cmpset_int(&evq->state, IQS_IDLE, IQS_BUSY))
 		return;
 
 	service_iq(evq, 0);
 	atomic_cmpset_int(&evq->state, IQS_BUSY, IQS_IDLE);
 }
 
 /*
  * Deals with error interrupts.  arg is the adapter softc.  Writes 0 to this
  * PF's PCIE_PF_CLI register and then runs the slow interrupt handler.
  */
 void
 t4_intr_err(void *arg)
 {
 	struct adapter *adap = arg;
 
 	t4_write_reg(adap, MYPF_REG(A_PCIE_PF_CLI), 0);
 	t4_slow_intr_handler(adap);
 }
 
 /*
  * Interrupt handler for an event queue.  arg is the iq to service.  The queue
  * is serviced (with no budget) only if this caller wins the IQS_IDLE ->
  * IQS_BUSY transition; it is moved back to IQS_IDLE afterwards.
  */
 void
 t4_intr_evt(void *arg)
 {
 	struct sge_iq *evq = arg;
 
 	if (!atomic_cmpset_int(&evq->state, IQS_IDLE, IQS_BUSY))
 		return;
 
 	service_iq(evq, 0);
 	atomic_cmpset_int(&evq->state, IQS_BUSY, IQS_IDLE);
 }
 
 /*
  * Interrupt handler for an ingress queue.  arg is the iq to service.  The
  * queue is serviced (with no budget) only if this caller wins the IQS_IDLE ->
  * IQS_BUSY transition; it is moved back to IQS_IDLE afterwards.
  */
 void
 t4_intr(void *arg)
 {
 	struct sge_iq *q = arg;
 
 	if (!atomic_cmpset_int(&q->state, IQS_IDLE, IQS_BUSY))
 		return;
 
 	service_iq(q, 0);
 	atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE);
 }
 
 /*
  * Interrupt handler for a VI vector.  arg is the struct irq for the vector.
  * In DEV_NETMAP kernels the netmap rx queue is serviced first, if its state
  * can be moved from NM_ON to NM_BUSY.  The regular rx queue attached to the
  * vector, if any, is then handed to t4_intr().
  */
 void
 t4_vi_intr(void *arg)
 {
 	struct irq *vec = arg;
 
 #ifdef DEV_NETMAP
 	if (atomic_cmpset_int(&vec->nm_state, NM_ON, NM_BUSY)) {
 		t4_nm_intr(vec->nm_rxq);
 		atomic_cmpset_int(&vec->nm_state, NM_BUSY, NM_ON);
 	}
 #endif
 	if (vec->rxq == NULL)
 		return;
 
 	t4_intr(vec->rxq);
 }
 
 /*
  * Deals with anything and everything on the given ingress queue.
  *
  * iq must already be in the IQS_BUSY state (asserted).  A non-zero budget is
  * the number of descriptors to process before giving up; with a budget of 0
  * the consumer index is still pushed to the hardware every iq->qsize / 16
  * descriptors but processing continues.
  *
  * Returns EINPROGRESS if a non-zero budget was used up.  Otherwise returns 0
  * once the descriptor loop has stopped (no more descriptors with the current
  * generation bit, or no mbuf was available for a freelist payload), every
  * queue that forwarded an interrupt here has been serviced to completion, and
  * the final GTS write with iq->intr_params has been made.
  */
 static int
 service_iq(struct sge_iq *iq, int budget)
 {
 	struct sge_iq *q;
 	struct sge_rxq *rxq = iq_to_rxq(iq);	/* Use iff iq is part of rxq */
 	struct sge_fl *fl;			/* Use iff IQ_HAS_FL */
 	struct adapter *sc = iq->adapter;
 	struct iq_desc *d = &iq->desc[iq->cidx];
 	int ndescs = 0, limit;
 	int rsp_type, refill;
 	uint32_t lq;
 	uint16_t fl_hw_cidx;
 	struct mbuf *m0;
 	/* Forwarding queues that still had work left after their budget. */
 	STAILQ_HEAD(, sge_iq) iql = STAILQ_HEAD_INITIALIZER(iql);
 #if defined(INET) || defined(INET6)
 	const struct timeval lro_timeout = {0, sc->lro_timeout};
 #endif
 
 	KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq));
 
 	limit = budget ? budget : iq->qsize / 16;
 
 	if (iq->flags & IQ_HAS_FL) {
 		fl = &rxq->fl;
 		fl_hw_cidx = fl->hw_cidx;	/* stable snapshot */
 	} else {
 		fl = NULL;
 		fl_hw_cidx = 0;			/* to silence gcc warning */
 	}
 
 	/*
 	 * We always come back and check the descriptor ring for new indirect
 	 * interrupts and other responses after running a single handler.
 	 */
 	for (;;) {
 		/* A descriptor is valid when its generation bit matches ours. */
 		while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) {
 
 			rmb();
 
 			refill = 0;
 			m0 = NULL;
 			rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen);
 			lq = be32toh(d->rsp.pldbuflen_qid);
 
 			switch (rsp_type) {
 			case X_RSPD_TYPE_FLBUF:
 
 				KASSERT(iq->flags & IQ_HAS_FL,
 				    ("%s: data for an iq (%p) with no freelist",
 				    __func__, iq));
 
 				/*
 				 * No mbuf: leave this descriptor unconsumed
 				 * and go service the forwarded queues.
 				 */
 				m0 = get_fl_payload(sc, fl, lq);
 				if (__predict_false(m0 == NULL))
 					goto process_iql;
 				/* Refill once hw_cidx has moved more than 2. */
 				refill = IDXDIFF(fl->hw_cidx, fl_hw_cidx, fl->sidx) > 2;
 #ifdef T4_PKT_TIMESTAMP
 				/*
 				 * 60 bit timestamp for the payload is
 				 * *(uint64_t *)m0->m_pktdat.  Note that it is
 				 * in the leading free-space in the mbuf.  The
 				 * kernel can clobber it during a pullup,
 				 * m_copymdata, etc.  You need to make sure that
 				 * the mbuf reaches you unmolested if you care
 				 * about the timestamp.
 				 *
 				 * NOTE(review): 'ctrl' is not declared in this
 				 * function; confirm this still builds when
 				 * T4_PKT_TIMESTAMP is defined.
 				 */
 				*(uint64_t *)m0->m_pktdat =
 				    be64toh(ctrl->u.last_flit) &
 				    0xfffffffffffffff;
 #endif
 
 				/* fall through */
 
 			case X_RSPD_TYPE_CPL:
 				KASSERT(d->rss.opcode < NUM_CPL_CMDS,
 				    ("%s: bad opcode %02x.", __func__,
 				    d->rss.opcode));
 				/* Dispatch on the CPL opcode; m0 may be NULL. */
 				t4_cpl_handler[d->rss.opcode](iq, &d->rss, m0);
 				break;
 
 			case X_RSPD_TYPE_INTR:
 
 				/*
 				 * Interrupts should be forwarded only to queues
 				 * that are not forwarding their interrupts.
 				 * This means service_iq can recurse but only 1
 				 * level deep.
 				 */
 				KASSERT(budget == 0,
 				    ("%s: budget %u, rsp_type %u", __func__,
 				    budget, rsp_type));
 
 				/*
 				 * There are 1K interrupt-capable queues (qids 0
 				 * through 1023).  A response type indicating a
 				 * forwarded interrupt with a qid >= 1K is an
 				 * iWARP async notification.
 				 */
 				if (lq >= 1024) {
                                         t4_an_handler(iq, &d->rsp);
                                         break;
                                 }
 
 				/*
 				 * Service the forwarding queue with a budget.
 				 * If it is not done, keep it BUSY and park it
 				 * on iql to be revisited below.
 				 */
 				q = sc->sge.iqmap[lq - sc->sge.iq_start -
 				    sc->sge.iq_base];
 				if (atomic_cmpset_int(&q->state, IQS_IDLE,
 				    IQS_BUSY)) {
 					if (service_iq(q, q->qsize / 16) == 0) {
 						atomic_cmpset_int(&q->state,
 						    IQS_BUSY, IQS_IDLE);
 					} else {
 						STAILQ_INSERT_TAIL(&iql, q,
 						    link);
 					}
 				}
 				break;
 
 			default:
 				KASSERT(0,
 				    ("%s: illegal response type %d on iq %p",
 				    __func__, rsp_type, iq));
 				log(LOG_ERR,
 				    "%s: illegal response type %d on iq %p",
 				    device_get_nameunit(sc->dev), rsp_type, iq);
 				break;
 			}
 
 			/* Advance to the next descriptor, wrapping at sidx. */
 			d++;
 			if (__predict_false(++iq->cidx == iq->sidx)) {
 				iq->cidx = 0;
 				iq->gen ^= F_RSPD_GEN;
 				d = &iq->desc[0];
 			}
 			if (__predict_false(++ndescs == limit)) {
 				/* Report consumed descriptors to the hardware. */
 				t4_write_reg(sc, sc->sge_gts_reg,
 				    V_CIDXINC(ndescs) |
 				    V_INGRESSQID(iq->cntxt_id) |
 				    V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
 				ndescs = 0;
 
 #if defined(INET) || defined(INET6)
 				if (iq->flags & IQ_LRO_ENABLED &&
 				    sc->lro_timeout != 0) {
 					tcp_lro_flush_inactive(&rxq->lro,
 					    &lro_timeout);
 				}
 #endif
 
 				/* Out of budget: the caller will come back. */
 				if (budget) {
 					if (iq->flags & IQ_HAS_FL) {
 						FL_LOCK(fl);
 						refill_fl(sc, fl, 32);
 						FL_UNLOCK(fl);
 					}
 					return (EINPROGRESS);
 				}
 			}
 			if (refill) {
 				FL_LOCK(fl);
 				refill_fl(sc, fl, 32);
 				FL_UNLOCK(fl);
 				fl_hw_cidx = fl->hw_cidx;
 			}
 		}
 
 process_iql:
 		if (STAILQ_EMPTY(&iql))
 			break;
 
 		/*
 		 * Process the head only, and send it to the back of the list if
 		 * it's still not done.
 		 */
 		q = STAILQ_FIRST(&iql);
 		STAILQ_REMOVE_HEAD(&iql, link);
 		if (service_iq(q, q->qsize / 8) == 0)
 			atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE);
 		else
 			STAILQ_INSERT_TAIL(&iql, q, link);
 	}
 
 #if defined(INET) || defined(INET6)
 	if (iq->flags & IQ_LRO_ENABLED) {
 		struct lro_ctrl *lro = &rxq->lro;
 
 		tcp_lro_flush_all(lro);
 	}
 #endif
 
 	/* Final consumer index update, this time with the iq's intr_params. */
 	t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
 	    V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params));
 
 	if (iq->flags & IQ_HAS_FL) {
 		int starved;
 
 		FL_LOCK(fl);
 		starved = refill_fl(sc, fl, 64);
 		FL_UNLOCK(fl);
 		/* refill_fl reported starvation: put fl on the starving list. */
 		if (__predict_false(starved != 0))
 			add_fl_to_sfl(sc, fl);
 	}
 
 	return (0);
 }
 
 /*
  * Tell whether clusters with the given layout carry metadata.  That is the
  * case when the freelist does buffer packing or when the layout has a
  * non-empty region1.  In that case region3 is asserted to be big enough for
  * the metadata.
  *
  * Returns 1 if there is metadata, 0 otherwise.
  */
 static inline int
 cl_has_metadata(struct sge_fl *fl, struct cluster_layout *cll)
 {
 
 	if (!(fl->flags & FL_BUF_PACKING) && !(cll->region1 > 0))
 		return (0);
 
 	MPASS(cll->region3 >= CL_METADATA_SIZE);
 
 	return (1);
 }
 
 /*
  * Locate the metadata of cluster 'cl'.  The metadata is the last
  * struct cluster_metadata that fits before the end of the cluster, where the
  * cluster's size is that of the software zone named by the layout.
  *
  * Returns NULL if clusters with this layout have no metadata.
  */
 static inline struct cluster_metadata *
 cl_metadata(struct adapter *sc, struct sge_fl *fl, struct cluster_layout *cll,
     caddr_t cl)
 {
 	struct cluster_metadata *end;
 
 	if (!cl_has_metadata(fl, cll))
 		return (NULL);
 
 	end = (struct cluster_metadata *)
 	    (cl + sc->sge.sw_zone_info[cll->zidx].size);
 
 	return (end - 1);
 }
 
 static void
 rxb_free(struct mbuf *m, void *arg1, void *arg2)
 {
 	uma_zone_t zone = arg1;
 	caddr_t cl = arg2;
 
 	uma_zfree(zone, cl);
 	counter_u64_add(extfree_rels, 1);
 }
 
 /*
  * Build one mbuf for the next piece of a frame that sits in the freelist
  * buffer at fl->cidx, starting at fl->rx_offset.
  *
  * fr_offset is how much of the frame precedes this piece (0 for the first
  * piece, which gets a packet header with m_pkthdr.len set to 'remaining').
  * 'remaining' is how much of the frame is left, this piece included.  The
  * mbuf's m_len is min(remaining, room left in the hw buffer).
  *
  * Returns NULL if an mbuf could not be obtained; the freelist state is not
  * advanced in that case.
  *
  * The mbuf returned by this function could be allocated from zone_mbuf or
  * constructed in spare room in the cluster.
  *
  * The mbuf carries the payload in one of these ways
  * a) frame inside the mbuf (mbuf from zone_mbuf)
  * b) m_cljset (for clusters without metadata) zone_mbuf
  * c) m_extaddref (cluster with metadata) inline mbuf
  * d) m_extaddref (cluster with metadata) zone_mbuf
  */
 static struct mbuf *
 get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset,
     int remaining)
 {
 	struct mbuf *m;
 	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
 	struct cluster_layout *cll = &sd->cll;
 	struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx];
 	struct hw_buf_info *hwb = &sc->sge.hw_buf_info[cll->hwidx];
 	struct cluster_metadata *clm = cl_metadata(sc, fl, cll, sd->cl);
 	int len, blen;
 	caddr_t payload;
 
 	blen = hwb->size - fl->rx_offset;	/* max possible in this buf */
 	len = min(remaining, blen);
 	payload = sd->cl + cll->region1 + fl->rx_offset;
 	if (fl->flags & FL_BUF_PACKING) {
 		const u_int l = fr_offset + len;
 		const u_int pad = roundup2(l, fl->buf_boundary) - l;
 
 		/*
 		 * If the padded piece ends before the end of the hw buffer
 		 * then it consumes only len + pad bytes of the buffer.
 		 */
 		if (fl->rx_offset + len + pad < hwb->size)
 			blen = len + pad;
 		MPASS(fl->rx_offset + blen <= hwb->size);
 	} else {
 		MPASS(fl->rx_offset == 0);	/* not packing */
 	}
 
 
 	if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) {
 
 		/*
 		 * Copy payload into a freshly allocated mbuf.
 		 */
 
 		m = fr_offset == 0 ?
 		    m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA);
 		if (m == NULL)
 			return (NULL);
 		fl->mbuf_allocated++;
 #ifdef T4_PKT_TIMESTAMP
 		/* Leave room for a timestamp */
 		m->m_data += 8;
 #endif
 		/* copy data to mbuf */
 		bcopy(payload, mtod(m, caddr_t), len);
 
 	} else if (sd->nmbuf * MSIZE < cll->region1) {
 
 		/*
 		 * There's spare room in the cluster for an mbuf.  Create one
 		 * and associate it with the payload that's in the cluster.
 		 */
 
 		MPASS(clm != NULL);
 		m = (struct mbuf *)(sd->cl + sd->nmbuf * MSIZE);
 		/* No bzero required */
 		if (m_init(m, M_NOWAIT, MT_DATA,
 		    fr_offset == 0 ? M_PKTHDR | M_NOFREE : M_NOFREE))
 			return (NULL);
 		fl->mbuf_inlined++;
 		m_extaddref(m, payload, blen, &clm->refcount, rxb_free,
 		    swz->zone, sd->cl);
 		/* extfree_refs is bumped once per cluster, on its first mbuf. */
 		if (sd->nmbuf++ == 0)
 			counter_u64_add(extfree_refs, 1);
 
 	} else {
 
 		/*
 		 * Grab an mbuf from zone_mbuf and associate it with the
 		 * payload in the cluster.
 		 */
 
 		m = fr_offset == 0 ?
 		    m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA);
 		if (m == NULL)
 			return (NULL);
 		fl->mbuf_allocated++;
 		if (clm != NULL) {
 			m_extaddref(m, payload, blen, &clm->refcount,
 			    rxb_free, swz->zone, sd->cl);
 			if (sd->nmbuf++ == 0)
 				counter_u64_add(extfree_refs, 1);
 		} else {
 			m_cljset(m, sd->cl, swz->type);
 			sd->cl = NULL;	/* consumed, not a recycle candidate */
 		}
 	}
 	if (fr_offset == 0)
 		m->m_pkthdr.len = remaining;
 	m->m_len = len;
 
 	if (fl->flags & FL_BUF_PACKING) {
 		fl->rx_offset += blen;
 		MPASS(fl->rx_offset <= hwb->size);
 		if (fl->rx_offset < hwb->size)
 			return (m);	/* without advancing the cidx */
 	}
 
 	/*
 	 * Done with this buffer.  fl->cidx counts buffers; hw_cidx is updated
 	 * (in units of 8 buffers) each time cidx crosses a multiple of 8, and
 	 * both wrap when hw_cidx reaches fl->sidx.
 	 */
 	if (__predict_false(++fl->cidx % 8 == 0)) {
 		uint16_t cidx = fl->cidx / 8;
 
 		if (__predict_false(cidx == fl->sidx))
 			fl->cidx = cidx = 0;
 		fl->hw_cidx = cidx;
 	}
 	fl->rx_offset = 0;
 
 	return (m);
 }
 
 /*
  * Assemble the mbuf chain for a payload delivered through the freelist.  The
  * total length is taken from len_newbuf (G_RSPD_LEN); F_RSPD_NEWBUF in the same
  * word says the payload starts in a new hw buffer.
  *
  * Returns the head of the chain (which has a packet header), or NULL if an
  * mbuf could not be obtained.  If the failure happens after the first segment
  * the partial chain is saved in the fl (m0, pnext, remaining) with
  * FL_BUF_RESUME set, and the next call picks up from there.
  */
 static struct mbuf *
 get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf)
 {
 	struct mbuf *m0, *m, **pnext;
 	u_int remaining;
 	const u_int total = G_RSPD_LEN(len_newbuf);
 
 	/* Resume a chain that an earlier call could not finish. */
 	if (__predict_false(fl->flags & FL_BUF_RESUME)) {
 		M_ASSERTPKTHDR(fl->m0);
 		MPASS(fl->m0->m_pkthdr.len == total);
 		MPASS(fl->remaining < total);
 
 		m0 = fl->m0;
 		pnext = fl->pnext;
 		remaining = fl->remaining;
 		fl->flags &= ~FL_BUF_RESUME;
 		goto get_segment;
 	}
 
 	/*
 	 * The payload starts in a new buffer but the current one is partially
 	 * used: skip ahead to the next buffer, updating hw_cidx as needed.
 	 */
 	if (fl->rx_offset > 0 && len_newbuf & F_RSPD_NEWBUF) {
 		fl->rx_offset = 0;
 		if (__predict_false(++fl->cidx % 8 == 0)) {
 			uint16_t cidx = fl->cidx / 8;
 
 			if (__predict_false(cidx == fl->sidx))
 				fl->cidx = cidx = 0;
 			fl->hw_cidx = cidx;
 		}
 	}
 
 	/*
 	 * Payload starts at rx_offset in the current hw buffer.  Its length is
 	 * 'len' and it may span multiple hw buffers.
 	 */
 
 	/* First segment; nothing is saved in the fl if this one fails. */
 	m0 = get_scatter_segment(sc, fl, 0, total);
 	if (m0 == NULL)
 		return (NULL);
 	remaining = total - m0->m_len;
 	pnext = &m0->m_next;
 	while (remaining > 0) {
 get_segment:
 		MPASS(fl->rx_offset == 0);
 		m = get_scatter_segment(sc, fl, total - remaining, remaining);
 		if (__predict_false(m == NULL)) {
 			/* Stash the partial chain for the next call. */
 			fl->m0 = m0;
 			fl->pnext = pnext;
 			fl->remaining = remaining;
 			fl->flags |= FL_BUF_RESUME;
 			return (NULL);
 		}
 		*pnext = m;
 		pnext = &m->m_next;
 		remaining -= m->m_len;
 	}
 	*pnext = NULL;
 
 	M_ASSERTPKTHDR(m0);
 	return (m0);
 }
 
 /*
  * CPL handler for a received Ethernet frame.  m0 is the payload built from
  * the freelist and must not be NULL (asserted); the cpl_rx_pkt message
  * follows the RSS header.
  *
  * Trims the fl_pktshift bytes in front of the frame, fills in the packet
  * header (rcvif, RSS hash type and flowid, checksum results, VLAN tag), and
  * passes the mbuf to LRO if it is enabled on the iq, or to the interface's
  * if_input if LRO is off or declines it.  Always returns 0.
  */
 static int
 t4_eth_rx(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0)
 {
 	struct sge_rxq *rxq = iq_to_rxq(iq);
 	struct ifnet *ifp = rxq->ifp;
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rx_pkt *cpl = (const void *)(rss + 1);
 #if defined(INET) || defined(INET6)
 	struct lro_ctrl *lro = &rxq->lro;
 #endif
 	/* mbuf hash type, indexed by [rss->hash_type][rss->ipv6]. */
 	static const int sw_hashtype[4][2] = {
 		{M_HASHTYPE_NONE, M_HASHTYPE_NONE},
 		{M_HASHTYPE_RSS_IPV4, M_HASHTYPE_RSS_IPV6},
 		{M_HASHTYPE_RSS_TCP_IPV4, M_HASHTYPE_RSS_TCP_IPV6},
 		{M_HASHTYPE_RSS_UDP_IPV4, M_HASHTYPE_RSS_UDP_IPV6},
 	};
 
 	KASSERT(m0 != NULL, ("%s: no payload with opcode %02x", __func__,
 	    rss->opcode));
 
 	/* Skip over the fl_pktshift bytes that precede the frame. */
 	m0->m_pkthdr.len -= sc->params.sge.fl_pktshift;
 	m0->m_len -= sc->params.sge.fl_pktshift;
 	m0->m_data += sc->params.sge.fl_pktshift;
 
 	m0->m_pkthdr.rcvif = ifp;
 	M_HASHTYPE_SET(m0, sw_hashtype[rss->hash_type][rss->ipv6]);
 	m0->m_pkthdr.flowid = be32toh(rss->hash_val);
 
 	/* Checksum was calculated by the hardware and no error was flagged. */
 	if (cpl->csum_calc && !cpl->err_vec) {
 		if (ifp->if_capenable & IFCAP_RXCSUM &&
 		    cpl->l2info & htobe32(F_RXF_IP)) {
 			m0->m_pkthdr.csum_flags = (CSUM_IP_CHECKED |
 			    CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
 			rxq->rxcsum++;
 		} else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
 		    cpl->l2info & htobe32(F_RXF_IP6)) {
 			m0->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 |
 			    CSUM_PSEUDO_HDR);
 			rxq->rxcsum++;
 		}
 
 		/* Fragments get the checksum from the CPL, others 0xffff. */
 		if (__predict_false(cpl->ip_frag))
 			m0->m_pkthdr.csum_data = be16toh(cpl->csum);
 		else
 			m0->m_pkthdr.csum_data = 0xffff;
 	}
 
 	if (cpl->vlan_ex) {
 		m0->m_pkthdr.ether_vtag = be16toh(cpl->vlan);
 		m0->m_flags |= M_VLANTAG;
 		rxq->vlan_extraction++;
 	}
 
 	/*
 	 * Note the dangling "else" below: without INET/INET6 the if_input call
 	 * is unconditional.
 	 */
 #if defined(INET) || defined(INET6)
 	if (iq->flags & IQ_LRO_ENABLED &&
 	    tcp_lro_rx(lro, m0, 0) == 0) {
 		/* queued for LRO */
 	} else
 #endif
 	ifp->if_input(ifp, m0);
 
 	return (0);
 }
 
 /*
  * Deferred drain of a work request queue.  arg is the wrq; n is unused.
  *
  * Must drain the wrq or make sure that someone else will.  With the EQ lock
  * held, the pending list is drained only if it is non-empty and there are no
  * incomplete work requests outstanding.
  */
 static void
 wrq_tx_drain(void *arg, int n)
 {
 	struct sge_wrq *q = arg;
 	struct sge_eq *txeq = &q->eq;
 
 	EQ_LOCK(txeq);
 	if (TAILQ_EMPTY(&q->incomplete_wrs)) {
 		if (!STAILQ_EMPTY(&q->wr_list))
 			drain_wrq_wr_list(q->adapter, q);
 	}
 	EQ_UNLOCK(txeq);
 }
 
 /*
  * Copy as many queued work requests as will fit from wrq->wr_list into the
  * egress queue's descriptor ring, ringing the doorbell along the way.
  *
  * The caller holds the EQ lock, there are no incomplete WRs outstanding, the
  * list is not empty, and the doorbell has caught up with pidx (all asserted).
  * Stops early when the ring does not have room for the WR at the head of the
  * list; that WR and everything behind it stay queued.
  */
 static void
 drain_wrq_wr_list(struct adapter *sc, struct sge_wrq *wrq)
 {
 	struct sge_eq *eq = &wrq->eq;
 	u_int available, dbdiff;	/* # of hardware descriptors */
 	u_int n;
 	struct wrqe *wr;
 	struct fw_eth_tx_pkt_wr *dst;	/* any fw WR struct will do */
 
 	EQ_LOCK_ASSERT_OWNED(eq);
 	MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs));
 	wr = STAILQ_FIRST(&wrq->wr_list);
 	MPASS(wr != NULL);	/* Must be called with something useful to do */
 	MPASS(eq->pidx == eq->dbidx);
 	dbdiff = 0;	/* descriptors written since the last doorbell */
 
 	do {
 		/* Refresh cidx from the hardware; available is kept 1 short. */
 		eq->cidx = read_hw_cidx(eq);
 		if (eq->pidx == eq->cidx)
 			available = eq->sidx - 1;
 		else
 			available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
 
 		MPASS(wr->wrq == wrq);
 		n = howmany(wr->wr_len, EQ_ESIZE);	/* descriptors needed */
 		if (available < n)
 			break;
 
 		dst = (void *)&eq->desc[eq->pidx];
 		if (__predict_true(eq->sidx - eq->pidx > n)) {
 			/* Won't wrap, won't end exactly at the status page. */
 			bcopy(&wr->wr[0], dst, wr->wr_len);
 			eq->pidx += n;
 		} else {
 			/* Copy up to the end of the ring, the rest at desc[0]. */
 			int first_portion = (eq->sidx - eq->pidx) * EQ_ESIZE;
 
 			bcopy(&wr->wr[0], dst, first_portion);
 			if (wr->wr_len > first_portion) {
 				bcopy(&wr->wr[first_portion], &eq->desc[0],
 				    wr->wr_len - first_portion);
 			}
 			eq->pidx = n - (eq->sidx - eq->pidx);
 		}
 		wrq->tx_wrs_copied++;
 
 		/*
 		 * Set the EQUIQ/EQUEQ flags in the WR when less than a quarter
 		 * of the ring is available (and eq->equiq was not already
 		 * set), or set EQUEQ alone once pidx is 32 or more descriptors
 		 * past the last such request.
 		 */
 		if (available < eq->sidx / 4 &&
 		    atomic_cmpset_int(&eq->equiq, 0, 1)) {
 			dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
 			    F_FW_WR_EQUEQ);
 			eq->equeqidx = eq->pidx;
 		} else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) {
 			dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
 			eq->equeqidx = eq->pidx;
 		}
 
 		/* Ring the doorbell once 16 or more descriptors are pending. */
 		dbdiff += n;
 		if (dbdiff >= 16) {
 			ring_eq_db(sc, eq, dbdiff);
 			dbdiff = 0;
 		}
 
 		/* The WR has been copied; drop it from the list and free it. */
 		STAILQ_REMOVE_HEAD(&wrq->wr_list, link);
 		free_wrqe(wr);
 		MPASS(wrq->nwr_pending > 0);
 		wrq->nwr_pending--;
 		MPASS(wrq->ndesc_needed >= n);
 		wrq->ndesc_needed -= n;
 	} while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL);
 
 	/* Flush whatever has not been announced to the hardware yet. */
 	if (dbdiff)
 		ring_eq_db(sc, eq, dbdiff);
 }
 
 /*
  * Doesn't fail.  Holds on to work requests it can't send right away.
  *
  * Appends wr to the tail of the wrq's pending list and bumps the pending
  * request and descriptor counters.  If incomplete_wrs is empty the list is
  * drained right away; otherwise the function returns and leaves the draining
  * to commit_wrq_wr.  The caller must hold the eq lock (asserted).
  */
 void
 t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr)
 {
 	/* eq is referenced only by the assertions in this function. */
 #ifdef INVARIANTS
 	struct sge_eq *eq = &wrq->eq;
 #endif
 
 	EQ_LOCK_ASSERT_OWNED(eq);
 	MPASS(wr != NULL);
 	MPASS(wr->wr_len > 0 && wr->wr_len <= SGE_MAX_WR_LEN);
 	MPASS((wr->wr_len & 0x7) == 0);	/* length is a multiple of 8 */
 
 	STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link);
 	wrq->nwr_pending++;
 	wrq->ndesc_needed += howmany(wr->wr_len, EQ_ESIZE);
 
 	if (!TAILQ_EMPTY(&wrq->incomplete_wrs))
 		return;	/* commit_wrq_wr will drain wr_list as well. */
 
 	drain_wrq_wr_list(sc, wrq);
 
 	/* Doorbell must have caught up to the pidx. */
 	MPASS(eq->pidx == eq->dbidx);
 }
 
 /*
  * Re-runs find_best_refill_source on the freelist of every rx queue of the
  * interface's VI, using the maximum payload derived from the interface's
  * current MTU.  With TCP_OFFLOAD the offload rx queues get the same treatment
  * with their own maximum payload.  Each freelist is locked while it is
  * updated.
  */
 void
 t4_update_fl_bufsize(struct ifnet *ifp)
 {
 	struct vi_info *vi = ifp->if_softc;
 	struct adapter *sc = vi->pi->adapter;
 	struct sge_rxq *rxq;
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 #endif
 	int i, maxp, mtu = ifp->if_mtu;
 
 	/* Regular (NIC) rx queues. */
 	maxp = mtu_to_max_payload(sc, mtu, 0);
 	for_each_rxq(vi, i, rxq) {
 		FL_LOCK(&rxq->fl);
 		find_best_refill_source(sc, &rxq->fl, maxp);
 		FL_UNLOCK(&rxq->fl);
 	}
 #ifdef TCP_OFFLOAD
 	/* Offload rx queues. */
 	maxp = mtu_to_max_payload(sc, mtu, 1);
 	for_each_ofld_rxq(vi, i, ofld_rxq) {
 		FL_LOCK(&ofld_rxq->fl);
 		find_best_refill_source(sc, &ofld_rxq->fl, maxp);
 		FL_UNLOCK(&ofld_rxq->fl);
 	}
 #endif
 }
 
 /*
  * Returns the segment count stored in the packet header's l5hlen field.  The
  * mbuf must have a packet header and the stored count must be non-zero.
  */
 static inline int
 mbuf_nsegs(struct mbuf *m)
 {
 	int nsegs;
 
 	M_ASSERTPKTHDR(m);
 	nsegs = m->m_pkthdr.l5hlen;
 	KASSERT(nsegs > 0,
 	    ("%s: mbuf %p missing information on # of segments.", __func__, m));
 
 	return (nsegs);
 }
 
 /*
  * Stores the segment count in the packet header's l5hlen field, where
  * mbuf_nsegs reads it back.
  */
 static inline void
 set_mbuf_nsegs(struct mbuf *m, uint8_t nsegs)
 {
 	M_ASSERTPKTHDR(m);
 
 	m->m_pkthdr.l5hlen = nsegs;
 }
 
 /*
  * Returns the len16 value kept in PH_loc.eight[0] of the packet header.  The
  * value must be in the range 1 .. SGE_MAX_WR_LEN / 16.
  */
 static inline int
 mbuf_len16(struct mbuf *m)
 {
 	int len16;
 
 	M_ASSERTPKTHDR(m);
 
 	len16 = m->m_pkthdr.PH_loc.eight[0];
 	MPASS(len16 > 0 && len16 <= SGE_MAX_WR_LEN / 16);
 	return (len16);
 }
 
 /*
  * Stores len16 in PH_loc.eight[0] of the packet header, where mbuf_len16
  * reads it back.
  */
 static inline void
 set_mbuf_len16(struct mbuf *m, uint8_t len16)
 {
 	M_ASSERTPKTHDR(m);
 
 	m->m_pkthdr.PH_loc.eight[0] = len16;
 }
 
 /*
  * Returns 1 if CSUM_TSO is set in the packet's checksum flags and 0
  * otherwise.  A TSO packet must carry a non-zero tso_segsz.
  */
 static inline int
 needs_tso(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 
 	if ((m->m_pkthdr.csum_flags & CSUM_TSO) == 0)
 		return (0);
 
 	KASSERT(m->m_pkthdr.tso_segsz > 0,
 	    ("%s: TSO requested in mbuf %p but MSS not provided",
 	    __func__, m));
 	return (1);
 }
 
 /*
  * Returns 1 if the packet's checksum flags include CSUM_IP or CSUM_TSO and 0
  * otherwise.
  */
 static inline int
 needs_l3_csum(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 
 	return ((m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO)) != 0);
 }
 
 /*
  * Returns 1 if the packet's checksum flags include any of the TCP/UDP (IPv4
  * or IPv6) checksum flags or CSUM_TSO, and 0 otherwise.
  */
 static inline int
 needs_l4_csum(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 
 	return ((m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP |
 	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) != 0);
 }
 
 /*
  * Returns 1 if M_VLANTAG is set in the mbuf's flags and 0 otherwise.  A
  * tagged packet must carry a non-zero ether_vtag.
  */
 static inline int
 needs_vlan_insertion(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 
 	if ((m->m_flags & M_VLANTAG) == 0)
 		return (0);
 
 	KASSERT(m->m_pkthdr.ether_vtag != 0,
 	    ("%s: HWVLAN requested in mbuf %p but tag not provided",
 	    __func__, m));
 	return (1);
 }
 
 /*
  * Moves the position (*pm, *poffset) forward by len bytes along the mbuf
  * chain and returns a pointer to the data at the new position.  *pm and
  * *poffset are updated to the mbuf that holds the new position and the offset
  * within it.  len must be positive and the chain must extend past the new
  * position (asserted while walking).
  */
 static void *
 m_advance(struct mbuf **pm, int *poffset, int len)
 {
 	struct mbuf *cur = *pm;
 	int off = *poffset;
 
 	MPASS(len > 0);
 
 	/* Skip every mbuf that ends at or before the target position. */
 	while (off + len >= cur->m_len) {
 		len -= cur->m_len - off;
 		cur = cur->m_next;
 		off = 0;
 		MPASS(cur != NULL);
 	}
 	off += len;
 
 	*poffset = off;
 	*pm = cur;
 	return ((void *)(mtod(cur, uintptr_t) + off));
 }
 
-static inline int	/* helper deleted by this revision (every line is a '-' record) */
-same_paddr(char *a, char *b)	/* 1 if a and b are equal or resolve to the same physical address, else 0 */
-{
-
-	if (a == b)	/* identical pointers, including both NULL */
-		return (1);
-	else if (a != NULL && b != NULL) {
-		vm_offset_t x = (vm_offset_t)a;
-		vm_offset_t y = (vm_offset_t)b;
-
-		if ((x & PAGE_MASK) == (y & PAGE_MASK) &&	/* same offset within the page ... */
-		    pmap_kextract(x) == pmap_kextract(y))	/* ... and same pmap_kextract() result */
-			return (1);
-	}
-
-	return (0);
-}
-
 /*
  * Can deal with empty mbufs in the chain that have m_len = 0, but the chain
  * must have at least one mbuf that's not empty.
  *
  * Returns the number of segments for the whole chain: every non-empty mbuf
  * adds sglist_count() of its data, and one is taken off again when the mbuf
  * starts at the address right after the last byte of the previous non-empty
  * mbuf.  The '+' lines make that comparison on the values returned by
  * pmap_kextract(); the '-' lines are the previous implementation, which
  * compared virtual addresses through same_paddr().
  */
 static inline int
 count_mbuf_nsegs(struct mbuf *m)
 {
-	char *prev_end, *start;
+	vm_paddr_t lastb, next;	/* last byte of previous mbuf, first byte of this one */
+	vm_offset_t va;
 	int len, nsegs;
 
 	MPASS(m != NULL);
 
 	nsegs = 0;
-	prev_end = NULL;
+	lastb = 0;
 	for (; m; m = m->m_next) {
 
 		len = m->m_len;
 		if (__predict_false(len == 0))
 			continue;	/* empty mbufs contribute nothing */
-		start = mtod(m, char *);
-
-		nsegs += sglist_count(start, len);
-		if (same_paddr(prev_end, start))
+		va = mtod(m, vm_offset_t);
+		next = pmap_kextract(va);
+		nsegs += sglist_count(m->m_data, len);
+		if (lastb + 1 == next)	/* continues right after the previous mbuf's last byte */
 			nsegs--;
-		prev_end = start + len;
+		lastb = pmap_kextract(va + len - 1);
 	}
 
 	MPASS(nsegs > 0);
 	return (nsegs);
 }
 
 /*
  * Analyze the mbuf to determine its tx needs.  The mbuf passed in may change:
  * a) caller can assume it's been freed if this function returns with an error.
  * b) it may get defragged up if the gather list is too long for the hardware.
  *
  * Returns 0 on success.  On success the segment count and the len16 of the
  * work request are recorded in the packet header; for TSO packets, and for
  * packets that need checksum help on a VF, l2hlen and l3hlen are recorded as
  * well, plus l4hlen for TSO.  On failure returns EINVAL (packet shorter than
  * an Ethernet header) or EFBIG (still too many segments after one defrag, or
  * m_defrag/m_pullup failed) and *mp is set to NULL.
  */
 int
 parse_pkt(struct adapter *sc, struct mbuf **mp)
 {
 	struct mbuf *m0 = *mp, *m;
 	int rc, nsegs, defragged = 0, offset;
 	struct ether_header *eh;
 	void *l3hdr;
 #if defined(INET) || defined(INET6)
 	struct tcphdr *tcp;
 #endif
 	uint16_t eh_type;
 
 	M_ASSERTPKTHDR(m0);
 	if (__predict_false(m0->m_pkthdr.len < ETHER_HDR_LEN)) {
 		rc = EINVAL;
 		/* Common error exit: free the packet and clear the caller's pointer. */
 fail:
 		m_freem(m0);
 		*mp = NULL;
 		return (rc);
 	}
 restart:
 	/*
 	 * First count the number of gather list segments in the payload.
 	 * Defrag the mbuf if nsegs exceeds the hardware limit.
 	 */
 	M_ASSERTPKTHDR(m0);
 	MPASS(m0->m_pkthdr.len > 0);
 	nsegs = count_mbuf_nsegs(m0);
 	if (nsegs > (needs_tso(m0) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)) {
 		/* Only one defrag attempt is made per packet. */
 		if (defragged++ > 0 || (m = m_defrag(m0, M_NOWAIT)) == NULL) {
 			rc = EFBIG;
 			goto fail;
 		}
 		*mp = m0 = m;	/* update caller's copy after defrag */
 		goto restart;
 	}
 
 	/*
 	 * A packet no longer than MHLEN that is spread over more than two
 	 * segments is pulled up and then recounted.
 	 */
 	if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN)) {
 		m0 = m_pullup(m0, m0->m_pkthdr.len);
 		if (m0 == NULL) {
 			/* Should have left well enough alone. */
 			rc = EFBIG;
 			goto fail;
 		}
 		*mp = m0;	/* update caller's copy after pullup */
 		goto restart;
 	}
 	set_mbuf_nsegs(m0, nsegs);
 	if (sc->flags & IS_VF)
 		set_mbuf_len16(m0, txpkt_vm_len16(nsegs, needs_tso(m0)));
 	else
 		set_mbuf_len16(m0, txpkt_len16(nsegs, needs_tso(m0)));
 
 	/*
 	 * Header lengths are needed only for TSO, or on a VF when L3 or L4
 	 * checksum help was requested.  Everything else is done here.
 	 */
 	if (!needs_tso(m0) &&
 	    !(sc->flags & IS_VF && (needs_l3_csum(m0) || needs_l4_csum(m0))))
 		return (0);
 
 	/* L2: plain Ethernet header or Ethernet + 802.1Q header. */
 	m = m0;
 	eh = mtod(m, struct ether_header *);
 	eh_type = ntohs(eh->ether_type);
 	if (eh_type == ETHERTYPE_VLAN) {
 		struct ether_vlan_header *evh = (void *)eh;
 
 		eh_type = ntohs(evh->evl_proto);
 		m0->m_pkthdr.l2hlen = sizeof(*evh);
 	} else
 		m0->m_pkthdr.l2hlen = sizeof(*eh);
 
 	/* L3: step over the L2 header to reach the IP/IPv6 header. */
 	offset = 0;
 	l3hdr = m_advance(&m, &offset, m0->m_pkthdr.l2hlen);
 
 	switch (eh_type) {
 #ifdef INET6
 	case ETHERTYPE_IPV6:
 	{
 		struct ip6_hdr *ip6 = l3hdr;
 
 		MPASS(!needs_tso(m0) || ip6->ip6_nxt == IPPROTO_TCP);
 
 		m0->m_pkthdr.l3hlen = sizeof(*ip6);
 		break;
 	}
 #endif
 #ifdef INET
 	case ETHERTYPE_IP:
 	{
 		struct ip *ip = l3hdr;
 
 		/* ip_hl counts 4-byte words. */
 		m0->m_pkthdr.l3hlen = ip->ip_hl * 4;
 		break;
 	}
 #endif
 	default:
 		panic("%s: ethertype 0x%04x unknown.  if_cxgbe must be compiled"
 		    " with the same INET/INET6 options as the kernel.",
 		    __func__, eh_type);
 	}
 
 #if defined(INET) || defined(INET6)
 	/* L4: only TSO needs the TCP header length (th_off is in 4-byte words). */
 	if (needs_tso(m0)) {
 		tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen);
 		m0->m_pkthdr.l4hlen = tcp->th_off * 4;
 	}
 #endif
 	MPASS(m0 == *mp);
 	return (0);
 }
 
 /*
  * Obtains room for a work request of len16 16-byte units and returns a
  * pointer to where the caller should build it, or NULL if a wrqe had to be
  * allocated and the allocation failed.  The cookie is filled in here and is
  * what commit_wrq_wr later uses to finish the job.
  *
  * Fast path (wr_list is empty after an optional drain and the ring has
  * enough free descriptors): descriptors are reserved in the ring, the cookie
  * records their starting pidx and count and is queued on incomplete_wrs, and
  * the returned pointer is either the ring slot itself or the wrq's ss buffer
  * when the pidx check below indicates the request would not fit contiguously.
  *
  * Slow path: a wrqe of len16 * 16 bytes is allocated and its payload area is
  * returned; cookie->pidx is set to -1 to mark this case.
  */
 void *
 start_wrq_wr(struct sge_wrq *wrq, int len16, struct wrq_cookie *cookie)
 {
 	struct sge_eq *eq = &wrq->eq;
 	struct adapter *sc = wrq->adapter;
 	int ndesc, available;
 	struct wrqe *wr;
 	void *w;
 
 	MPASS(len16 > 0);
 	ndesc = howmany(len16, EQ_ESIZE / 16);
 	MPASS(ndesc > 0 && ndesc <= SGE_MAX_WR_NDESC);
 
 	EQ_LOCK(eq);
 
 	/* Try to flush anything that is already waiting. */
 	if (!STAILQ_EMPTY(&wrq->wr_list))
 		drain_wrq_wr_list(sc, wrq);
 
 	/* Still something queued: this request has to wait behind it. */
 	if (!STAILQ_EMPTY(&wrq->wr_list)) {
 slowpath:
 		EQ_UNLOCK(eq);
 		wr = alloc_wrqe(len16 * 16, wrq);
 		if (__predict_false(wr == NULL))
 			return (NULL);
 		cookie->pidx = -1;
 		cookie->ndesc = ndesc;
 		return (&wr->wr);
 	}
 
 	eq->cidx = read_hw_cidx(eq);
 	if (eq->pidx == eq->cidx)
 		available = eq->sidx - 1;
 	else
 		available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
 	if (available < ndesc)
 		goto slowpath;
 
 	/* Reserve ndesc descriptors starting at the current pidx. */
 	cookie->pidx = eq->pidx;
 	cookie->ndesc = ndesc;
 	TAILQ_INSERT_TAIL(&wrq->incomplete_wrs, cookie, link);
 
 	w = &eq->desc[eq->pidx];
 	IDXINCR(eq->pidx, ndesc, eq->sidx);
 	/*
 	 * NOTE(review): a new pidx below ndesc - 1 presumably means the
 	 * reservation wrapped past the end of the ring; the caller then builds
 	 * the request in the ss buffer and commit_wrq_wr copies it into place.
 	 */
 	if (__predict_false(eq->pidx < ndesc - 1)) {
 		w = &wrq->ss[0];
 		wrq->ss_pidx = cookie->pidx;
 		wrq->ss_len = len16 * 16;
 	}
 
 	EQ_UNLOCK(eq);
 
 	return (w);
 }
 
 /*
  * Completes a work request that was started with start_wrq_wr.  w is the
  * pointer start_wrq_wr returned and cookie is the cookie it filled in.
  *
  * If the cookie's pidx is -1 the request lives in a wrqe and is handed to
  * t4_wrq_tx.  Otherwise the request is already in the descriptor ring (after
  * being copied out of the ss buffer if that was used) and the doorbell
  * accounting is settled under the eq lock: the oldest incomplete request
  * rings the doorbell or passes its descriptors on to the next cookie, and any
  * other request adds its descriptors to the previous cookie.  Once nothing is
  * incomplete, the pending wr_list is drained too.
  */
 void
 commit_wrq_wr(struct sge_wrq *wrq, void *w, struct wrq_cookie *cookie)
 {
 	struct sge_eq *eq = &wrq->eq;
 	struct adapter *sc = wrq->adapter;
 	int ndesc, pidx;
 	struct wrq_cookie *prev, *next;
 
 	if (cookie->pidx == -1) {
 		/* w points at the wr member of a wrqe; recover the wrqe. */
 		struct wrqe *wr = __containerof(w, struct wrqe, wr);
 
 		t4_wrq_tx(sc, wr);
 		return;
 	}
 
 	ndesc = cookie->ndesc;	/* Can be more than SGE_MAX_WR_NDESC here. */
 	pidx = cookie->pidx;
 	MPASS(pidx >= 0 && pidx < eq->sidx);
 	if (__predict_false(w == &wrq->ss[0])) {
 		/* Bytes that fit between ss_pidx and the end of the ring. */
 		int n = (eq->sidx - wrq->ss_pidx) * EQ_ESIZE;
 
 		MPASS(wrq->ss_len > n);	/* WR had better wrap around. */
 		bcopy(&wrq->ss[0], &eq->desc[wrq->ss_pidx], n);
 		bcopy(&wrq->ss[n], &eq->desc[0], wrq->ss_len - n);
 		wrq->tx_wrs_ss++;
 	} else
 		wrq->tx_wrs_direct++;
 
 	EQ_LOCK(eq);
 	prev = TAILQ_PREV(cookie, wrq_incomplete_wrs, link);
 	next = TAILQ_NEXT(cookie, link);
 	if (prev == NULL) {
 		/* This is the oldest incomplete request. */
 		MPASS(pidx == eq->dbidx);
 		if (next == NULL || ndesc >= 16)
 			ring_eq_db(wrq->adapter, eq, ndesc);
 		else {
 			/* Let the next cookie account for these descriptors. */
 			MPASS(IDXDIFF(next->pidx, pidx, eq->sidx) == ndesc);
 			next->pidx = pidx;
 			next->ndesc += ndesc;
 		}
 	} else {
 		/* An older request is still open; fold into its cookie. */
 		MPASS(IDXDIFF(pidx, prev->pidx, eq->sidx) == prev->ndesc);
 		prev->ndesc += ndesc;
 	}
 	TAILQ_REMOVE(&wrq->incomplete_wrs, cookie, link);
 
 	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
 		drain_wrq_wr_list(sc, wrq);
 
 #ifdef INVARIANTS
 	if (TAILQ_EMPTY(&wrq->incomplete_wrs)) {
 		/* Doorbell must have caught up to the pidx. */
 		MPASS(wrq->eq.pidx == wrq->eq.dbidx);
 	}
 #endif
 	EQ_UNLOCK(eq);
 }
 
 /*
  * Returns non-zero when more than one eighth of the egress queue's
  * descriptors (sidx / 8) are reported available by total_available_tx_desc.
  * The egress queue is taken from the ring's cookie.
  */
 static u_int
 can_resume_eth_tx(struct mp_ring *r)
 {
 	struct sge_eq *eq;
 
 	eq = r->cookie;
 	return (eq->sidx / 8 < total_available_tx_desc(eq));
 }
 
 /*
  * Returns non-zero if the packet needs TSO; that is the only condition
  * checked here.
  */
 static inline int
 cannot_use_txpkts(struct mbuf *m)
 {
 	int tso;
 
 	/* maybe put a GL limit too, to avoid silliness? */
 	tso = needs_tso(m);
 
 	return (tso);
 }
 
 /*
  * r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to
  * be consumed.  Return the actual number consumed.  0 indicates a stall.
  *
  * Runs with the txq lock held for its whole duration.  If the egress queue is
  * not enabled every offered packet is freed and reported as consumed.
  * Otherwise packets are written into the descriptor ring one work request at
  * a time until the packets or the descriptors run out, and the doorbell is
  * rung for whatever was written.
  */
 static u_int
 eth_tx(struct mp_ring *r, u_int cidx, u_int pidx)
 {
 	struct sge_txq *txq = r->cookie;
 	struct sge_eq *eq = &txq->eq;
 	struct ifnet *ifp = txq->ifp;
 	struct vi_info *vi = ifp->if_softc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	u_int total, remaining;		/* # of packets */
 	u_int available, dbdiff;	/* # of hardware descriptors */
 	u_int n, next_cidx;
 	struct mbuf *m0, *tail;
 	struct txpkts txp;
 	struct fw_eth_tx_pkts_wr *wr;	/* any fw WR struct will do */
 
 	remaining = IDXDIFF(pidx, cidx, r->size);
 	MPASS(remaining > 0);	/* Must not be called without work to do. */
 	total = 0;
 
 	TXQ_LOCK(txq);
 	if (__predict_false((eq->flags & EQ_ENABLED) == 0)) {
 		/* Queue is disabled: discard everything that was offered. */
 		while (cidx != pidx) {
 			m0 = r->items[cidx];
 			m_freem(m0);
 			if (++cidx == r->size)
 				cidx = 0;
 		}
 		reclaim_tx_descs(txq, 2048);
 		total = remaining;
 		goto done;
 	}
 
 	/* How many hardware descriptors do we have readily available. */
 	if (eq->pidx == eq->cidx)
 		available = eq->sidx - 1;
 	else
 		available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
 	/* Descriptors already written but not yet announced by a doorbell. */
 	dbdiff = IDXDIFF(eq->pidx, eq->dbidx, eq->sidx);
 
 	while (remaining > 0) {
 
 		m0 = r->items[cidx];
 		M_ASSERTPKTHDR(m0);
 		MPASS(m0->m_nextpkt == NULL);
 
 		/*
 		 * Running low: try to reclaim, and stop if there still isn't
 		 * room for this packet's work request.
 		 */
 		if (available < SGE_MAX_WR_NDESC) {
 			available += reclaim_tx_descs(txq, 64);
 			if (available < howmany(mbuf_len16(m0), EQ_ESIZE / 16))
 				break;	/* out of descriptors */
 		}
 
 		next_cidx = cidx + 1;
 		if (__predict_false(next_cidx == r->size))
 			next_cidx = 0;
 
 		/*
 		 * Three ways to send: the VF work request (always one packet),
 		 * a multi-packet work request when this packet and the next
 		 * one can share it, or a single-packet work request.  n is the
 		 * number of descriptors the chosen writer used.
 		 */
 		wr = (void *)&eq->desc[eq->pidx];
 		if (sc->flags & IS_VF) {
 			total++;
 			remaining--;
 			ETHER_BPF_MTAP(ifp, m0);
 			n = write_txpkt_vm_wr(sc, txq, (void *)wr, m0,
 			    available);
 		} else if (remaining > 1 &&
 		    try_txpkts(m0, r->items[next_cidx], &txp, available) == 0) {
 
 			/* pkts at cidx, next_cidx should both be in txp. */
 			MPASS(txp.npkt == 2);
 			tail = r->items[next_cidx];
 			MPASS(tail->m_nextpkt == NULL);
 			ETHER_BPF_MTAP(ifp, m0);
 			ETHER_BPF_MTAP(ifp, tail);
 			m0->m_nextpkt = tail;
 
 			if (__predict_false(++next_cidx == r->size))
 				next_cidx = 0;
 
 			/*
 			 * Keep adding packets (chained via m_nextpkt) until
 			 * add_to_txpkts refuses one or the ring is exhausted.
 			 */
 			while (next_cidx != pidx) {
 				if (add_to_txpkts(r->items[next_cidx], &txp,
 				    available) != 0)
 					break;
 				tail->m_nextpkt = r->items[next_cidx];
 				tail = tail->m_nextpkt;
 				ETHER_BPF_MTAP(ifp, tail);
 				if (__predict_false(++next_cidx == r->size))
 					next_cidx = 0;
 			}
 
 			n = write_txpkts_wr(txq, wr, m0, &txp, available);
 			total += txp.npkt;
 			remaining -= txp.npkt;
 		} else {
 			total++;
 			remaining--;
 			ETHER_BPF_MTAP(ifp, m0);
 			n = write_txpkt_wr(txq, (void *)wr, m0, available);
 		}
 		MPASS(n >= 1 && n <= available && n <= SGE_MAX_WR_NDESC);
 
 		available -= n;
 		dbdiff += n;
 		IDXINCR(eq->pidx, n, eq->sidx);
 
 		/*
 		 * Set EQUIQ|EQUEQ in this work request when fewer than a
 		 * quarter of the descriptors are available and the equiq flag
 		 * could be switched from 0 to 1; otherwise set EQUEQ alone once
 		 * pidx is 32 or more descriptors past equeqidx.
 		 */
 		if (total_available_tx_desc(eq) < eq->sidx / 4 &&
 		    atomic_cmpset_int(&eq->equiq, 0, 1)) {
 			wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
 			    F_FW_WR_EQUEQ);
 			eq->equeqidx = eq->pidx;
 		} else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) {
 			wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
 			eq->equeqidx = eq->pidx;
 		}
 
 		/*
 		 * Ring the doorbell mid-loop only when enough descriptors have
 		 * piled up and there is still a fair amount of work left.
 		 */
 		if (dbdiff >= 16 && remaining >= 4) {
 			ring_eq_db(sc, eq, dbdiff);
 			available += reclaim_tx_descs(txq, 4 * dbdiff);
 			dbdiff = 0;
 		}
 
 		cidx = next_cidx;
 	}
 	/* Announce anything written since the last doorbell. */
 	if (dbdiff != 0) {
 		ring_eq_db(sc, eq, dbdiff);
 		reclaim_tx_descs(txq, 32);
 	}
 done:
 	TXQ_UNLOCK(txq);
 
 	return (total);
 }
 
 /*
  * Fills in the software state of an ingress queue before it is created:
  * flags are cleared, the adapter is recorded, the interrupt parameters are
  * built from the timer index, and the sizes are derived from qsize.  A
  * negative pktc_idx means the packet counter is not used; the counter index
  * then defaults to SGE_NCOUNTERS - 1 and the counter-enable bit stays clear.
  */
 static inline void
 init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx,
     int qsize)
 {
 
 	KASSERT(tmr_idx >= 0 && tmr_idx < SGE_NTIMERS,
 	    ("%s: bad tmr_idx %d", __func__, tmr_idx));
 	KASSERT(pktc_idx < SGE_NCOUNTERS,	/* -ve is ok, means don't use */
 	    ("%s: bad pktc_idx %d", __func__, pktc_idx));
 
 	iq->adapter = sc;
 	iq->flags = 0;
 
 	iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx);
 	if (pktc_idx >= 0) {
 		iq->intr_params |= F_QINTR_CNT_EN;
 		iq->intr_pktc_idx = pktc_idx;
 	} else
 		iq->intr_pktc_idx = SGE_NCOUNTERS - 1;
 
 	iq->qsize = roundup2(qsize, 16);	/* See FW_IQ_CMD/iqsize */
 	iq->sidx = iq->qsize - sc->params.sge.spg_len / IQ_ESIZE;
 }
 
 /*
  * Fills in the software state of a freelist: its size, the index derived
  * from it, the lock name, and whether buffer packing is used.  Packing
  * requires BUF_PACKING_OK on the adapter; on T4 it is used only when the
  * buffer_packing knob is exactly 1, on later chips whenever the knob is
  * non-zero.  The refill sources are then chosen.
  */
 static inline void
 init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int maxp, char *name)
 {
 
 	fl->qsize = qsize;
 	fl->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE;
 	strlcpy(fl->lockname, name, sizeof(fl->lockname));
 
 	if (sc->flags & BUF_PACKING_OK) {
 		/* T4: disabled unless 1.  T5+: enabled unless 0. */
 		if (is_t4(sc) ? buffer_packing == 1 : buffer_packing != 0)
 			fl->flags |= FL_BUF_PACKING;
 	}
 
 	find_best_refill_source(sc, fl, maxp);
 	find_safe_refill_source(sc, fl);
 }
 
 /*
  * Fills in the software state of an egress queue: its type (stored in the
  * flags), tx channel, the id of the associated ingress queue, the index
  * derived from qsize, and the lock name.
  */
 static inline void
 init_eq(struct adapter *sc, struct sge_eq *eq, int eqtype, int qsize,
     uint8_t tx_chan, uint16_t iqid, char *name)
 {
 
 	KASSERT(eqtype <= EQ_TYPEMASK, ("%s: bad qtype %d", __func__, eqtype));
 
 	strlcpy(eq->lockname, name, sizeof(eq->lockname));
 	eq->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE;
 	eq->iqid = iqid;
 	eq->tx_chan = tx_chan;
 	eq->flags = eqtype & EQ_TYPEMASK;
 }
 
 /*
  * Sets up DMA-able memory of len bytes for a descriptor ring in three steps:
  * create a DMA tag, allocate zeroed memory with it, and load the map.  The
  * tag, map, bus address and kernel virtual address are returned through
  * tag, map, pa and va.
  *
  * Returns 0 on success or the error from the failing bus_dma call.  On
  * failure free_ring is called with the current contents of the four output
  * locations.
  * NOTE(review): an early failure hands free_ring whatever the caller had in
  * *tag, *pa and *va -- presumably zeroed by the caller; confirm.
  */
 static int
 alloc_ring(struct adapter *sc, size_t len, bus_dma_tag_t *tag,
     bus_dmamap_t *map, bus_addr_t *pa, void **va)
 {
 	int rc;
 
 	/* See bus_dma(9) for the meaning of each positional argument. */
 	rc = bus_dma_tag_create(sc->dmat, 512, 0, BUS_SPACE_MAXADDR,
 	    BUS_SPACE_MAXADDR, NULL, NULL, len, 1, len, 0, NULL, NULL, tag);
 	if (rc != 0) {
 		device_printf(sc->dev, "cannot allocate DMA tag: %d\n", rc);
 		goto done;
 	}
 
 	rc = bus_dmamem_alloc(*tag, va,
 	    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, map);
 	if (rc != 0) {
 		device_printf(sc->dev, "cannot allocate DMA memory: %d\n", rc);
 		goto done;
 	}
 
 	/* pa is passed to oneseg_dma_callback as its argument. */
 	rc = bus_dmamap_load(*tag, *map, *va, len, oneseg_dma_callback, pa, 0);
 	if (rc != 0) {
 		device_printf(sc->dev, "cannot load DMA map: %d\n", rc);
 		goto done;
 	}
 done:
 	if (rc)
 		free_ring(sc, *tag, *map, *pa, *va);
 
 	return (rc);
 }
 
 /*
  * Releases the DMA resources of a descriptor ring.  Each step is taken only
  * if the corresponding item is set: the map is unloaded if pa is non-zero,
  * the memory is freed if va is not NULL, and the tag is destroyed if it is
  * not NULL.  Always returns 0.
  */
 static int
 free_ring(struct adapter *sc, bus_dma_tag_t tag, bus_dmamap_t map,
     bus_addr_t pa, void *va)
 {
 
 	if (pa != 0)
 		bus_dmamap_unload(tag, map);
 
 	if (va != NULL)
 		bus_dmamem_free(tag, va, map);
 
 	if (tag != NULL)
 		bus_dma_tag_destroy(tag);
 
 	return (0);
 }
 
 /*
  * Allocates the ring for an ingress queue and an optional freelist.  If the
  * freelist is specified it will be allocated and then associated with the
  * ingress queue.
  *
  * Returns errno on failure.  Resources allocated up to that point may still be
  * allocated.  Caller is responsible for cleanup in case this function fails.
  *
  * If the ingress queue will take interrupts directly (iq->flags & IQ_INTR) then
  * the intr_idx specifies the vector, starting from 0.  Otherwise it specifies
  * the abs_id of the ingress queue to which its interrupts should be forwarded.
  *
  * cong controls the congestion related settings: a negative value leaves them
  * all off; a value >= 0 enables them in the firmware command and, on T5 and
  * later PFs, is also used to program the congestion manager context (0 and
  * non-zero are encoded differently, see the end of the function).
  *
  * On success the queue is entered into the adapter's iqmap (and the freelist
  * into eqmap), the freelist is given an initial refill, and the ingress
  * queue's interrupts are enabled.
  */
 static int
 alloc_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl,
     int intr_idx, int cong)
 {
 	int rc, i, cntxt_id;
 	size_t len;
 	struct fw_iq_cmd c;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = iq->adapter;
 	struct sge_params *sp = &sc->params.sge;
 	__be32 v = 0;
 
 	/* Descriptor ring for the ingress queue. */
 	len = iq->qsize * IQ_ESIZE;
 	rc = alloc_ring(sc, len, &iq->desc_tag, &iq->desc_map, &iq->ba,
 	    (void **)&iq->desc);
 	if (rc != 0)
 		return (rc);
 
 	/* Build the FW_IQ_CMD that asks the firmware to create the queue. */
 	bzero(&c, sizeof(c));
 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) |
 	    V_FW_IQ_CMD_VFN(0));
 
 	c.alloc_to_len16 = htobe32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART |
 	    FW_LEN16(c));
 
 	/* Special handling for firmware event queue */
 	if (iq == &sc->sge.fwq)
 		v |= F_FW_IQ_CMD_IQASYNCH;
 
 	if (iq->flags & IQ_INTR) {
 		KASSERT(intr_idx < sc->intr_count,
 		    ("%s: invalid direct intr_idx %d", __func__, intr_idx));
 	} else
 		v |= F_FW_IQ_CMD_IQANDST;
 	v |= V_FW_IQ_CMD_IQANDSTINDEX(intr_idx);
 
 	c.type_to_iqandstindex = htobe32(v |
 	    V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) |
 	    V_FW_IQ_CMD_VIID(vi->viid) |
 	    V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT));
 	c.iqdroprss_to_iqesize = htobe16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) |
 	    F_FW_IQ_CMD_IQGTSMODE |
 	    V_FW_IQ_CMD_IQINTCNTTHRESH(iq->intr_pktc_idx) |
 	    V_FW_IQ_CMD_IQESIZE(ilog2(IQ_ESIZE) - 4));
 	c.iqsize = htobe16(iq->qsize);
 	c.iqaddr = htobe64(iq->ba);
 	if (cong >= 0)
 		c.iqns_to_fl0congen = htobe32(F_FW_IQ_CMD_IQFLINTCONGEN);
 
 	if (fl) {
 		mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF);
 
 		/* Descriptor ring for the freelist. */
 		len = fl->qsize * EQ_ESIZE;
 		rc = alloc_ring(sc, len, &fl->desc_tag, &fl->desc_map,
 		    &fl->ba, (void **)&fl->desc);
 		if (rc)
 			return (rc);
 
 		/* Allocate space for one software descriptor per buffer. */
 		rc = alloc_fl_sdesc(fl);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to setup fl software descriptors: %d\n",
 			    rc);
 			return (rc);
 		}
 
 		/*
 		 * Low water mark and buffer boundary depend on whether buffer
 		 * packing is in use; padding can raise the boundary further.
 		 */
 		if (fl->flags & FL_BUF_PACKING) {
 			fl->lowat = roundup2(sp->fl_starve_threshold2, 8);
 			fl->buf_boundary = sp->pack_boundary;
 		} else {
 			fl->lowat = roundup2(sp->fl_starve_threshold, 8);
 			fl->buf_boundary = 16;
 		}
 		if (fl_pad && fl->buf_boundary < sp->pad_boundary)
 			fl->buf_boundary = sp->pad_boundary;
 
 		/* Freelist part of the firmware command. */
 		c.iqns_to_fl0congen |=
 		    htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) |
 			F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO |
 			(fl_pad ? F_FW_IQ_CMD_FL0PADEN : 0) |
 			(fl->flags & FL_BUF_PACKING ? F_FW_IQ_CMD_FL0PACKEN :
 			    0));
 		if (cong >= 0) {
 			c.iqns_to_fl0congen |=
 				htobe32(V_FW_IQ_CMD_FL0CNGCHMAP(cong) |
 				    F_FW_IQ_CMD_FL0CONGCIF |
 				    F_FW_IQ_CMD_FL0CONGEN);
 		}
 		/* Fetch burst sizes differ between T5-and-earlier and later chips. */
 		c.fl0dcaen_to_fl0cidxfthresh =
 		    htobe16(V_FW_IQ_CMD_FL0FBMIN(chip_id(sc) <= CHELSIO_T5 ?
 			X_FETCHBURSTMIN_128B : X_FETCHBURSTMIN_64B) |
 			V_FW_IQ_CMD_FL0FBMAX(chip_id(sc) <= CHELSIO_T5 ?
 			X_FETCHBURSTMAX_512B : X_FETCHBURSTMAX_256B));
 		c.fl0size = htobe16(fl->qsize);
 		c.fl0addr = htobe64(fl->ba);
 	}
 
 	/* Send the command; the reply is written back into c. */
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to create ingress queue: %d\n", rc);
 		return (rc);
 	}
 
 	/* Record the ids from the reply and reset the software state. */
 	iq->cidx = 0;
 	iq->gen = F_RSPD_GEN;
 	iq->intr_next = iq->intr_params;
 	iq->cntxt_id = be16toh(c.iqid);
 	iq->abs_id = be16toh(c.physiqid);
 	iq->flags |= IQ_ALLOCATED;
 
 	/* iqmap is indexed by the context id relative to iq_start. */
 	cntxt_id = iq->cntxt_id - sc->sge.iq_start;
 	if (cntxt_id >= sc->sge.niq) {
 		panic ("%s: iq->cntxt_id (%d) more than the max (%d)", __func__,
 		    cntxt_id, sc->sge.niq - 1);
 	}
 	sc->sge.iqmap[cntxt_id] = iq;
 
 	if (fl) {
 		u_int qid;
 
 		iq->flags |= IQ_HAS_FL;
 		fl->cntxt_id = be16toh(c.fl0id);
 		fl->pidx = fl->cidx = 0;
 
 		/* eqmap is indexed by the context id relative to eq_start. */
 		cntxt_id = fl->cntxt_id - sc->sge.eq_start;
 		if (cntxt_id >= sc->sge.neq) {
 			panic("%s: fl->cntxt_id (%d) more than the max (%d)",
 			    __func__, cntxt_id, sc->sge.neq - 1);
 		}
 		sc->sge.eqmap[cntxt_id] = (void *)fl;
 
 		/*
 		 * When user doorbells are available, work out the doorbell
 		 * address for this freelist.  If the queue's index within its
 		 * page is small enough for a segment of its own, the segment
 		 * address is used and the qid in the doorbell value is 0.
 		 */
 		qid = fl->cntxt_id;
 		if (isset(&sc->doorbells, DOORBELL_UDB)) {
 			uint32_t s_qpp = sc->params.sge.eq_s_qpp;
 			uint32_t mask = (1 << s_qpp) - 1;
 			volatile uint8_t *udb;
 
 			udb = sc->udbs_base + UDBS_DB_OFFSET;
 			udb += (qid >> s_qpp) << PAGE_SHIFT;
 			qid &= mask;
 			if (qid < PAGE_SIZE / UDBS_SEG_SIZE) {
 				udb += qid << UDBS_SEG_SHIFT;
 				qid = 0;
 			}
 			fl->udb = (volatile void *)udb;
 		}
 		fl->dbval = V_QID(qid) | sc->chip_params->sge_fl_db;
 
 		FL_LOCK(fl);
 		/* Enough to make sure the SGE doesn't think it's starved */
 		refill_fl(sc, fl, fl->lowat);
 		FL_UNLOCK(fl);
 	}
 
 	if (chip_id(sc) >= CHELSIO_T5 && !(sc->flags & IS_VF) && cong >= 0) {
 		uint32_t param, val;
 
 		param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
 		    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) |
 		    V_FW_PARAMS_PARAM_YZ(iq->cntxt_id);
 		/*
 		 * cong == 0 is encoded as 1 << 19.  Otherwise the value is
 		 * 2 << 19 with bit 4*i set for every bit i (0..3) set in cong.
 		 */
 		if (cong == 0)
 			val = 1 << 19;
 		else {
 			val = 2 << 19;
 			for (i = 0; i < 4; i++) {
 				if (cong & (1 << i))
 					val |= 1 << (i << 2);
 			}
 		}
 
 		rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 		if (rc != 0) {
 			/* report error but carry on */
 			device_printf(sc->dev,
 			    "failed to set congestion manager context for "
 			    "ingress queue %d: %d\n", iq->cntxt_id, rc);
 		}
 	}
 
 	/* Enable IQ interrupts */
 	atomic_store_rel_int(&iq->state, IQS_IDLE);
 	t4_write_reg(sc, sc->sge_gts_reg, V_SEINTARM(iq->intr_params) |
 	    V_INGRESSQID(iq->cntxt_id));
 
 	return (0);
 }
 
 /*
  * Undoes alloc_iq_fl for an ingress queue and its optional freelist.  Does
  * nothing if iq->adapter is NULL.  If the queue was allocated in the firmware
  * it is freed there first; a failure of that step is returned and nothing
  * else is released.  After that the descriptor rings, the freelist's software
  * descriptors and its lock are released, and both structures are zeroed.
  *
  * vi may be NULL, in which case messages are printed against the adapter's
  * device.  Returns 0 on success or the (positive) firmware error.
  */
 static int
 free_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl)
 {
 	int rc;
 	struct adapter *sc = iq->adapter;
 	device_t dev;
 
 	if (sc == NULL)
 		return (0);	/* nothing to do */
 
 	dev = vi ? vi->dev : sc->dev;
 
 	if (iq->flags & IQ_ALLOCATED) {
 		/* 0xffff is passed for each freelist id that is not in use. */
 		rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0,
 		    FW_IQ_TYPE_FL_INT_CAP, iq->cntxt_id,
 		    fl ? fl->cntxt_id : 0xffff, 0xffff);
 		if (rc != 0) {
 			device_printf(dev,
 			    "failed to free queue %p: %d\n", iq, rc);
 			return (rc);
 		}
 		iq->flags &= ~IQ_ALLOCATED;
 	}
 
 	free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba, iq->desc);
 
 	bzero(iq, sizeof(*iq));
 
 	if (fl) {
 		free_ring(sc, fl->desc_tag, fl->desc_map, fl->ba,
 		    fl->desc);
 
 		if (fl->sdesc)
 			free_fl_sdesc(sc, fl);
 
 		if (mtx_initialized(&fl->fl_lock))
 			mtx_destroy(&fl->fl_lock);
 
 		bzero(fl, sizeof(*fl));
 	}
 
 	return (0);
 }
 
 /*
  * Creates a read-only "fl" sysctl node under oid and populates it with the
  * freelist's parameters (bus address, ring size, context id, padding and
  * packing settings), its indices, and its allocation/recycling counters.
  * The rx_offset entry is added only for freelists that use buffer packing.
  */
 static void
 add_fl_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
     struct sysctl_oid *oid, struct sge_fl *fl)
 {
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL,
 	    "freelist");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
 	    &fl->ba, "bus address of descriptor ring");
 	/* Entries registered with a NULL pointer export the value given here. */
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
 	    fl->sidx * EQ_ESIZE + sc->params.sge.spg_len,
 	    "desc ring size in bytes");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &fl->cntxt_id, 0, sysctl_uint16, "I",
 	    "SGE context id of the freelist");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "padding", CTLFLAG_RD, NULL,
 	    fl_pad ? 1 : 0, "padding enabled");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "packing", CTLFLAG_RD, NULL,
 	    fl->flags & FL_BUF_PACKING ? 1 : 0, "packing enabled");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &fl->cidx,
 	    0, "consumer index");
 	if (fl->flags & FL_BUF_PACKING) {
 		SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rx_offset",
 		    CTLFLAG_RD, &fl->rx_offset, 0, "packing rx offset");
 	}
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &fl->pidx,
 	    0, "producer index");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_allocated",
 	    CTLFLAG_RD, &fl->mbuf_allocated, "# of mbuf allocated");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_inlined",
 	    CTLFLAG_RD, &fl->mbuf_inlined, "# of mbuf inlined in clusters");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_allocated",
 	    CTLFLAG_RD, &fl->cl_allocated, "# of clusters allocated");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_recycled",
 	    CTLFLAG_RD, &fl->cl_recycled, "# of clusters recycled");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_fast_recycled",
 	    CTLFLAG_RD, &fl->cl_fast_recycled, "# of clusters recycled (fast)");
 }
 
 /*
  * Creates the adapter's firmware event queue (sc->sge.fwq) and publishes its
  * parameters under a "fwq" sysctl node of the device.  The queue always
  * takes interrupts directly and has no freelist.  On a VF it uses vector 0;
  * otherwise it uses vector 1 when more than one vector is available (else 0)
  * and the reply handlers for filter and L2T writes are installed on it.
  *
  * Returns 0 on success or the error from alloc_iq_fl.
  */
 static int
 alloc_fwq(struct adapter *sc)
 {
 	int rc, intr_idx;
 	struct sge_iq *fwq = &sc->sge.fwq;
 	struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev);
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	init_iq(fwq, sc, 0, 0, FW_IQ_QSIZE);
 	fwq->flags |= IQ_INTR;	/* always */
 	if (sc->flags & IS_VF)
 		intr_idx = 0;
 	else {
 		intr_idx = sc->intr_count > 1 ? 1 : 0;
 		fwq->set_tcb_rpl = t4_filter_rpl;
 		fwq->l2t_write_rpl = do_l2t_write_rpl;
 	}
 	/* No freelist (NULL) and no congestion settings (-1). */
 	rc = alloc_iq_fl(&sc->port[0]->vi[0], fwq, NULL, intr_idx, -1);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to create firmware event queue: %d\n", rc);
 		return (rc);
 	}
 
 	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "fwq", CTLFLAG_RD,
 	    NULL, "firmware event queue");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_UAUTO(&sc->ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
 	    &fwq->ba, "bus address of descriptor ring");
 	SYSCTL_ADD_INT(&sc->ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
 	    fwq->qsize * IQ_ESIZE, "descriptor ring size in bytes");
 	SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "abs_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &fwq->abs_id, 0, sysctl_uint16, "I",
 	    "absolute id of the queue");
 	SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "cntxt_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &fwq->cntxt_id, 0, sysctl_uint16, "I",
 	    "SGE context id of the queue");
 	SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &fwq->cidx, 0, sysctl_uint16, "I",
 	    "consumer index");
 
 	return (0);
 }
 
 /*
  * Releases the firmware event queue.  It has no VI and no freelist, so both
  * are passed as NULL.  Returns whatever free_iq_fl returns.
  */
 static int
 free_fwq(struct adapter *sc)
 {
 	struct sge_iq *fwq = &sc->sge.fwq;
 
 	return (free_iq_fl(NULL, fwq, NULL));
 }
 
 static int
 alloc_mgmtq(struct adapter *sc)
 {
 	int rc;
 	struct sge_wrq *mgmtq = &sc->sge.mgmtq;
 	char name[16];
 	struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev);
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "mgmtq", CTLFLAG_RD,
 	    NULL, "management queue");
 
 	snprintf(name, sizeof(name), "%s mgmtq", device_get_nameunit(sc->dev));
 	init_eq(sc, &mgmtq->eq, EQ_CTRL, CTRL_EQ_QSIZE, sc->port[0]->tx_chan,
 	    sc->sge.fwq.cntxt_id, name);
 	rc = alloc_wrq(sc, NULL, mgmtq, oid);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to create management queue: %d\n", rc);
 		return (rc);
 	}
 
 	return (0);
 }
 
 /*
  * Releases the adapter's management queue.  Returns whatever free_wrq
  * returns.
  */
 static int
 free_mgmtq(struct adapter *sc)
 {
 	struct sge_wrq *mgmtq = &sc->sge.mgmtq;
 
 	return (free_wrq(sc, mgmtq));
 }
 
 /*
  * Maps the "drop" setting to the cong argument that alloc_iq_fl expects:
  * -1 stays -1, 1 becomes 0, and every other value becomes the port's rx
  * channel map.
  */
 int
 tnl_cong(struct port_info *pi, int drop)
 {
 
 	switch (drop) {
 	case -1:
 		return (-1);
 	case 1:
 		return (0);
 	default:
 		return (pi->rx_chan_map);
 	}
 }
 
 /*
  * Creates one NIC rx queue for the VI: an ingress queue with a freelist, LRO
  * state (when INET or INET6 is compiled in), and a sysctl node named after
  * idx under oid that exposes the queue's parameters and counters.
  *
  * The first queue (idx 0) establishes the adapter's iq_base as the difference
  * between the absolute and the context id; later queues are checked against
  * it.  Returns 0 on success, or the error from alloc_iq_fl or tcp_lro_init.
  */
 static int
 alloc_rxq(struct vi_info *vi, struct sge_rxq *rxq, int intr_idx, int idx,
     struct sysctl_oid *oid)
 {
 	int rc;
 	struct adapter *sc = vi->pi->adapter;
 	struct sysctl_oid_list *children;
 	char name[16];
 
 	rc = alloc_iq_fl(vi, &rxq->iq, &rxq->fl, intr_idx,
 	    tnl_cong(vi->pi, cong_drop));
 	if (rc != 0)
 		return (rc);
 
 	if (idx == 0)
 		sc->sge.iq_base = rxq->iq.abs_id - rxq->iq.cntxt_id;
 	else
 		KASSERT(rxq->iq.cntxt_id + sc->sge.iq_base == rxq->iq.abs_id,
 		    ("iq_base mismatch"));
 	KASSERT(sc->sge.iq_base == 0 || sc->flags & IS_VF,
 	    ("PF with non-zero iq_base"));
 
 	/*
 	 * The freelist is just barely above the starvation threshold right now,
 	 * fill it up a bit more.
 	 */
 	FL_LOCK(&rxq->fl);
 	refill_fl(sc, &rxq->fl, 128);
 	FL_UNLOCK(&rxq->fl);
 
 #if defined(INET) || defined(INET6)
 	rc = tcp_lro_init(&rxq->lro);
 	if (rc != 0)
 		return (rc);
 	rxq->lro.ifp = vi->ifp; /* also indicates LRO init'ed */
 
 	if (vi->ifp->if_capenable & IFCAP_LRO)
 		rxq->iq.flags |= IQ_LRO_ENABLED;
 #endif
 	rxq->ifp = vi->ifp;
 
 	children = SYSCTL_CHILDREN(oid);
 
 	/* The sysctl node is named after the queue's index. */
 	snprintf(name, sizeof(name), "%d", idx);
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
 	    NULL, "rx queue");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_UAUTO(&vi->ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
 	    &rxq->iq.ba, "bus address of descriptor ring");
 	SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
 	    rxq->iq.qsize * IQ_ESIZE, "descriptor ring size in bytes");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "abs_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.abs_id, 0, sysctl_uint16, "I",
 	    "absolute id of the queue");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cntxt_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.cntxt_id, 0, sysctl_uint16, "I",
 	    "SGE context id of the queue");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.cidx, 0, sysctl_uint16, "I",
 	    "consumer index");
 #if defined(INET) || defined(INET6)
 	SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "lro_queued", CTLFLAG_RD,
 	    &rxq->lro.lro_queued, 0, NULL);
 	SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD,
 	    &rxq->lro.lro_flushed, 0, NULL);
 #endif
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "rxcsum", CTLFLAG_RD,
 	    &rxq->rxcsum, "# of times hardware assisted with checksum");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_extraction",
 	    CTLFLAG_RD, &rxq->vlan_extraction,
 	    "# of times hardware extracted 802.1Q tag");
 
 	add_fl_sysctls(sc, &vi->ctx, oid, &rxq->fl);
 
 	return (rc);
 }
 
 static int
 free_rxq(struct vi_info *vi, struct sge_rxq *rxq)
 {
 	int rc;
 
 #if defined(INET) || defined(INET6)
 	if (rxq->lro.ifp) {
 		tcp_lro_free(&rxq->lro);
 		rxq->lro.ifp = NULL;
 	}
 #endif
 
 	rc = free_iq_fl(vi, &rxq->iq, &rxq->fl);
 	if (rc == 0)
 		bzero(rxq, sizeof(*rxq));
 
 	return (rc);
 }
 
 #ifdef TCP_OFFLOAD
 static int
 alloc_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq,
     int intr_idx, int idx, struct sysctl_oid *oid)
 {
 	struct port_info *pi = vi->pi;
 	int rc;
 	struct sysctl_oid_list *children;
 	char name[16];
 
 	rc = alloc_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl, intr_idx,
 	    pi->rx_chan_map);
 	if (rc != 0)
 		return (rc);
 
 	children = SYSCTL_CHILDREN(oid);
 
 	snprintf(name, sizeof(name), "%d", idx);
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
 	    NULL, "rx queue");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_UAUTO(&vi->ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
 	    &ofld_rxq->iq.ba, "bus address of descriptor ring");
 	SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
 	    ofld_rxq->iq.qsize * IQ_ESIZE, "descriptor ring size in bytes");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "abs_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.abs_id, 0, sysctl_uint16,
 	    "I", "absolute id of the queue");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cntxt_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.cntxt_id, 0, sysctl_uint16,
 	    "I", "SGE context id of the queue");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.cidx, 0, sysctl_uint16, "I",
 	    "consumer index");
 
 	add_fl_sysctls(pi->adapter, &vi->ctx, oid, &ofld_rxq->fl);
 
 	return (rc);
 }
 
 static int
 free_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq)
 {
 	int rc;
 
 	rc = free_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl);
 	if (rc == 0)
 		bzero(ofld_rxq, sizeof(*ofld_rxq));
 
 	return (rc);
 }
 #endif
 
 #ifdef DEV_NETMAP
 static int
 alloc_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq, int intr_idx,
     int idx, struct sysctl_oid *oid)
 {
 	int rc;
 	struct sysctl_oid_list *children;
 	struct sysctl_ctx_list *ctx;
 	char name[16];
 	size_t len;
 	struct adapter *sc = vi->pi->adapter;
 	struct netmap_adapter *na = NA(vi->ifp);
 
 	MPASS(na != NULL);
 
 	len = vi->qsize_rxq * IQ_ESIZE;
 	rc = alloc_ring(sc, len, &nm_rxq->iq_desc_tag, &nm_rxq->iq_desc_map,
 	    &nm_rxq->iq_ba, (void **)&nm_rxq->iq_desc);
 	if (rc != 0)
 		return (rc);
 
 	len = na->num_rx_desc * EQ_ESIZE + sc->params.sge.spg_len;
 	rc = alloc_ring(sc, len, &nm_rxq->fl_desc_tag, &nm_rxq->fl_desc_map,
 	    &nm_rxq->fl_ba, (void **)&nm_rxq->fl_desc);
 	if (rc != 0)
 		return (rc);
 
 	nm_rxq->vi = vi;
 	nm_rxq->nid = idx;
 	nm_rxq->iq_cidx = 0;
 	nm_rxq->iq_sidx = vi->qsize_rxq - sc->params.sge.spg_len / IQ_ESIZE;
 	nm_rxq->iq_gen = F_RSPD_GEN;
 	nm_rxq->fl_pidx = nm_rxq->fl_cidx = 0;
 	nm_rxq->fl_sidx = na->num_rx_desc;
 	nm_rxq->intr_idx = intr_idx;
 
 	ctx = &vi->ctx;
 	children = SYSCTL_CHILDREN(oid);
 
 	snprintf(name, sizeof(name), "%d", idx);
 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL,
 	    "rx queue");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "abs_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_abs_id, 0, sysctl_uint16,
 	    "I", "absolute id of the queue");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cntxt_id, 0, sysctl_uint16,
 	    "I", "SGE context id of the queue");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cidx, 0, sysctl_uint16, "I",
 	    "consumer index");
 
 	children = SYSCTL_CHILDREN(oid);
 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL,
 	    "freelist");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->fl_cntxt_id, 0, sysctl_uint16,
 	    "I", "SGE context id of the freelist");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD,
 	    &nm_rxq->fl_cidx, 0, "consumer index");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD,
 	    &nm_rxq->fl_pidx, 0, "producer index");
 
 	return (rc);
 }
 
 
 /*
  * Release the two descriptor rings (ingress queue and freelist) of a netmap
  * rx queue.  Always returns 0.
  */
 static int
 free_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq)
 {
 	struct adapter *adap = vi->pi->adapter;
 
 	/* Ingress queue ring first, then the freelist ring. */
 	free_ring(adap, nm_rxq->iq_desc_tag, nm_rxq->iq_desc_map,
 	    nm_rxq->iq_ba, nm_rxq->iq_desc);
 	free_ring(adap, nm_rxq->fl_desc_tag, nm_rxq->fl_desc_map,
 	    nm_rxq->fl_ba, nm_rxq->fl_desc);
 
 	return (0);
 }
 
 static int
 alloc_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq, int iqidx, int idx,
     struct sysctl_oid *oid)
 {
 	int rc;
 	size_t len;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct netmap_adapter *na = NA(vi->ifp);
 	char name[16];
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	len = na->num_tx_desc * EQ_ESIZE + sc->params.sge.spg_len;
 	rc = alloc_ring(sc, len, &nm_txq->desc_tag, &nm_txq->desc_map,
 	    &nm_txq->ba, (void **)&nm_txq->desc);
 	if (rc)
 		return (rc);
 
 	nm_txq->pidx = nm_txq->cidx = 0;
 	nm_txq->sidx = na->num_tx_desc;
 	nm_txq->nid = idx;
 	nm_txq->iqidx = iqidx;
 	nm_txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
 	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(G_FW_VIID_PFN(vi->viid)) |
 	    V_TXPKT_VF(G_FW_VIID_VIN(vi->viid)) |
 	    V_TXPKT_VF_VLD(G_FW_VIID_VIVLD(vi->viid)));
 
 	snprintf(name, sizeof(name), "%d", idx);
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
 	    NULL, "netmap tx queue");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
 	    &nm_txq->cntxt_id, 0, "SGE context id of the queue");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &nm_txq->cidx, 0, sysctl_uint16, "I",
 	    "consumer index");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "pidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &nm_txq->pidx, 0, sysctl_uint16, "I",
 	    "producer index");
 
 	return (rc);
 }
 
 /*
  * Release the descriptor ring of a netmap tx queue.  Always returns 0.
  */
 static int
 free_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq)
 {
 	struct adapter *adap = vi->pi->adapter;
 
 	free_ring(adap, nm_txq->desc_tag, nm_txq->desc_map, nm_txq->ba,
 	    nm_txq->desc);
 
 	return (0);
 }
 #endif
 
 static int
 ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq)
 {
 	int rc, cntxt_id;
 	struct fw_eq_ctrl_cmd c;
 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
 
 	bzero(&c, sizeof(c));
 
 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_CTRL_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_CTRL_CMD_PFN(sc->pf) |
 	    V_FW_EQ_CTRL_CMD_VFN(0));
 	c.alloc_to_len16 = htobe32(F_FW_EQ_CTRL_CMD_ALLOC |
 	    F_FW_EQ_CTRL_CMD_EQSTART | FW_LEN16(c));
 	c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid));
 	c.physeqid_pkd = htobe32(0);
 	c.fetchszm_to_iqid =
 	    htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
 		V_FW_EQ_CTRL_CMD_PCIECHN(eq->tx_chan) |
 		F_FW_EQ_CTRL_CMD_FETCHRO | V_FW_EQ_CTRL_CMD_IQID(eq->iqid));
 	c.dcaen_to_eqsize =
 	    htobe32(V_FW_EQ_CTRL_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
 		V_FW_EQ_CTRL_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
 		V_FW_EQ_CTRL_CMD_EQSIZE(qsize));
 	c.eqaddr = htobe64(eq->ba);
 
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to create control queue %d: %d\n", eq->tx_chan, rc);
 		return (rc);
 	}
 	eq->flags |= EQ_ALLOCATED;
 
 	eq->cntxt_id = G_FW_EQ_CTRL_CMD_EQID(be32toh(c.cmpliqid_eqid));
 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
 	if (cntxt_id >= sc->sge.neq)
 	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
 		cntxt_id, sc->sge.neq - 1);
 	sc->sge.eqmap[cntxt_id] = eq;
 
 	return (rc);
 }
 
 static int
 eth_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
 {
 	int rc, cntxt_id;
 	struct fw_eq_eth_cmd c;
 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
 
 	bzero(&c, sizeof(c));
 
 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_ETH_CMD_PFN(sc->pf) |
 	    V_FW_EQ_ETH_CMD_VFN(0));
 	c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC |
 	    F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c));
 	c.autoequiqe_to_viid = htobe32(F_FW_EQ_ETH_CMD_AUTOEQUIQE |
 	    F_FW_EQ_ETH_CMD_AUTOEQUEQE | V_FW_EQ_ETH_CMD_VIID(vi->viid));
 	c.fetchszm_to_iqid =
 	    htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
 		V_FW_EQ_ETH_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO |
 		V_FW_EQ_ETH_CMD_IQID(eq->iqid));
 	c.dcaen_to_eqsize = htobe32(V_FW_EQ_ETH_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
 	    V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
 	    V_FW_EQ_ETH_CMD_EQSIZE(qsize));
 	c.eqaddr = htobe64(eq->ba);
 
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		device_printf(vi->dev,
 		    "failed to create Ethernet egress queue: %d\n", rc);
 		return (rc);
 	}
 	eq->flags |= EQ_ALLOCATED;
 
 	eq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(be32toh(c.eqid_pkd));
 	eq->abs_id = G_FW_EQ_ETH_CMD_PHYSEQID(be32toh(c.physeqid_pkd));
 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
 	if (cntxt_id >= sc->sge.neq)
 	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
 		cntxt_id, sc->sge.neq - 1);
 	sc->sge.eqmap[cntxt_id] = eq;
 
 	return (rc);
 }
 
 #ifdef TCP_OFFLOAD
 static int
 ofld_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
 {
 	int rc, cntxt_id;
 	struct fw_eq_ofld_cmd c;
 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
 
 	bzero(&c, sizeof(c));
 
 	c.op_to_vfn = htonl(V_FW_CMD_OP(FW_EQ_OFLD_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_OFLD_CMD_PFN(sc->pf) |
 	    V_FW_EQ_OFLD_CMD_VFN(0));
 	c.alloc_to_len16 = htonl(F_FW_EQ_OFLD_CMD_ALLOC |
 	    F_FW_EQ_OFLD_CMD_EQSTART | FW_LEN16(c));
 	c.fetchszm_to_iqid =
 		htonl(V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
 		    V_FW_EQ_OFLD_CMD_PCIECHN(eq->tx_chan) |
 		    F_FW_EQ_OFLD_CMD_FETCHRO | V_FW_EQ_OFLD_CMD_IQID(eq->iqid));
 	c.dcaen_to_eqsize =
 	    htobe32(V_FW_EQ_OFLD_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
 		V_FW_EQ_OFLD_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
 		V_FW_EQ_OFLD_CMD_EQSIZE(qsize));
 	c.eqaddr = htobe64(eq->ba);
 
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		device_printf(vi->dev,
 		    "failed to create egress queue for TCP offload: %d\n", rc);
 		return (rc);
 	}
 	eq->flags |= EQ_ALLOCATED;
 
 	eq->cntxt_id = G_FW_EQ_OFLD_CMD_EQID(be32toh(c.eqid_pkd));
 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
 	if (cntxt_id >= sc->sge.neq)
 	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
 		cntxt_id, sc->sge.neq - 1);
 	sc->sge.eqmap[cntxt_id] = eq;
 
 	return (rc);
 }
 #endif
 
 /*
  * Allocate an egress queue of the type encoded in eq->flags.
  *
  * Initializes the queue lock, allocates the descriptor ring (eq->sidx
  * descriptors plus the space taken by spg_len), resets the software
  * indices, and asks the firmware to create the queue through the
  * type-specific helper.  When the adapter advertises any user doorbell
  * (UDB, UDBWC or WCWR) the doorbell address and the queue's id within its
  * doorbell page are computed from the context id the firmware returned.
  *
  * Returns 0 on success.  On failure the error from alloc_ring() or from the
  * type-specific helper is returned; the lock (and the ring, if it was
  * allocated) are left in place here.
  * NOTE(review): callers presumably release them with free_eq(), which
  * checks mtx_initialized() before destroying the lock -- confirm.
  */
 static int
 alloc_eq(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
 {
 	int rc, qsize;
 	size_t len;
 
 	mtx_init(&eq->eq_lock, eq->lockname, NULL, MTX_DEF);
 
 	qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
 	len = qsize * EQ_ESIZE;
 	rc = alloc_ring(sc, len, &eq->desc_tag, &eq->desc_map,
 	    &eq->ba, (void **)&eq->desc);
 	if (rc)
 		return (rc);
 
 	eq->pidx = eq->cidx = 0;
 	eq->equeqidx = eq->dbidx = 0;
 	eq->doorbells = sc->doorbells;
 
 	switch (eq->flags & EQ_TYPEMASK) {
 	case EQ_CTRL:
 		rc = ctrl_eq_alloc(sc, eq);
 		break;
 
 	case EQ_ETH:
 		rc = eth_eq_alloc(sc, vi, eq);
 		break;
 
 #ifdef TCP_OFFLOAD
 	case EQ_OFLD:
 		rc = ofld_eq_alloc(sc, vi, eq);
 		break;
 #endif
 
 	default:
 		panic("%s: invalid eq type %d.", __func__,
 		    eq->flags & EQ_TYPEMASK);
 	}
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to allocate egress queue(%d): %d\n",
 		    eq->flags & EQ_TYPEMASK, rc);
 		/*
 		 * The type-specific helpers assign eq->cntxt_id only after
 		 * the firmware command succeeds, so there is no valid
 		 * context id to derive a doorbell address from.
 		 */
 		return (rc);
 	}
 
 	if (isset(&eq->doorbells, DOORBELL_UDB) ||
 	    isset(&eq->doorbells, DOORBELL_UDBWC) ||
 	    isset(&eq->doorbells, DOORBELL_WCWR)) {
 		uint32_t s_qpp = sc->params.sge.eq_s_qpp;
 		uint32_t mask = (1 << s_qpp) - 1;
 		volatile uint8_t *udb;
 
 		udb = sc->udbs_base + UDBS_DB_OFFSET;
 		udb += (eq->cntxt_id >> s_qpp) << PAGE_SHIFT;	/* pg offset */
 		eq->udb_qid = eq->cntxt_id & mask;		/* id in page */
 		if (eq->udb_qid >= PAGE_SIZE / UDBS_SEG_SIZE) {
 			/*
 			 * No segment of its own within the page: keep the
 			 * qid and give up on the WCWR doorbell.
 			 */
 			clrbit(&eq->doorbells, DOORBELL_WCWR);
 		} else {
 			udb += eq->udb_qid << UDBS_SEG_SHIFT;	/* seg offset */
 			eq->udb_qid = 0;
 		}
 		eq->udb = (volatile void *)udb;
 	}
 
 	return (0);
 }
 
 static int
 free_eq(struct adapter *sc, struct sge_eq *eq)
 {
 	int rc;
 
 	if (eq->flags & EQ_ALLOCATED) {
 		switch (eq->flags & EQ_TYPEMASK) {
 		case EQ_CTRL:
 			rc = -t4_ctrl_eq_free(sc, sc->mbox, sc->pf, 0,
 			    eq->cntxt_id);
 			break;
 
 		case EQ_ETH:
 			rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0,
 			    eq->cntxt_id);
 			break;
 
 #ifdef TCP_OFFLOAD
 		case EQ_OFLD:
 			rc = -t4_ofld_eq_free(sc, sc->mbox, sc->pf, 0,
 			    eq->cntxt_id);
 			break;
 #endif
 
 		default:
 			panic("%s: invalid eq type %d.", __func__,
 			    eq->flags & EQ_TYPEMASK);
 		}
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to free egress queue (%d): %d\n",
 			    eq->flags & EQ_TYPEMASK, rc);
 			return (rc);
 		}
 		eq->flags &= ~EQ_ALLOCATED;
 	}
 
 	free_ring(sc, eq->desc_tag, eq->desc_map, eq->ba, eq->desc);
 
 	if (mtx_initialized(&eq->eq_lock))
 		mtx_destroy(&eq->eq_lock);
 
 	bzero(eq, sizeof(*eq));
 	return (0);
 }
 
 static int
 alloc_wrq(struct adapter *sc, struct vi_info *vi, struct sge_wrq *wrq,
     struct sysctl_oid *oid)
 {
 	int rc;
 	struct sysctl_ctx_list *ctx = vi ? &vi->ctx : &sc->ctx;
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	rc = alloc_eq(sc, vi, &wrq->eq);
 	if (rc)
 		return (rc);
 
 	wrq->adapter = sc;
 	TASK_INIT(&wrq->wrq_tx_task, 0, wrq_tx_drain, wrq);
 	TAILQ_INIT(&wrq->incomplete_wrs);
 	STAILQ_INIT(&wrq->wr_list);
 	wrq->nwr_pending = 0;
 	wrq->ndesc_needed = 0;
 
 	SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
 	    &wrq->eq.ba, "bus address of descriptor ring");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
 	    wrq->eq.sidx * EQ_ESIZE + sc->params.sge.spg_len,
 	    "desc ring size in bytes");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
 	    &wrq->eq.cntxt_id, 0, "SGE context id of the queue");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.cidx, 0, sysctl_uint16, "I",
 	    "consumer index");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.pidx, 0, sysctl_uint16, "I",
 	    "producer index");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "sidx", CTLFLAG_RD, NULL,
 	    wrq->eq.sidx, "status page index");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_direct", CTLFLAG_RD,
 	    &wrq->tx_wrs_direct, "# of work requests (direct)");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_copied", CTLFLAG_RD,
 	    &wrq->tx_wrs_copied, "# of work requests (copied)");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_sspace", CTLFLAG_RD,
 	    &wrq->tx_wrs_ss, "# of work requests (copied from scratch space)");
 
 	return (rc);
 }
 
 static int
 free_wrq(struct adapter *sc, struct sge_wrq *wrq)
 {
 	int rc;
 
 	rc = free_eq(sc, &wrq->eq);
 	if (rc)
 		return (rc);
 
 	bzero(wrq, sizeof(*wrq));
 	return (0);
 }
 
 static int
 alloc_txq(struct vi_info *vi, struct sge_txq *txq, int idx,
     struct sysctl_oid *oid)
 {
 	int rc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct sge_eq *eq = &txq->eq;
 	char name[16];
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	rc = mp_ring_alloc(&txq->r, eq->sidx, txq, eth_tx, can_resume_eth_tx,
 	    M_CXGBE, M_WAITOK);
 	if (rc != 0) {
 		device_printf(sc->dev, "failed to allocate mp_ring: %d\n", rc);
 		return (rc);
 	}
 
 	rc = alloc_eq(sc, vi, eq);
 	if (rc != 0) {
 		mp_ring_free(txq->r);
 		txq->r = NULL;
 		return (rc);
 	}
 
 	/* Can't fail after this point. */
 
 	if (idx == 0)
 		sc->sge.eq_base = eq->abs_id - eq->cntxt_id;
 	else
 		KASSERT(eq->cntxt_id + sc->sge.eq_base == eq->abs_id,
 		    ("eq_base mismatch"));
 	KASSERT(sc->sge.eq_base == 0 || sc->flags & IS_VF,
 	    ("PF with non-zero eq_base"));
 
 	TASK_INIT(&txq->tx_reclaim_task, 0, tx_reclaim, eq);
 	txq->ifp = vi->ifp;
 	txq->gl = sglist_alloc(TX_SGL_SEGS, M_WAITOK);
 	if (sc->flags & IS_VF)
 		txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
 		    V_TXPKT_INTF(pi->tx_chan));
 	else
 		txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
 		    V_TXPKT_INTF(pi->tx_chan) |
 		    V_TXPKT_PF(G_FW_VIID_PFN(vi->viid)) |
 		    V_TXPKT_VF(G_FW_VIID_VIN(vi->viid)) |
 		    V_TXPKT_VF_VLD(G_FW_VIID_VIVLD(vi->viid)));
 	txq->tc_idx = -1;
 	txq->sdesc = malloc(eq->sidx * sizeof(struct tx_sdesc), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	snprintf(name, sizeof(name), "%d", idx);
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
 	    NULL, "tx queue");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_UAUTO(&vi->ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
 	    &eq->ba, "bus address of descriptor ring");
 	SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
 	    eq->sidx * EQ_ESIZE + sc->params.sge.spg_len,
 	    "desc ring size in bytes");
 	SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "abs_id", CTLFLAG_RD,
 	    &eq->abs_id, 0, "absolute id of the queue");
 	SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
 	    &eq->cntxt_id, 0, "SGE context id of the queue");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &eq->cidx, 0, sysctl_uint16, "I",
 	    "consumer index");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "pidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &eq->pidx, 0, sysctl_uint16, "I",
 	    "producer index");
 	SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "sidx", CTLFLAG_RD, NULL,
 	    eq->sidx, "status page index");
 
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "tc",
 	    CTLTYPE_INT | CTLFLAG_RW, vi, idx, sysctl_tc, "I",
 	    "traffic class (-1 means none)");
 
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txcsum", CTLFLAG_RD,
 	    &txq->txcsum, "# of times hardware assisted with checksum");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_insertion",
 	    CTLFLAG_RD, &txq->vlan_insertion,
 	    "# of times hardware inserted 802.1Q tag");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "tso_wrs", CTLFLAG_RD,
 	    &txq->tso_wrs, "# of TSO work requests");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "imm_wrs", CTLFLAG_RD,
 	    &txq->imm_wrs, "# of work requests with immediate data");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "sgl_wrs", CTLFLAG_RD,
 	    &txq->sgl_wrs, "# of work requests with direct SGL");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkt_wrs", CTLFLAG_RD,
 	    &txq->txpkt_wrs, "# of txpkt work requests (one pkt/WR)");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts0_wrs",
 	    CTLFLAG_RD, &txq->txpkts0_wrs,
 	    "# of txpkts (type 0) work requests");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts1_wrs",
 	    CTLFLAG_RD, &txq->txpkts1_wrs,
 	    "# of txpkts (type 1) work requests");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts0_pkts",
 	    CTLFLAG_RD, &txq->txpkts0_pkts,
 	    "# of frames tx'd using type0 txpkts work requests");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts1_pkts",
 	    CTLFLAG_RD, &txq->txpkts1_pkts,
 	    "# of frames tx'd using type1 txpkts work requests");
 
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_enqueues",
 	    CTLFLAG_RD, &txq->r->enqueues,
 	    "# of enqueues to the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_drops",
 	    CTLFLAG_RD, &txq->r->drops,
 	    "# of drops in the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_starts",
 	    CTLFLAG_RD, &txq->r->starts,
 	    "# of normal consumer starts in the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_stalls",
 	    CTLFLAG_RD, &txq->r->stalls,
 	    "# of consumer stalls in the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_restarts",
 	    CTLFLAG_RD, &txq->r->restarts,
 	    "# of consumer restarts in the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_abdications",
 	    CTLFLAG_RD, &txq->r->abdications,
 	    "# of consumer abdications in the mp_ring for this queue");
 
 	return (0);
 }
 
 static int
 free_txq(struct vi_info *vi, struct sge_txq *txq)
 {
 	int rc;
 	struct adapter *sc = vi->pi->adapter;
 	struct sge_eq *eq = &txq->eq;
 
 	rc = free_eq(sc, eq);
 	if (rc)
 		return (rc);
 
 	sglist_free(txq->gl);
 	free(txq->sdesc, M_CXGBE);
 	mp_ring_free(txq->r);
 
 	bzero(txq, sizeof(*txq));
 	return (0);
 }
 
 /*
  * DMA load callback for mappings that consist of exactly one segment.  'arg'
  * points at a bus_addr_t that receives the segment's address, or 0 when the
  * load reported an error.
  */
 static void
 oneseg_dma_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error)
 {
 	bus_addr_t *addrp = arg;
 
 	KASSERT(nseg == 1,
 	    ("%s meant for single segment mappings only.", __func__));
 
 	if (error != 0)
 		*addrp = 0;
 	else
 		*addrp = segs->ds_addr;
 }
 
 /*
  * Ring the freelist doorbell for everything between fl->dbidx and the
  * current producer position (fl->pidx / 8), then advance fl->dbidx by the
  * same amount.  The doorbell goes to fl->udb when one is set and to the
  * adapter's kernel doorbell register otherwise.
  */
 static inline void
 ring_fl_db(struct adapter *sc, struct sge_fl *fl)
 {
 	uint32_t delta, dbval;
 
 	delta = IDXDIFF(fl->pidx / 8, fl->dbidx, fl->sidx);
 	MPASS(delta > 0);
 
 	/* Write barrier before the doorbell write. */
 	wmb();
 	dbval = fl->dbval | V_PIDX(delta);
 	if (fl->udb) {
 		*fl->udb = htole32(dbval);
 	} else {
 		t4_write_reg(sc, sc->sge_kdoorbell_reg, dbval);
 	}
 	IDXINCR(fl->dbidx, delta, fl->sidx);
 }
 
 /*
  * Fills up the freelist by allocating up to 'n' buffers.  Buffers that are
  * recycled do not count towards this allocation budget.
  *
  * Must be called with the freelist lock held.  Filling starts at fl->pidx
  * and stops when the budget is used up, when an allocation fails and no
  * fallback zone is left, or when the producer reaches the hardware
  * descriptor just before the one holding the hw cidx.  The doorbell is rung
  * along the way and once more at the end if anything is still pending.
  *
  * Returns non-zero to indicate that this freelist should be added to the list
  * of starving freelists.
  */
 static int
 refill_fl(struct adapter *sc, struct sge_fl *fl, int n)
 {
 	__be64 *d;
 	struct fl_sdesc *sd;
 	uintptr_t pa;
 	caddr_t cl;
 	struct cluster_layout *cll;
 	struct sw_zone_info *swz;
 	struct cluster_metadata *clm;
 	uint16_t max_pidx;
 	uint16_t hw_cidx = fl->hw_cidx;		/* stable snapshot */
 
 	FL_LOCK_ASSERT_OWNED(fl);
 
 	/*
 	 * We always stop at the beginning of the hardware descriptor that's just
 	 * before the one with the hw cidx.  This is to avoid hw pidx = hw cidx,
 	 * which would mean an empty freelist to the chip.
 	 */
 	max_pidx = __predict_false(hw_cidx == 0) ? fl->sidx - 1 : hw_cidx - 1;
 	/* fl->pidx counts buffers, 8 to a hardware descriptor. */
 	if (fl->pidx == max_pidx * 8)
 		return (0);
 
 	d = &fl->desc[fl->pidx];
 	sd = &fl->sdesc[fl->pidx];
 	cll = &fl->cll_def;	/* default layout */
 	swz = &sc->sge.sw_zone_info[cll->zidx];
 
 	while (n > 0) {
 
 		/* A cluster is still attached to this slot: try to reuse it. */
 		if (sd->cl != NULL) {
 
 			if (sd->nmbuf == 0) {
 				/*
 				 * Fast recycle without involving any atomics on
 				 * the cluster's metadata (if the cluster has
 				 * metadata).  This happens when all frames
 				 * received in the cluster were small enough to
 				 * fit within a single mbuf each.
 				 */
 				fl->cl_fast_recycled++;
 #ifdef INVARIANTS
 				clm = cl_metadata(sc, fl, &sd->cll, sd->cl);
 				if (clm != NULL)
 					MPASS(clm->refcount == 1);
 #endif
 				goto recycled_fast;
 			}
 
 			/*
 			 * Cluster is guaranteed to have metadata.  Clusters
 			 * without metadata always take the fast recycle path
 			 * when they're recycled.
 			 */
 			clm = cl_metadata(sc, fl, &sd->cll, sd->cl);
 			MPASS(clm != NULL);
 
 			/*
 			 * Drop this slot's reference.  If it was the last one
 			 * the cluster can be reused in place.
 			 */
 			if (atomic_fetchadd_int(&clm->refcount, -1) == 1) {
 				fl->cl_recycled++;
 				counter_u64_add(extfree_rels, 1);
 				goto recycled;
 			}
 			sd->cl = NULL;	/* gave up my reference */
 		}
 		MPASS(sd->cl == NULL);
 alloc:
 		/* Non-sleeping allocation from the zone currently in use. */
 		cl = uma_zalloc(swz->zone, M_NOWAIT);
 		if (__predict_false(cl == NULL)) {
 			/*
 			 * Give up if the alternate layout is already in use,
 			 * does not exist, or shares the default's zone.
 			 */
 			if (cll == &fl->cll_alt || fl->cll_alt.zidx == -1 ||
 			    fl->cll_def.zidx == fl->cll_alt.zidx)
 				break;
 
 			/* fall back to the safe zone */
 			/* The alternate layout stays selected for the rest of this call. */
 			cll = &fl->cll_alt;
 			swz = &sc->sge.sw_zone_info[cll->zidx];
 			goto alloc;
 		}
 		/* Only freshly allocated clusters are charged to the budget. */
 		fl->cl_allocated++;
 		n--;
 
 		/* Hardware descriptor: physical address plus cll->hwidx in the low bits. */
 		pa = pmap_kextract((vm_offset_t)cl);
 		pa += cll->region1;
 		sd->cl = cl;
 		sd->cll = *cll;
 		*d = htobe64(pa | cll->hwidx);
 		clm = cl_metadata(sc, fl, cll, cl);
 		if (clm != NULL) {
 			/* The recycle path joins here with clm already set. */
 recycled:
 #ifdef INVARIANTS
 			clm->sd = sd;
 #endif
 			clm->refcount = 1;
 		}
 		sd->nmbuf = 0;
 recycled_fast:
 		d++;
 		sd++;
 		/* Hardware descriptor boundary: every 8th buffer. */
 		if (__predict_false(++fl->pidx % 8 == 0)) {
 			uint16_t pidx = fl->pidx / 8;
 
 			/* Wrap around at the end of the ring. */
 			if (__predict_false(pidx == fl->sidx)) {
 				fl->pidx = 0;
 				pidx = 0;
 				sd = fl->sdesc;
 				d = fl->desc;
 			}
 			if (pidx == max_pidx)
 				break;
 
 			/* Ring the doorbell once at least 4 descriptors are pending. */
 			if (IDXDIFF(pidx, fl->dbidx, fl->sidx) >= 4)
 				ring_fl_db(sc, fl);
 		}
 	}
 
 	/* Tell the hardware about any complete descriptors not yet announced. */
 	if (fl->pidx / 8 != fl->dbidx)
 		ring_fl_db(sc, fl);
 
 	return (FL_RUNNING_LOW(fl) && !(fl->flags & FL_STARVING));
 }
 
 /*
  * Attempt to refill all starving freelists.
  */
 static void
 refill_sfl(void *arg)
 {
 	struct adapter *sc = arg;
 	struct sge_fl *fl, *fl_temp;
 
 	mtx_assert(&sc->sfl_lock, MA_OWNED);
 	TAILQ_FOREACH_SAFE(fl, &sc->sfl, link, fl_temp) {
 		FL_LOCK(fl);
 		refill_fl(sc, fl, 64);
 		if (FL_NOT_RUNNING_LOW(fl) || fl->flags & FL_DOOMED) {
 			TAILQ_REMOVE(&sc->sfl, fl, link);
 			fl->flags &= ~FL_STARVING;
 		}
 		FL_UNLOCK(fl);
 	}
 
 	if (!TAILQ_EMPTY(&sc->sfl))
 		callout_schedule(&sc->sfl_callout, hz / 5);
 }
 
 /*
  * Allocate the zeroed software descriptor array of a freelist: 8 entries
  * for each of its fl->sidx hardware descriptors.  The allocation uses
  * M_WAITOK and the function always returns 0.
  */
 static int
 alloc_fl_sdesc(struct sge_fl *fl)
 {
 
 	fl->sdesc = malloc(fl->sidx * 8 * sizeof(struct fl_sdesc), M_CXGBE,
 	    M_WAITOK | M_ZERO);
 	return (0);
 }
 
 /*
  * Release every cluster still attached to a freelist's software descriptors
  * and then the descriptor array itself.
  *
  * A cluster with no mbufs handed out is returned to its zone directly.  For
  * any other cluster the slot's reference is dropped through the metadata,
  * and the cluster is returned to its zone only if that was the last
  * reference.
  */
 static void
 free_fl_sdesc(struct adapter *sc, struct sge_fl *fl)
 {
 	struct cluster_metadata *clm;
 	struct cluster_layout *cll;
 	struct fl_sdesc *sd;
 	int i;
 
 	for (i = 0, sd = fl->sdesc; i < fl->sidx * 8; i++, sd++) {
 		if (sd->cl != NULL) {
 			cll = &sd->cll;
 			clm = cl_metadata(sc, fl, cll, sd->cl);
 			if (sd->nmbuf == 0) {
 				uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone,
 				    sd->cl);
 			} else if (clm != NULL &&
 			    atomic_fetchadd_int(&clm->refcount, -1) == 1) {
 				uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone,
 				    sd->cl);
 				counter_u64_add(extfree_rels, 1);
 			}
 			sd->cl = NULL;
 		}
 	}
 
 	free(fl->sdesc, M_CXGBE);
 	fl->sdesc = NULL;
 }
 
 static inline void
 get_pkt_gl(struct mbuf *m, struct sglist *gl)
 {
 	int rc;
 
 	M_ASSERTPKTHDR(m);
 
 	sglist_reset(gl);
 	rc = sglist_append_mbuf(gl, m);
 	if (__predict_false(rc != 0)) {
 		panic("%s: mbuf %p (%d segs) was vetted earlier but now fails "
 		    "with %d.", __func__, m, mbuf_nsegs(m), rc);
 	}
 
 	KASSERT(gl->sg_nseg == mbuf_nsegs(m),
 	    ("%s: nsegs changed for mbuf %p from %d to %d", __func__, m,
 	    mbuf_nsegs(m), gl->sg_nseg));
 	KASSERT(gl->sg_nseg > 0 &&
 	    gl->sg_nseg <= (needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS),
 	    ("%s: %d segments, should have been 1 <= nsegs <= %d", __func__,
 		gl->sg_nseg, needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS));
 }
 
 /*
  * len16 for a txpkt WR with a GL.  Includes the firmware work request header.
  *
  * 'nsegs' must be at least 1.  'tso' non-zero adds room for the LSO CPL.
  * The result is the byte length rounded up to units of 16.
  */
 static inline u_int
 txpkt_len16(u_int nsegs, u_int tso)
 {
 	u_int extra, nbytes;
 
 	MPASS(nsegs > 0);
 
 	/* The first segment is part of ulptx_sgl itself. */
 	extra = nsegs - 1;
 
 	nbytes = sizeof(struct fw_eth_tx_pkt_wr) +
 	    sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl);
 	nbytes += 8 * ((3 * extra) / 2 + (extra & 1));
 	if (tso != 0)
 		nbytes += sizeof(struct cpl_tx_pkt_lso_core);
 
 	return (howmany(nbytes, 16));
 }
 
 /*
  * len16 for a txpkt_vm WR with a GL.  Includes the firmware work
  * request header.
  *
  * 'nsegs' must be at least 1.  'tso' non-zero adds room for the LSO CPL.
  * The result is the byte length rounded up to units of 16.
  */
 static inline u_int
 txpkt_vm_len16(u_int nsegs, u_int tso)
 {
 	u_int extra, nbytes;
 
 	MPASS(nsegs > 0);
 
 	/* The first segment is part of ulptx_sgl itself. */
 	extra = nsegs - 1;
 
 	nbytes = sizeof(struct fw_eth_tx_pkt_vm_wr) +
 	    sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl);
 	nbytes += 8 * ((3 * extra) / 2 + (extra & 1));
 	if (tso != 0)
 		nbytes += sizeof(struct cpl_tx_pkt_lso_core);
 
 	return (howmany(nbytes, 16));
 }
 
 /*
  * len16 for a txpkts type 0 WR with a GL.  Does not include the firmware work
  * request header.
  *
  * 'nsegs' must be at least 1.  The result is the byte length rounded up to
  * units of 16.
  */
 static inline u_int
 txpkts0_len16(u_int nsegs)
 {
 	u_int extra, nbytes;
 
 	MPASS(nsegs > 0);
 
 	/* The first segment is part of ulptx_sgl itself. */
 	extra = nsegs - 1;
 
 	nbytes = sizeof(struct ulp_txpkt) + sizeof(struct ulptx_idata) +
 	    sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl);
 	nbytes += 8 * ((3 * extra) / 2 + (extra & 1));
 
 	return (howmany(nbytes, 16));
 }
 
 /*
  * len16 for a txpkts type 1 WR with a GL.  Does not include the firmware work
  * request header.
  *
  * The length is a constant: one CPL plus one ulptx_sgl, rounded up to units
  * of 16.
  */
 static inline u_int
 txpkts1_len16(void)
 {
 	u_int nbytes = sizeof(struct cpl_tx_pkt_core) +
 	    sizeof(struct ulptx_sgl);
 
 	return (howmany(nbytes, 16));
 }
 
 /*
  * Number of bytes left in 'ndesc' hardware descriptors after the firmware
  * work request header and the CPL header have been accounted for.
  */
 static inline u_int
 imm_payload(u_int ndesc)
 {
 	u_int hdrlen = sizeof(struct fw_eth_tx_pkt_wr) +
 	    sizeof(struct cpl_tx_pkt_core);
 
 	return (ndesc * EQ_ESIZE - hdrlen);
 }
 
 /*
  * Write a VM txpkt WR for this packet to the hardware descriptors and update
  * the software descriptor at the current pidx.  It is guaranteed that enough
  * descriptors are available.
  *
  * 'wr' must point at the hardware descriptor at eq->pidx and the tx queue
  * lock must be held.  This function only reads eq->pidx; it does not
  * advance it.
  *
  * The return value is the # of hardware descriptors used.
  */
 static u_int
 write_txpkt_vm_wr(struct adapter *sc, struct sge_txq *txq,
     struct fw_eth_tx_pkt_vm_wr *wr, struct mbuf *m0, u_int available)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;	/* used in many unrelated places */
 	uint64_t ctrl1;
 	int csum_type, len16, ndesc, pktlen, nsegs;
 	caddr_t dst;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	M_ASSERTPKTHDR(m0);
 	MPASS(available > 0 && available < eq->sidx);
 
 	len16 = mbuf_len16(m0);
 	nsegs = mbuf_nsegs(m0);
 	pktlen = m0->m_pkthdr.len;
 	/* ctrl first holds the immediate data length: the CPL header(s). */
 	ctrl = sizeof(struct cpl_tx_pkt_core);
 	if (needs_tso(m0))
 		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
 	/* len16 is in units of 16 bytes; convert to hardware descriptors. */
 	ndesc = howmany(len16, EQ_ESIZE / 16);
 	MPASS(ndesc <= available);
 
 	/* Firmware work request header */
 	MPASS(wr == (void *)&eq->desc[eq->pidx]);
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_VM_WR) |
 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
 
 	ctrl = V_FW_WR_LEN16(len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->r3[0] = 0;
 	wr->r3[1] = 0;
 	
 	/*
 	 * Copy over ethmacdst, ethmacsrc, ethtype, and vlantci.
 	 * vlantci is ignored unless the ethtype is 0x8100, so it's
 	 * simpler to always copy it rather than making it
 	 * conditional.  Also, it seems that we do not have to set
 	 * vlantci or fake the ethtype when doing VLAN tag insertion.
 	 */
 	m_copydata(m0, 0, sizeof(struct ether_header) + 2, wr->ethmacdst);
 
 	/* csum_type stays -1 when no L4 checksum offload applies. */
 	csum_type = -1;
 	if (needs_tso(m0)) {
 		/* The LSO CPL sits right after the work request header. */
 		struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
 
 		KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
 		    m0->m_pkthdr.l4hlen > 0,
 		    ("%s: mbuf %p needs TSO but missing header lengths",
 			__func__, m0));
 
 		/* IP and TCP header lengths are given in 32-bit words. */
 		ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
 		    F_LSO_LAST_SLICE | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2)
 		    | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
 		if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header))
 			ctrl |= V_LSO_ETHHDR_LEN(1);
 		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
 			ctrl |= F_LSO_IPV6;
 
 		lso->lso_ctrl = htobe32(ctrl);
 		lso->ipid_ofst = htobe16(0);
 		lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
 		lso->seqno_offset = htobe32(0);
 		lso->len = htobe32(pktlen);
 
 		/* An l3hlen equal to sizeof(struct ip6_hdr) is treated as IPv6. */
 		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
 			csum_type = TX_CSUM_TCPIP6;
 		else
 			csum_type = TX_CSUM_TCPIP;
 
 		/* The TX_PKT CPL follows the LSO CPL. */
 		cpl = (void *)(lso + 1);
 
 		txq->tso_wrs++;
 	} else {
 		/* Pick the checksum type from the mbuf's csum_flags. */
 		if (m0->m_pkthdr.csum_flags & CSUM_IP_TCP)
 			csum_type = TX_CSUM_TCPIP;
 		else if (m0->m_pkthdr.csum_flags & CSUM_IP_UDP)
 			csum_type = TX_CSUM_UDPIP;
 		else if (m0->m_pkthdr.csum_flags & CSUM_IP6_TCP)
 			csum_type = TX_CSUM_TCPIP6;
 		else if (m0->m_pkthdr.csum_flags & CSUM_IP6_UDP)
 			csum_type = TX_CSUM_UDPIP6;
 #if defined(INET)
 		else if (m0->m_pkthdr.csum_flags & CSUM_IP) {
 			/*
 			 * XXX: The firmware appears to stomp on the
 			 * fragment/flags field of the IP header when
 			 * using TX_CSUM_IP.  Fall back to doing
 			 * software checksums.
 			 */
 			u_short *sump;
 			struct mbuf *m;
 			int offset;
 
 			/*
 			 * Locate ip_sum in the mbuf chain, store the software
 			 * checksum there, and clear the offload request.
 			 */
 			m = m0;
 			offset = 0;
 			sump = m_advance(&m, &offset, m0->m_pkthdr.l2hlen +
 			    offsetof(struct ip, ip_sum));
 			*sump = in_cksum_skip(m0, m0->m_pkthdr.l2hlen +
 			    m0->m_pkthdr.l3hlen, m0->m_pkthdr.l2hlen);
 			m0->m_pkthdr.csum_flags &= ~CSUM_IP;
 		}
 #endif
 
 		/* Without LSO the TX_PKT CPL directly follows the WR header. */
 		cpl = (void *)(wr + 1);
 	}
 
 	/* Checksum offload */
 	ctrl1 = 0;
 	if (needs_l3_csum(m0) == 0)
 		ctrl1 |= F_TXPKT_IPCSUM_DIS;
 	if (csum_type >= 0) {
 		KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0,
 	    ("%s: mbuf %p needs checksum offload but missing header lengths",
 			__func__, m0));
 
 		/* Chips after T5 use a different macro for the Ethernet header length. */
 		if (chip_id(sc) <= CHELSIO_T5) {
 			ctrl1 |= V_TXPKT_ETHHDR_LEN(m0->m_pkthdr.l2hlen -
 			    ETHER_HDR_LEN);
 		} else {
 			ctrl1 |= V_T6_TXPKT_ETHHDR_LEN(m0->m_pkthdr.l2hlen -
 			    ETHER_HDR_LEN);
 		}
 		ctrl1 |= V_TXPKT_IPHDR_LEN(m0->m_pkthdr.l3hlen);
 		ctrl1 |= V_TXPKT_CSUM_TYPE(csum_type);
 	} else
 		ctrl1 |= F_TXPKT_L4CSUM_DIS;
 	if (m0->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
 	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
 		txq->txcsum++;	/* some hardware assistance provided */
 
 	/* VLAN tag insertion */
 	if (needs_vlan_insertion(m0)) {
 		ctrl1 |= F_TXPKT_VLAN_VLD |
 		    V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
 		txq->vlan_insertion++;
 	}
 
 	/* CPL header */
 	cpl->ctrl0 = txq->cpl_ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 	cpl->ctrl1 = htobe64(ctrl1);
 
 	/* SGL */
 	dst = (void *)(cpl + 1);
 
 	/*
 	 * A packet using TSO will use up an entire descriptor for the
 	 * firmware work request header, LSO CPL, and TX_PKT_XT CPL.
 	 * If this descriptor is the last descriptor in the ring, wrap
 	 * around to the front of the ring explicitly for the start of
 	 * the sgl.
 	 */
 	if (dst == (void *)&eq->desc[eq->sidx]) {
 		dst = (void *)&eq->desc[0];
 		write_gl_to_txd(txq, m0, &dst, 0);
 	} else
 		/* Last argument is non-zero when the WR runs past the end of the ring. */
 		write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
 	txq->sgl_wrs++;
 
 	txq->txpkt_wrs++;
 
 	/* Record the mbuf and descriptor count in the software descriptor. */
 	txsd = &txq->sdesc[eq->pidx];
 	txsd->m = m0;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * Write a txpkt WR (FW_ETH_TX_PKT_WR) for this packet to the hardware
  * descriptors and record the mbuf and the descriptor count in the software
  * descriptor at eq->pidx.  It is guaranteed that enough descriptors are
  * available.
  *
  * A non-TSO packet no longer than imm_payload(2) is copied into the WR as
  * immediate data when at least 2 descriptors are available; any other packet
  * is described by a scatter/gather list.
  *
  * NOTE(review): this function reads eq->pidx but never writes it; the pidx is
  * presumably advanced by the caller -- confirm.
  *
  * The return value is the # of hardware descriptors used.
  */
 static u_int
 write_txpkt_wr(struct sge_txq *txq, struct fw_eth_tx_pkt_wr *wr,
     struct mbuf *m0, u_int available)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;	/* used in many unrelated places */
 	uint64_t ctrl1;
 	int len16, ndesc, pktlen, nsegs;
 	caddr_t dst;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	M_ASSERTPKTHDR(m0);
 	MPASS(available > 0 && available < eq->sidx);
 
 	/*
 	 * First use of ctrl: the immediate data length for the WR header.  It
 	 * always covers the TX_PKT CPL, plus the LSO CPL for TSO or the whole
 	 * payload when the packet is sent as immediate data.
 	 */
 	len16 = mbuf_len16(m0);
 	nsegs = mbuf_nsegs(m0);
 	pktlen = m0->m_pkthdr.len;
 	ctrl = sizeof(struct cpl_tx_pkt_core);
 	if (needs_tso(m0))
 		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
 	else if (pktlen <= imm_payload(2) && available >= 2) {
 		/* Immediate data.  Recalculate len16 and set nsegs to 0. */
 		ctrl += pktlen;
 		len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) +
 		    sizeof(struct cpl_tx_pkt_core) + pktlen, 16);
 		nsegs = 0;
 	}
 	/* len16 is in 16 byte units; convert it to whole descriptors. */
 	ndesc = howmany(len16, EQ_ESIZE / 16);
 	MPASS(ndesc <= available);
 
 	/* Firmware work request header */
 	MPASS(wr == (void *)&eq->desc[eq->pidx]);
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
 
 	/* Second use of ctrl: the WR length field. */
 	ctrl = V_FW_WR_LEN16(len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->r3 = 0;
 
 	if (needs_tso(m0)) {
 		/* The LSO CPL sits between the WR header and the TX_PKT CPL. */
 		struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
 
 		KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
 		    m0->m_pkthdr.l4hlen > 0,
 		    ("%s: mbuf %p needs TSO but missing header lengths",
 			__func__, m0));
 
 		/* Third use of ctrl: the LSO control word. */
 		ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
 		    F_LSO_LAST_SLICE | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2)
 		    | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
 		if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header))
 			ctrl |= V_LSO_ETHHDR_LEN(1);
 		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
 			ctrl |= F_LSO_IPV6;
 
 		lso->lso_ctrl = htobe32(ctrl);
 		lso->ipid_ofst = htobe16(0);
 		lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
 		lso->seqno_offset = htobe32(0);
 		lso->len = htobe32(pktlen);
 
 		cpl = (void *)(lso + 1);
 
 		txq->tso_wrs++;
 	} else
 		cpl = (void *)(wr + 1);
 
 	/* Checksum offload */
 	ctrl1 = 0;
 	if (needs_l3_csum(m0) == 0)
 		ctrl1 |= F_TXPKT_IPCSUM_DIS;
 	if (needs_l4_csum(m0) == 0)
 		ctrl1 |= F_TXPKT_L4CSUM_DIS;
 	if (m0->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
 	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
 		txq->txcsum++;	/* some hardware assistance provided */
 
 	/* VLAN tag insertion */
 	if (needs_vlan_insertion(m0)) {
 		ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
 		txq->vlan_insertion++;
 	}
 
 	/* CPL header */
 	cpl->ctrl0 = txq->cpl_ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 	cpl->ctrl1 = htobe64(ctrl1);
 
 	/* SGL */
 	dst = (void *)(cpl + 1);
 	if (nsegs > 0) {
 
 		/* The last argument tells the SGL writer the WR may wrap. */
 		write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
 		txq->sgl_wrs++;
 	} else {
 		struct mbuf *m;
 
 		/* Immediate data: copy every mbuf in the chain into the WR. */
 		for (m = m0; m != NULL; m = m->m_next) {
 			copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len);
 #ifdef INVARIANTS
 			pktlen -= m->m_len;
 #endif
 		}
 #ifdef INVARIANTS
 		KASSERT(pktlen == 0, ("%s: %d bytes left.", __func__, pktlen));
 #endif
 		txq->imm_wrs++;
 	}
 
 	txq->txpkt_wrs++;
 
 	/* The software descriptor keeps the mbuf until it is reclaimed. */
 	txsd = &txq->sdesc[eq->pidx];
 	txsd->m = m0;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * Check whether packets m and n can be the first two packets of a coalesced
  * txpkts work request that fits in 'available' descriptors.
  *
  * Returns 0 and sets up txp (wr_type, len16, plen, npkt = 2) and the len16 of
  * both mbufs on success.  Returns 1 if the pair cannot be coalesced; in that
  * case txp->wr_type, txp->len16 and txp->plen may already have been written.
  */
 static int
 try_txpkts(struct mbuf *m, struct mbuf *n, struct txpkts *txp, u_int available)
 {
 	u_int ndesc, segs_m, segs_n, len16_m, len16_n;
 
 	if (cannot_use_txpkts(m))
 		return (1);
 	if (cannot_use_txpkts(n))
 		return (1);
 
 	segs_m = mbuf_nsegs(m);
 	segs_n = mbuf_nsegs(n);
 	if (segs_m + segs_n != 2) {
 		/* At least one packet has multiple segments: type 0 WR. */
 		txp->wr_type = 0;
 		len16_m = txpkts0_len16(segs_m);
 		len16_n = txpkts0_len16(segs_n);
 	} else {
 		/* Both packets are single segment: type 1 WR. */
 		txp->wr_type = 1;
 		len16_m = txpkts1_len16();
 		len16_n = len16_m;
 	}
 
 	txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + len16_m +
 	    len16_n;
 	ndesc = howmany(txp->len16, EQ_ESIZE / 16);
 	if (ndesc > available || ndesc > SGE_MAX_WR_NDESC)
 		return (1);
 
 	/* The combined payload length must fit in 16 bits. */
 	txp->plen = m->m_pkthdr.len + n->m_pkthdr.len;
 	if (txp->plen > 65535)
 		return (1);
 
 	set_mbuf_len16(m, len16_m);
 	set_mbuf_len16(n, len16_n);
 	txp->npkt = 2;
 
 	return (0);
 }
 
 /*
  * Try to append packet m to the txpkts work request being accumulated in txp.
  *
  * Returns 0 after updating txp (npkt, plen, len16) and the mbuf's len16, or 1
  * with txp untouched if the packet cannot be added (TSO, wrong segment count
  * for a type 1 WR, payload over 65535 bytes, or not enough descriptors).
  */
 static int
 add_to_txpkts(struct mbuf *m, struct txpkts *txp, u_int available)
 {
 	u_int total_plen, pkt_len16, ndesc, segs;
 
 	MPASS(txp->wr_type == 0 || txp->wr_type == 1);
 
 	segs = mbuf_nsegs(m);
 	if (needs_tso(m))
 		return (1);
 	if (txp->wr_type == 1 && segs != 1)
 		return (1);
 
 	total_plen = txp->plen + m->m_pkthdr.len;
 	if (total_plen > 65535)
 		return (1);
 
 	if (txp->wr_type != 0)
 		pkt_len16 = txpkts1_len16();
 	else
 		pkt_len16 = txpkts0_len16(segs);
 
 	ndesc = howmany(txp->len16 + pkt_len16, EQ_ESIZE / 16);
 	if (ndesc > available || ndesc > SGE_MAX_WR_NDESC)
 		return (1);
 
 	/* Commit: the packet is now part of the WR. */
 	set_mbuf_len16(m, pkt_len16);
 	txp->len16 += pkt_len16;
 	txp->plen = total_plen;
 	txp->npkt++;
 
 	return (0);
 }
 
 /*
  * Write a txpkts WR (FW_ETH_TX_PKTS_WR) for the packets in txp to the hardware
  * descriptors and record the first mbuf and the descriptor count in the
  * software descriptor at eq->pidx.  It is guaranteed that enough descriptors
  * are available.
  *
  * m0 is the head of the m_nextpkt chain of packets described by txp.  For a
  * type 0 WR every packet is preceded by a ULP_TX_PKT master command and an
  * immediate-data subcommand; for a type 1 WR the TX_PKT CPL is written
  * directly.  Every packet's payload is described by a scatter/gather list.
  *
  * The txpkts0 and txpkts1 statistics are updated once per work request.
  *
  * NOTE(review): this function reads eq->pidx but never writes it; the pidx is
  * presumably advanced by the caller -- confirm.
  *
  * The return value is the # of hardware descriptors used.
  */
 static u_int
 write_txpkts_wr(struct sge_txq *txq, struct fw_eth_tx_pkts_wr *wr,
     struct mbuf *m0, const struct txpkts *txp, u_int available)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;
 	uint64_t ctrl1;
 	int ndesc, checkwrap;
 	struct mbuf *m;
 	void *flitp;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	MPASS(txp->npkt > 0);
 	MPASS(txp->plen < 65536);
 	MPASS(m0 != NULL);
 	MPASS(m0->m_nextpkt != NULL);
 	MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16));
 	MPASS(available > 0 && available < eq->sidx);
 
 	/* len16 is in 16 byte units; convert it to whole descriptors. */
 	ndesc = howmany(txp->len16, EQ_ESIZE / 16);
 	MPASS(ndesc <= available);
 
 	/* Firmware work request header */
 	MPASS(wr == (void *)&eq->desc[eq->pidx]);
 	wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR));
 	ctrl = V_FW_WR_LEN16(txp->len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->plen = htobe16(txp->plen);
 	wr->npkt = txp->npkt;
 	wr->r3 = 0;
 	wr->type = txp->wr_type;
 	flitp = wr + 1;
 
 	/*
 	 * At this point we are 16B into a hardware descriptor.  If checkwrap is
 	 * set then we know the WR is going to wrap around somewhere.  We'll
 	 * check for that at appropriate points.
 	 */
 	checkwrap = eq->sidx - ndesc < eq->pidx;
 	for (m = m0; m != NULL; m = m->m_nextpkt) {
 		if (txp->wr_type == 0) {
 			struct ulp_txpkt *ulpmc;
 			struct ulptx_idata *ulpsc;
 
 			/* ULP master command */
 			ulpmc = flitp;
 			ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) |
 			    V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(eq->iqid));
 			ulpmc->len = htobe32(mbuf_len16(m));
 
 			/* ULP subcommand */
 			ulpsc = (void *)(ulpmc + 1);
 			ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) |
 			    F_ULP_TX_SC_MORE);
 			ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core));
 
 			/* The CPL may land exactly at the end of the ring. */
 			cpl = (void *)(ulpsc + 1);
 			if (checkwrap &&
 			    (uintptr_t)cpl == (uintptr_t)&eq->desc[eq->sidx])
 				cpl = (void *)&eq->desc[0];
 		} else {
 			cpl = flitp;
 		}
 
 		/* Checksum offload */
 		ctrl1 = 0;
 		if (needs_l3_csum(m) == 0)
 			ctrl1 |= F_TXPKT_IPCSUM_DIS;
 		if (needs_l4_csum(m) == 0)
 			ctrl1 |= F_TXPKT_L4CSUM_DIS;
 		if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
 		    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
 			txq->txcsum++;	/* some hardware assistance provided */
 
 		/* VLAN tag insertion */
 		if (needs_vlan_insertion(m)) {
 			ctrl1 |= F_TXPKT_VLAN_VLD |
 			    V_TXPKT_VLAN(m->m_pkthdr.ether_vtag);
 			txq->vlan_insertion++;
 		}
 
 		/* CPL header */
 		cpl->ctrl0 = txq->cpl_ctrl0;
 		cpl->pack = 0;
 		cpl->len = htobe16(m->m_pkthdr.len);
 		cpl->ctrl1 = htobe64(ctrl1);
 
 		/* The SGL follows the CPL, wrapping to the ring start if needed. */
 		flitp = cpl + 1;
 		if (checkwrap &&
 		    (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx])
 			flitp = (void *)&eq->desc[0];
 
 		write_gl_to_txd(txq, m, (caddr_t *)(&flitp), checkwrap);
 
 	}
 
 	/*
 	 * Account for this work request exactly once.  Doing this inside the
 	 * per-packet loop would count the WR npkt times and the packets
 	 * npkt * npkt times.
 	 */
 	if (txp->wr_type == 0) {
 		txq->txpkts0_pkts += txp->npkt;
 		txq->txpkts0_wrs++;
 	} else {
 		txq->txpkts1_pkts += txp->npkt;
 		txq->txpkts1_wrs++;
 	}
 
 	/* The software descriptor keeps the mbuf chain until it is reclaimed. */
 	txsd = &txq->sdesc[eq->pidx];
 	txsd->m = m0;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * Build the scatter/gather list for mbuf m (via get_pkt_gl into txq->gl) and
  * write it to the tx descriptor ring as a ULP_TX_SC_DSGL starting at *to.
  *
  * *to must be 16 byte aligned and inside the ring.  On return *to points just
  * past the SGL (16 byte aligned), or at the start of the ring if the SGL
  * ended exactly at the end of the ring.  checkwrap is nonzero if the SGL may
  * run past the end of the ring and wrap around.
  *
  * If the SGL ends on an address that is not 16 byte aligned, this function will
  * add a 0 filled flit at the end.
  */
 static void
 write_gl_to_txd(struct sge_txq *txq, struct mbuf *m, caddr_t *to, int checkwrap)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct sglist *gl = txq->gl;
 	struct sglist_seg *seg;
 	__be64 *flitp, *wrap;
 	struct ulptx_sgl *usgl;
 	int i, nflits, nsegs;
 
 	KASSERT(((uintptr_t)(*to) & 0xf) == 0,
 	    ("%s: SGL must start at a 16 byte boundary: %p", __func__, *to));
 	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
 	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
 
 	get_pkt_gl(m, gl);
 	nsegs = gl->sg_nseg;
 	MPASS(nsegs > 0);
 
 	/*
 	 * Size of the SGL in 8 byte flits: 2 for the header and first segment,
 	 * then 3 for every pair of additional segments and 2 for an odd one
 	 * left over.
 	 */
 	nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2;
 	flitp = (__be64 *)(*to);
 	wrap = (__be64 *)(&eq->desc[eq->sidx]);	/* first flit past the ring */
 	seg = &gl->sg_segs[0];
 	usgl = (void *)flitp;
 
 	/*
 	 * We start at a 16 byte boundary somewhere inside the tx descriptor
 	 * ring, so we're at least 16 bytes away from the status page.  There is
 	 * no chance of a wrap around in the middle of usgl (which is 16 bytes).
 	 */
 
 	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 	    V_ULPTX_NSGE(nsegs));
 	usgl->len0 = htobe32(seg->ss_len);
 	usgl->addr0 = htobe64(seg->ss_paddr);
 	seg++;
 
 	if (checkwrap == 0 || (uintptr_t)(flitp + nflits) <= (uintptr_t)wrap) {
 
 		/* Won't wrap around at all */
 
 		for (i = 0; i < nsegs - 1; i++, seg++) {
 			usgl->sge[i / 2].len[i & 1] = htobe32(seg->ss_len);
 			usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ss_paddr);
 		}
 		/* Odd number of extra segments: zero the unused length slot. */
 		if (i & 1)
 			usgl->sge[i / 2].len[1] = htobe32(0);
 		flitp += nflits;
 	} else {
 
 		/* Will wrap somewhere in the rest of the SGL */
 
 		/* 2 flits already written, write the rest flit by flit */
 		flitp = (void *)(usgl + 1);
 		for (i = 0; i < nflits - 2; i++) {
 			if (flitp == wrap)
 				flitp = (void *)eq->desc;
 			*flitp++ = get_flit(seg, nsegs - 1, i);
 		}
 	}
 
 	/* An odd flit count leaves us 8 bytes short of a 16 byte boundary. */
 	if (nflits & 1) {
 		MPASS(((uintptr_t)flitp) & 0xf);
 		*flitp++ = 0;
 	}
 
 	MPASS((((uintptr_t)flitp) & 0xf) == 0);
 	if (__predict_false(flitp == wrap))
 		*to = (void *)eq->desc;
 	else
 		*to = (void *)flitp;
 }
 
 /*
  * Copy len bytes from 'from' into the descriptor ring at *to and advance *to
  * past the copied data.  A copy that would run past the end of the ring is
  * split in two, with the remainder placed at the start of the ring.
  */
 static inline void
 copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len)
 {
 	const uintptr_t ring_end = (uintptr_t)&eq->desc[eq->sidx];
 	int head, tail;
 
 	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
 	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
 
 	if (__predict_false((uintptr_t)(*to) + len > ring_end)) {
 		/* Wraps: 'head' bytes fit before the end, 'tail' bytes don't. */
 		head = ring_end - (uintptr_t)(*to);
 		tail = len - head;
 
 		bcopy(from, *to, head);
 		bcopy(from + head, (void *)eq->desc, tail);
 		*to = (caddr_t)eq->desc + tail;
 		return;
 	}
 
 	bcopy(from, *to, len);
 	*to += len;
 }
 
 /*
  * Ring the doorbell of egress queue eq to tell the hardware about n new
  * descriptors, then advance eq->dbidx by n (modulo eq->sidx).
  *
  * The doorbell mechanism used is the lowest numbered one set in
  * eq->doorbells, except that DOORBELL_WCWR is not considered when n > 1.
  */
 static inline void
 ring_eq_db(struct adapter *sc, struct sge_eq *eq, u_int n)
 {
 	u_int db;
 
 	MPASS(n > 0);
 
 	/* Work on a copy so that eq->doorbells itself is never modified. */
 	db = eq->doorbells;
 	if (n > 1)
 		clrbit(&db, DOORBELL_WCWR);
 	/* Write barrier before the doorbell write. */
 	wmb();
 
 	switch (ffs(db) - 1) {
 	case DOORBELL_UDB:
 		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
 		break;
 
 	case DOORBELL_WCWR: {
 		volatile uint64_t *dst, *src;
 		int i;
 
 		/*
 		 * Queues whose 128B doorbell segment fits in the page do not
 		 * use relative qid (udb_qid is always 0).  Only queues with
 		 * doorbell segments can do WCWR.
 		 */
 		KASSERT(eq->udb_qid == 0 && n == 1,
 		    ("%s: inappropriate doorbell (0x%x, %d, %d) for eq %p",
 		    __func__, eq->doorbells, n, eq->dbidx, eq));
 
 		/*
 		 * Copy the single descriptor at dbidx, 64 bits at a time, to
 		 * the address derived from the doorbell address.
 		 */
 		dst = (volatile void *)((uintptr_t)eq->udb + UDBS_WR_OFFSET -
 		    UDBS_DB_OFFSET);
 		i = eq->dbidx;
 		src = (void *)&eq->desc[i];
 		while (src != (void *)&eq->desc[i + 1])
 			*dst++ = *src++;
 		wmb();
 		break;
 	}
 
 	case DOORBELL_UDBWC:
 		/* Same write as DOORBELL_UDB, followed by a write barrier. */
 		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
 		wmb();
 		break;
 
 	case DOORBELL_KDB:
 		/* Register write identified by the queue's context id. */
 		t4_write_reg(sc, sc->sge_kdoorbell_reg,
 		    V_QID(eq->cntxt_id) | V_PIDX(n));
 		break;
 	}
 
 	IDXINCR(eq->dbidx, n, eq->sidx);
 }
 
 /*
  * Number of descriptors between the driver's cidx and the cidx reported by
  * the hardware, i.e. how many descriptors can be reclaimed right now.
  */
 static inline u_int
 reclaimable_tx_desc(struct sge_eq *eq)
 {
 	const uint16_t hw = read_hw_cidx(eq);
 
 	return (IDXDIFF(hw, eq->cidx, eq->sidx));
 }
 
 /*
  * Number of descriptors available for new work requests, measured from the
  * driver's pidx up to the cidx reported by the hardware.  One descriptor is
  * always held back, so an empty ring yields sidx - 1.
  */
 static inline u_int
 total_available_tx_desc(struct sge_eq *eq)
 {
 	const uint16_t hw = read_hw_cidx(eq);
 	const uint16_t sw = eq->pidx;
 
 	if (sw != hw)
 		return (IDXDIFF(hw, sw, eq->sidx) - 1);
 
 	return (eq->sidx - 1);
 }
 
 /*
  * Return the consumer index stored (big-endian) in the status page, which is
  * located right after the last descriptor of the ring, in host byte order.
  */
 static inline uint16_t
 read_hw_cidx(struct sge_eq *eq)
 {
 	struct sge_qstat *status_page;
 	uint16_t raw;
 
 	status_page = (void *)&eq->desc[eq->sidx];
 	raw = status_page->cidx;	/* read once, convert the copy */
 
 	return (be16toh(raw));
 }
 
 /*
  * Reclaim approximately 'n' descriptors: whole work requests are released,
  * starting at eq->cidx, until at least n descriptors have been reclaimed or
  * nothing reclaimable is left.  The mbufs attached to each software
  * descriptor are freed.  Returns the # of descriptors reclaimed.
  */
 static u_int
 reclaim_tx_descs(struct sge_txq *txq, u_int n)
 {
 	struct sge_eq *eq = &txq->eq;
 	u_int pending, done;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	MPASS(n > 0);
 
 	pending = reclaimable_tx_desc(eq);
 	for (done = 0; pending != 0 && done < n; ) {
 		struct tx_sdesc *sd = &txq->sdesc[eq->cidx];
 		struct mbuf *pkt, *following;
 		int used = sd->desc_used;
 
 		/* Firmware doesn't return "partial" credits. */
 		KASSERT(pending >= used,
 		    ("%s: unexpected number of credits: %d, %d",
 		    __func__, pending, used));
 
 		/* Unlink and free every packet attached to this WR. */
 		pkt = sd->m;
 		while (pkt != NULL) {
 			following = pkt->m_nextpkt;
 			pkt->m_nextpkt = NULL;
 			m_freem(pkt);
 			pkt = following;
 		}
 
 		done += used;
 		pending -= used;
 		IDXINCR(eq->cidx, used, eq->sidx);
 	}
 
 	return (done);
 }
 
 /*
  * Task handler: reclaim tx descriptors of the txq passed in arg, 32 at a
  * time, until nothing more is reclaimed or the txq lock cannot be taken
  * without blocking.  The incoming value of n is not used.
  */
 static void
 tx_reclaim(void *arg, int n)
 {
 	struct sge_txq *txq = arg;
 	struct sge_eq *eq = &txq->eq;
 
 	for (;;) {
 		if (TXQ_TRYLOCK(txq) == 0)
 			return;
 
 		n = reclaim_tx_descs(txq, 32);
 		/* Ring fully drained: resync equeqidx with pidx. */
 		if (eq->cidx == eq->pidx)
 			eq->equeqidx = eq->pidx;
 		TXQ_UNLOCK(txq);
 
 		if (n <= 0)
 			return;
 	}
 }
 
 static __be64
 get_flit(struct sglist_seg *segs, int nsegs, int idx)
 {
 	int i = (idx / 3) * 2;
 
 	switch (idx % 3) {
 	case 0: {
 		__be64 rc;
 
 		rc = htobe32(segs[i].ss_len);
 		if (i + 1 < nsegs)
 			rc |= (uint64_t)htobe32(segs[i + 1].ss_len) << 32;
 
 		return (rc);
 	}
 	case 1:
 		return (htobe64(segs[i].ss_paddr));
 	case 2:
 		return (htobe64(segs[i + 1].ss_paddr));
 	}
 
 	return (0);
 }
 
 /*
  * Pick the preferred refill source (software zone + hardware buffer size) for
  * freelist fl given a max payload of maxp, and record the choice and the
  * resulting cluster layout in fl->cll_def (zidx, hwidx, region1, region3).
  *
  * region1 is the space in front of the payload that is used for inlined
  * mbufs (a multiple of MSIZE, possibly 0) and region3 is whatever is left of
  * the cluster after region1 and the hardware buffer.
  */
 static void
 find_best_refill_source(struct adapter *sc, struct sge_fl *fl, int maxp)
 {
 	int8_t zidx, hwidx, idx;
 	uint16_t region1, region3;
 	int spare, spare_needed, n;
 	struct sw_zone_info *swz;
 	struct hw_buf_info *hwb, *hwb_list = &sc->sge.hw_buf_info[0];
 
 	/*
 	 * Buffer Packing: Look for PAGE_SIZE or larger zone which has a bufsize
 	 * large enough for the max payload and cluster metadata.  Otherwise
 	 * settle for the largest bufsize that leaves enough room in the cluster
 	 * for metadata.
 	 *
 	 * Without buffer packing: Look for the smallest zone which has a
 	 * bufsize large enough for the max payload.  Settle for the largest
 	 * bufsize available if there's nothing big enough for max payload.
 	 */
 	spare_needed = fl->flags & FL_BUF_PACKING ? CL_METADATA_SIZE : 0;
 	swz = &sc->sge.sw_zone_info[0];
 	hwidx = -1;
 	for (zidx = 0; zidx < SW_ZONE_SIZES; zidx++, swz++) {
 		if (swz->size > largest_rx_cluster) {
 			if (__predict_true(hwidx != -1))
 				break;
 
 			/*
 			 * This is a misconfiguration.  largest_rx_cluster is
 			 * preventing us from finding a refill source.  See
 			 * dev.t5nex.<n>.buffer_sizes to figure out why.
 			 */
 			device_printf(sc->dev, "largest_rx_cluster=%u leaves no"
 			    " refill source for fl %p (dma %u).  Ignored.\n",
 			    largest_rx_cluster, fl, maxp);
 		}
 		/* Take the first hw buffer in this zone with enough spare room. */
 		for (idx = swz->head_hwidx; idx != -1; idx = hwb->next) {
 			hwb = &hwb_list[idx];
 			spare = swz->size - hwb->size;
 			if (spare < spare_needed)
 				continue;
 
 			hwidx = idx;		/* best option so far */
 			if (hwb->size >= maxp) {
 
 				if ((fl->flags & FL_BUF_PACKING) == 0)
 					goto done; /* stop looking (not packing) */
 
 				if (swz->size >= safest_rx_cluster)
 					goto done; /* stop looking (packing) */
 			}
 			break;		/* keep looking, next zone */
 		}
 	}
 done:
 	/* A usable hwidx has been located. */
 	MPASS(hwidx != -1);
 	hwb = &hwb_list[hwidx];
 	zidx = hwb->zidx;
 	swz = &sc->sge.sw_zone_info[zidx];
 	region1 = 0;
 	region3 = swz->size - hwb->size;
 
 	/*
 	 * Stay within this zone and see if there is a better match when mbuf
 	 * inlining is allowed.  Remember that the hwidx's are sorted in
 	 * decreasing order of size (so in increasing order of spare area).
 	 */
 	for (idx = hwidx; idx != -1; idx = hwb->next) {
 		hwb = &hwb_list[idx];
 		spare = swz->size - hwb->size;
 
 		if (allow_mbufs_in_cluster == 0 || hwb->size < maxp)
 			break;
 
 		/*
 		 * Do not inline mbufs if doing so would violate the pad/pack
 		 * boundary alignment requirement.
 		 */
 		if (fl_pad && (MSIZE % sc->params.sge.pad_boundary) != 0)
 			continue;
 		if (fl->flags & FL_BUF_PACKING &&
 		    (MSIZE % sc->params.sge.pack_boundary) != 0)
 			continue;
 
 		if (spare < CL_METADATA_SIZE + MSIZE)
 			continue;
 		/* n = number of mbufs that fit in the spare area. */
 		n = (spare - CL_METADATA_SIZE) / MSIZE;
 		if (n > howmany(hwb->size, maxp))
 			break;
 
 		hwidx = idx;
 		if (fl->flags & FL_BUF_PACKING) {
 			region1 = n * MSIZE;
 			region3 = spare - region1;
 		} else {
 			region1 = MSIZE;
 			region3 = spare - region1;
 			break;
 		}
 	}
 
 	KASSERT(zidx >= 0 && zidx < SW_ZONE_SIZES,
 	    ("%s: bad zone %d for fl %p, maxp %d", __func__, zidx, fl, maxp));
 	/* hwidx indexes hw_buf_info[], so SGE_FLBUF_SIZES itself is invalid. */
 	KASSERT(hwidx >= 0 && hwidx < SGE_FLBUF_SIZES,
 	    ("%s: bad hwidx %d for fl %p, maxp %d", __func__, hwidx, fl, maxp));
 	KASSERT(region1 + sc->sge.hw_buf_info[hwidx].size + region3 ==
 	    sc->sge.sw_zone_info[zidx].size,
 	    ("%s: bad buffer layout for fl %p, maxp %d. "
 		"cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
 		sc->sge.sw_zone_info[zidx].size, region1,
 		sc->sge.hw_buf_info[hwidx].size, region3));
 	if (fl->flags & FL_BUF_PACKING || region1 > 0) {
 		KASSERT(region3 >= CL_METADATA_SIZE,
 		    ("%s: no room for metadata.  fl %p, maxp %d; "
 		    "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
 		    sc->sge.sw_zone_info[zidx].size, region1,
 		    sc->sge.hw_buf_info[hwidx].size, region3));
 		KASSERT(region1 % MSIZE == 0,
 		    ("%s: bad mbuf region for fl %p, maxp %d. "
 		    "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
 		    sc->sge.sw_zone_info[zidx].size, region1,
 		    sc->sge.hw_buf_info[hwidx].size, region3));
 	}
 
 	fl->cll_def.zidx = zidx;
 	fl->cll_def.hwidx = hwidx;
 	fl->cll_def.region1 = region1;
 	fl->cll_def.region3 = region3;
 }
 
 /*
  * Select the fallback refill source for freelist fl from the adapter's
  * precomputed safe_hwidx1/safe_hwidx2 and record it, with the resulting
  * cluster layout, in fl->cll_alt.  If there is no fallback, both
  * fl->cll_alt.hwidx and fl->cll_alt.zidx are set to -1.
  */
 static void
 find_safe_refill_source(struct adapter *sc, struct sge_fl *fl)
 {
 	struct sge *s = &sc->sge;
 	struct hw_buf_info *hwb;
 	struct sw_zone_info *swz;
 	int spare;
 	int8_t hwidx;
 
 	/* safe_hwidx1 unless one of the cases below prefers safe_hwidx2. */
 	hwidx = s->safe_hwidx1;
 	if (fl->flags & FL_BUF_PACKING) {
 		/* with room for metadata */
 		hwidx = s->safe_hwidx2;
 	} else if (allow_mbufs_in_cluster && s->safe_hwidx2 != -1) {
 		hwb = &s->hw_buf_info[s->safe_hwidx2];
 		swz = &s->sw_zone_info[hwb->zidx];
 		spare = swz->size - hwb->size;
 
 		/* only good if there is room for an mbuf as well */
 		if (spare >= CL_METADATA_SIZE + MSIZE)
 			hwidx = s->safe_hwidx2;
 	}
 
 	if (hwidx == -1) {
 		/* No fallback source */
 		fl->cll_alt.zidx = -1;
 		fl->cll_alt.hwidx = -1;
 
 		return;
 	}
 
 	hwb = &s->hw_buf_info[hwidx];
 	swz = &s->sw_zone_info[hwb->zidx];
 	spare = swz->size - hwb->size;
 
 	fl->cll_alt.hwidx = hwidx;
 	fl->cll_alt.zidx = hwb->zidx;
 	/* No inlined mbufs if disabled or if MSIZE would break the padding. */
 	if (!allow_mbufs_in_cluster ||
 	    (fl_pad != 0 && (MSIZE % sc->params.sge.pad_boundary) != 0))
 		fl->cll_alt.region1 = 0;
 	else
 		fl->cll_alt.region1 = ((spare - CL_METADATA_SIZE) / MSIZE) * MSIZE;
 	fl->cll_alt.region3 = spare - fl->cll_alt.region1;
 }
 
 /*
  * Mark freelist fl as starving, append it to the adapter's list of starving
  * freelists (sc->sfl) and (re)arm the callout that runs refill_sfl after
  * hz / 5 ticks.  Nothing is done if the freelist is flagged FL_DOOMED.
  */
 static void
 add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl)
 {
 	/* Lock order: the sfl lock is taken before the freelist lock. */
 	mtx_lock(&sc->sfl_lock);
 	FL_LOCK(fl);
 	if ((fl->flags & FL_DOOMED) == 0) {
 		fl->flags |= FL_STARVING;
 		TAILQ_INSERT_TAIL(&sc->sfl, fl, link);
 		callout_reset(&sc->sfl_callout, hz / 5, refill_sfl, sc);
 	}
 	FL_UNLOCK(fl);
 	mtx_unlock(&sc->sfl_lock);
 }
 
 /*
  * Egress update for a work request queue: clear the queue's equiq flag and
  * schedule the wrq's tx task on the taskqueue of its tx channel.
  */
 static void
 handle_wrq_egr_update(struct adapter *sc, struct sge_eq *eq)
 {
 	struct sge_wrq *wrq;
 
 	/* The eq is cast directly to the wrq that contains it. */
 	wrq = (void *)eq;
 
 	atomic_readandclear_int(&eq->equiq);
 	taskqueue_enqueue(sc->tq[eq->tx_chan], &wrq->wrq_tx_task);
 }
 
 /*
  * Egress update for an Ethernet tx queue: clear the queue's equiq flag, check
  * the txq's mp_ring for drainage and schedule the tx reclaim task on the
  * taskqueue of the queue's tx channel.
  */
 static void
 handle_eth_egr_update(struct adapter *sc, struct sge_eq *eq)
 {
 	struct sge_txq *txq;
 
 	MPASS((eq->flags & EQ_TYPEMASK) == EQ_ETH);
 
 	/* The eq is cast directly to the txq that contains it. */
 	txq = (void *)eq;
 
 	atomic_readandclear_int(&eq->equiq);
 	mp_ring_check_drainage(txq->r, 0);
 	taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task);
 }
 
 /*
  * CPL handler for SGE egress updates.  Looks up the egress queue named in the
  * message and calls the update handler that matches the queue's type
  * (eq->flags & EQ_TYPEMASK).  Always returns 0.
  */
 static int
 handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	/* Indexed by queue type; there is no handler for type 0. */
 	static void (*dispatch[])(struct adapter *, struct sge_eq *) = {
 		NULL,
 		&handle_wrq_egr_update,
 		&handle_eth_egr_update,
 		&handle_wrq_egr_update
 	};
 	const struct cpl_sge_egr_update *cpl = (const void *)(rss + 1);
 	struct adapter *sc = iq->adapter;
 	struct sge *s = &sc->sge;
 	struct sge_eq *eq;
 	unsigned int qid;
 
 	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
 	    rss->opcode));
 
 	qid = G_EGR_QID(ntohl(cpl->opcode_qid));
 	eq = s->eqmap[qid - s->eq_start - s->eq_base];
 	dispatch[eq->flags & EQ_TYPEMASK](sc, eq);
 
 	return (0);
 }
 
 /*
  * handle_fw_msg works for both fw4_msg and fw6_msg because the data field is
  * at the same offset in both structures.  This is verified at compile time.
  */
 CTASSERT(offsetof(struct cpl_fw4_msg, data) == \
     offsetof(struct cpl_fw6_msg, data));
 
 /*
  * CPL handler for firmware messages.  A message of type FW_TYPE_RSSCPL or
  * FW6_TYPE_RSSCPL carries an encapsulated CPL, which is passed to the CPL
  * handler for its opcode.  Any other message is passed to the firmware
  * message handler registered for its type.  Returns the handler's result.
  */
 static int
 handle_fw_msg(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	const struct cpl_fw6_msg *cpl = (const void *)(rss + 1);
 	struct adapter *sc = iq->adapter;
 	const struct rss_header *inner;
 
 	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
 	    rss->opcode));
 
 	if (cpl->type != FW_TYPE_RSSCPL && cpl->type != FW6_TYPE_RSSCPL)
 		return (t4_fw_msg_handler[cpl->type](sc, &cpl->data[0]));
 
 	/* The payload starts with the rss_header of the encapsulated CPL. */
 	inner = (const struct rss_header *)&cpl->data[0];
 
 	return (t4_cpl_handler[inner->opcode](iq, inner, m));
 }
 
 /**
  *	t4_handle_wrerr_rpl - process a FW work request error message
  *	@adap: the adapter
  *	@rpl: start of the FW message
  */
 static int
 t4_handle_wrerr_rpl(struct adapter *adap, const __be64 *rpl)
 {
 	u8 opcode = *(const u8 *)rpl;
 	const struct fw_error_cmd *e = (const void *)rpl;
 	unsigned int i;
 
 	if (opcode != FW_ERROR_CMD) {
 		log(LOG_ERR,
 		    "%s: Received WRERR_RPL message with opcode %#x\n",
 		    device_get_nameunit(adap->dev), opcode);
 		return (EINVAL);
 	}
 	log(LOG_ERR, "%s: FW_ERROR (%s) ", device_get_nameunit(adap->dev),
 	    G_FW_ERROR_CMD_FATAL(be32toh(e->op_to_type)) ? "fatal" :
 	    "non-fatal");
 	switch (G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type))) {
 	case FW_ERROR_TYPE_EXCEPTION:
 		log(LOG_ERR, "exception info:\n");
 		for (i = 0; i < nitems(e->u.exception.info); i++)
 			log(LOG_ERR, "%s%08x", i == 0 ? "\t" : " ",
 			    be32toh(e->u.exception.info[i]));
 		log(LOG_ERR, "\n");
 		break;
 	case FW_ERROR_TYPE_HWMODULE:
 		log(LOG_ERR, "HW module regaddr %08x regval %08x\n",
 		    be32toh(e->u.hwmodule.regaddr),
 		    be32toh(e->u.hwmodule.regval));
 		break;
 	case FW_ERROR_TYPE_WR:
 		log(LOG_ERR, "WR cidx %d PF %d VF %d eqid %d hdr:\n",
 		    be16toh(e->u.wr.cidx),
 		    G_FW_ERROR_CMD_PFN(be16toh(e->u.wr.pfn_vfn)),
 		    G_FW_ERROR_CMD_VFN(be16toh(e->u.wr.pfn_vfn)),
 		    be32toh(e->u.wr.eqid));
 		for (i = 0; i < nitems(e->u.wr.wrhdr); i++)
 			log(LOG_ERR, "%s%02x", i == 0 ? "\t" : " ",
 			    e->u.wr.wrhdr[i]);
 		log(LOG_ERR, "\n");
 		break;
 	case FW_ERROR_TYPE_ACL:
 		log(LOG_ERR, "ACL cidx %d PF %d VF %d eqid %d %s",
 		    be16toh(e->u.acl.cidx),
 		    G_FW_ERROR_CMD_PFN(be16toh(e->u.acl.pfn_vfn)),
 		    G_FW_ERROR_CMD_VFN(be16toh(e->u.acl.pfn_vfn)),
 		    be32toh(e->u.acl.eqid),
 		    G_FW_ERROR_CMD_MV(be16toh(e->u.acl.mv_pkd)) ? "vlanid" :
 		    "MAC");
 		for (i = 0; i < nitems(e->u.acl.val); i++)
 			log(LOG_ERR, " %02x", e->u.acl.val[i]);
 		log(LOG_ERR, "\n");
 		break;
 	default:
 		log(LOG_ERR, "type %#x\n",
 		    G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type)));
 		return (EINVAL);
 	}
 	return (0);
 }
 
 static int
 sysctl_uint16(SYSCTL_HANDLER_ARGS)
 {
 	uint16_t *id = arg1;
 	int i = *id;
 
 	return sysctl_handle_int(oidp, &i, 0, req);
 }
 
 /*
  * Sysctl handler that reports the sizes of all SGE_FLBUF_SIZES hardware
  * buffers of the struct sge in arg1 as one string.  A size is followed by '*'
  * if the buffer has a zone (zidx >= 0) whose size does not exceed
  * largest_rx_cluster.
  *
  * NOTE(review): for sizes that are not marked, a NUL byte is written through
  * the %c conversion -- confirm that this is what is intended.
  */
 static int
 sysctl_bufsizes(SYSCTL_HANDLER_ARGS)
 {
 	struct sge *s = arg1;
 	struct sw_zone_info *zones = &s->sw_zone_info[0];
 	struct hw_buf_info *hwb;
 	struct sbuf sb;
 	int k, rc;
 	char mark;
 
 	sbuf_new(&sb, NULL, 32, SBUF_AUTOEXTEND);
 	for (k = 0; k < SGE_FLBUF_SIZES; k++) {
 		hwb = &s->hw_buf_info[k];
 
 		mark = '\0';
 		if (hwb->zidx >= 0 &&
 		    zones[hwb->zidx].size <= largest_rx_cluster)
 			mark = '*';
 
 		sbuf_printf(&sb, "%u%c ", hwb->size, mark);
 	}
 	sbuf_trim(&sb);
 	sbuf_finish(&sb);
 
 	rc = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
 	sbuf_delete(&sb);
 
 	return (rc);
 }
 
 static int
 sysctl_tc(SYSCTL_HANDLER_ARGS)
 {
 	struct vi_info *vi = arg1;
 	struct port_info *pi;
 	struct adapter *sc;
 	struct sge_txq *txq;
 	struct tx_sched_class *tc;
 	int qidx = arg2, rc, tc_idx;
 	uint32_t fw_queue, fw_class;
 
 	MPASS(qidx >= 0 && qidx < vi->ntxq);
 	pi = vi->pi;
 	sc = pi->adapter;
 	txq = &sc->sge.txq[vi->first_txq + qidx];
 
 	tc_idx = txq->tc_idx;
 	rc = sysctl_handle_int(oidp, &tc_idx, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	/* Note that -1 is legitimate input (it means unbind). */
 	if (tc_idx < -1 || tc_idx >= sc->chip_params->nsched_cls)
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4stc");
 	if (rc)
 		return (rc);
 
 	if (tc_idx == txq->tc_idx) {
 		rc = 0;		/* No change, nothing to do. */
 		goto done;
 	}
 
 	fw_queue = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
 	    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_EQ_SCHEDCLASS_ETH) |
 	    V_FW_PARAMS_PARAM_YZ(txq->eq.cntxt_id);
 
 	if (tc_idx == -1)
 		fw_class = 0xffffffff;	/* Unbind. */
 	else {
 		/*
 		 * Bind to a different class.  Ethernet txq's are only allowed
 		 * to bind to cl-rl mode-class for now.  XXX: too restrictive.
 		 */
 		tc = &pi->tc[tc_idx];
 		if (tc->flags & TX_SC_OK &&
 		    tc->params.level == SCHED_CLASS_LEVEL_CL_RL &&
 		    tc->params.mode == SCHED_CLASS_MODE_CLASS) {
 			/* Ok to proceed. */
 			fw_class = tc_idx;
 		} else {
 			rc = tc->flags & TX_SC_OK ? EBUSY : ENXIO;
 			goto done;
 		}
 	}
 
 	rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &fw_queue, &fw_class);
 	if (rc == 0) {
 		if (txq->tc_idx != -1) {
 			tc = &pi->tc[txq->tc_idx];
 			MPASS(tc->refcount > 0);
 			tc->refcount--;
 		}
 		if (tc_idx != -1) {
 			tc = &pi->tc[tc_idx];
 			tc->refcount++;
 		}
 		txq->tc_idx = tc_idx;
 	}
 done:
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
Index: user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c	(revision 307895)
+++ user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c	(revision 307896)
@@ -1,4233 +1,4234 @@
 /*-
  * Copyright (c) 2010-2012 Citrix Inc.
  * Copyright (c) 2009-2012,2016 Microsoft Corp.
  * Copyright (c) 2012 NetApp Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*-
  * Copyright (c) 2004-2006 Kip Macy
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet6.h"
 #include "opt_inet.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sockio.h>
 #include <sys/mbuf.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/kernel.h>
 #include <sys/socket.h>
 #include <sys/queue.h>
 #include <sys/lock.h>
 #include <sys/sx.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/buf_ring.h>
 
 #include <net/if.h>
 #include <net/if_arp.h>
 #include <net/ethernet.h>
 #include <net/if_dl.h>
 #include <net/if_media.h>
 #include <net/rndis.h>
 #include <net/bpf.h>
 
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/if_vlan_var.h>
 
 #include <netinet/in_systm.h>
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/if_ether.h>
 #include <netinet/tcp.h>
 #include <netinet/udp.h>
 #include <netinet/ip6.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/pmap.h>
 
 #include <machine/bus.h>
 #include <machine/resource.h>
 #include <machine/frame.h>
 
 #include <sys/bus.h>
 #include <sys/rman.h>
 #include <sys/mutex.h>
 #include <sys/errno.h>
 #include <sys/types.h>
 #include <machine/atomic.h>
 
 #include <machine/intr_machdep.h>
 
 #include <machine/in_cksum.h>
 
 #include <dev/hyperv/include/hyperv.h>
 #include <dev/hyperv/include/hyperv_busdma.h>
 #include <dev/hyperv/include/vmbus_xact.h>
 
 #include <dev/hyperv/netvsc/hv_net_vsc.h>
 #include <dev/hyperv/netvsc/hv_rndis_filter.h>
 #include <dev/hyperv/netvsc/ndis.h>
 
 #include "vmbus_if.h"
 
 /* Short for Hyper-V network interface */
 #define NETVSC_DEVNAME    "hn"
 
 /*
  * It looks like offset 0 of buf is reserved to hold the softc pointer.
  * The sc pointer is evidently not needed, and is not presently populated.
  * The packet offset is where the netvsc_packet starts in the buffer.
  */
 #define HV_NV_SC_PTR_OFFSET_IN_BUF         0
 #define HV_NV_PACKET_OFFSET_IN_BUF         16
 
 /* YYY should get it from the underlying channel */
 #define HN_TX_DESC_CNT			512
 
 #define HN_LROENT_CNT_DEF		128
 
 #define HN_RING_CNT_DEF_MAX		8
 
 #define HN_RNDIS_PKT_LEN					\
 	(sizeof(struct rndis_packet_msg) +			\
 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
 
 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
 /* -1 for RNDIS packet message */
 #define HN_TX_DATA_SEGCNT_MAX		(NETVSC_PACKET_MAXPAGE - 1)
 
 #define HN_DIRECT_TX_SIZE_DEF		128
 
 #define HN_EARLY_TXEOF_THRESH		8
 
 /*
  * Per-packet TX descriptor.  Descriptors are handed out by
  * hn_txdesc_get() and returned by hn_txdesc_put().
  */
 struct hn_txdesc {
 #ifndef HN_USE_TXDESC_BUFRING
 	/* Free-list linkage; only present when buf_ring is not used. */
 	SLIST_ENTRY(hn_txdesc) link;
 #endif
 	struct mbuf	*m;		/* mbuf chain freed on final put */
 	struct hn_tx_ring *txr;
 	int		refs;		/* reference count, updated atomically */
 	uint32_t	flags;		/* HN_TXD_FLAG_ */
 	struct hn_send_ctx send_ctx;
 	/* HN_NVS_CHIM_IDX_INVALID when no chimney buffer is in use. */
 	uint32_t	chim_index;
 	int		chim_size;
 
 	bus_dmamap_t	data_dmap;	/* DMA map loaded with the mbuf data */
 
 	/* NOTE(review): presumably the bus address of rndis_pkt -- confirm. */
 	bus_addr_t	rndis_pkt_paddr;
 	struct rndis_packet_msg *rndis_pkt;
 	bus_dmamap_t	rndis_pkt_dmap;
 };
 
 #define HN_TXD_FLAG_ONLIST	0x1
 #define HN_TXD_FLAG_DMAMAP	0x2
 
 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
 /* YYY 2*MTU is a bit rough, but should be good enough. */
 #define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
 
 #define HN_LRO_ACKCNT_DEF		1
 
 #define HN_LOCK_INIT(sc)		\
 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
 #define HN_LOCK(sc)			sx_xlock(&(sc)->hn_lock)
 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
 
 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
 #define HN_CSUM_IP_HWASSIST(sc)		\
 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
 #define HN_CSUM_IP6_HWASSIST(sc)	\
 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
 
 /*
  * Globals
  */
 
 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
     "Hyper-V network interface");
 
 /* Trust TCP segment verification on host side. */
 static int hn_trust_hosttcp = 1;
 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
     &hn_trust_hosttcp, 0,
     "Trust tcp segement verification on host side, "
     "when csum info is missing (global setting)");
 
 /* Trust udp datagrams verification on host side. */
 static int hn_trust_hostudp = 1;
 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
     &hn_trust_hostudp, 0,
     "Trust udp datagram verification on host side, "
     "when csum info is missing (global setting)");
 
 /* Trust ip packets verification on host side. */
 static int hn_trust_hostip = 1;
 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
     &hn_trust_hostip, 0,
     "Trust ip packet verification on host side, "
     "when csum info is missing (global setting)");
 
 /* Limit TSO burst size */
 static int hn_tso_maxlen = IP_MAXPACKET;
 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
     &hn_tso_maxlen, 0, "TSO burst limit");
 
 /* Limit chimney send size */
 static int hn_tx_chimney_size = 0;
 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
 
 /* Limit the size of packet for direct transmission */
 static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
 
 #if defined(INET) || defined(INET6)
 #if __FreeBSD_version >= 1100095
 static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
     &hn_lro_entry_count, 0, "LRO entry count");
 #endif
 #endif
 
 static int hn_share_tx_taskq = 0;
 SYSCTL_INT(_hw_hn, OID_AUTO, share_tx_taskq, CTLFLAG_RDTUN,
     &hn_share_tx_taskq, 0, "Enable shared TX taskqueue");
 
 static struct taskqueue	*hn_tx_taskq;
 
 #ifndef HN_USE_TXDESC_BUFRING
 static int hn_use_txdesc_bufring = 0;
 #else
 static int hn_use_txdesc_bufring = 1;
 #endif
 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
 
 static int hn_bind_tx_taskq = -1;
 SYSCTL_INT(_hw_hn, OID_AUTO, bind_tx_taskq, CTLFLAG_RDTUN,
     &hn_bind_tx_taskq, 0, "Bind TX taskqueue to the specified cpu");
 
 static int hn_use_if_start = 0;
 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
     &hn_use_if_start, 0, "Use if_start TX method");
 
 static int hn_chan_cnt = 0;
 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
     &hn_chan_cnt, 0,
     "# of channels to use; each channel has one RX ring and one TX ring");
 
 static int hn_tx_ring_cnt = 0;
 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
     &hn_tx_ring_cnt, 0, "# of TX rings to use");
 
 static int hn_tx_swq_depth = 0;
 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
 
 #if __FreeBSD_version >= 1100095
 static u_int hn_lro_mbufq_depth = 0;
 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
 #endif
 
 static u_int hn_cpu_index;
 
 /*
  * Forward declarations
  */
 static void hn_stop(struct hn_softc *sc);
 static void hn_init_locked(struct hn_softc *sc);
 static void hn_init(void *xsc);
 static int  hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data);
 static int hn_start_locked(struct hn_tx_ring *txr, int len);
 static void hn_start(struct ifnet *ifp);
 static void hn_start_txeof(struct hn_tx_ring *);
 static int hn_ifmedia_upd(struct ifnet *ifp);
 static void hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr);
 #if __FreeBSD_version >= 1100099
 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
 static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
 #endif
 static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
 static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
 #if __FreeBSD_version < 1100095
 static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
 #else
 static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
 #endif
 static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
 static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
 static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
 static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
 static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
 static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
 static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
 static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
 static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
 static int hn_check_iplen(const struct mbuf *, int);
 static int hn_create_tx_ring(struct hn_softc *, int);
 static void hn_destroy_tx_ring(struct hn_tx_ring *);
 static int hn_create_tx_data(struct hn_softc *, int);
 static void hn_fixup_tx_data(struct hn_softc *);
 static void hn_destroy_tx_data(struct hn_softc *);
 static void hn_start_taskfunc(void *, int);
 static void hn_start_txeof_taskfunc(void *, int);
 static void hn_link_taskfunc(void *, int);
 static void hn_netchg_init_taskfunc(void *, int);
 static void hn_netchg_status_taskfunc(void *, int);
 static void hn_suspend_mgmt_taskfunc(void *, int);
 static int hn_encap(struct hn_tx_ring *, struct hn_txdesc *, struct mbuf **);
 static int hn_create_rx_data(struct hn_softc *sc, int);
 static void hn_destroy_rx_data(struct hn_softc *sc);
 static void hn_set_chim_size(struct hn_softc *, int);
 static void hn_set_tso_maxsize(struct hn_softc *, int, int);
 static int hn_chan_attach(struct hn_softc *, struct vmbus_channel *);
 static void hn_chan_detach(struct hn_softc *, struct vmbus_channel *);
 static int hn_attach_subchans(struct hn_softc *);
 static void hn_detach_allchans(struct hn_softc *);
 static void hn_chan_callback(struct vmbus_channel *chan, void *xrxr);
 static void hn_set_ring_inuse(struct hn_softc *, int);
 static int hn_synth_attach(struct hn_softc *, int);
 static void hn_synth_detach(struct hn_softc *);
 static bool hn_tx_ring_pending(struct hn_tx_ring *);
 static void hn_suspend(struct hn_softc *);
 static void hn_suspend_data(struct hn_softc *);
 static void hn_suspend_mgmt(struct hn_softc *);
 static void hn_resume(struct hn_softc *);
 static void hn_resume_data(struct hn_softc *);
 static void hn_resume_mgmt(struct hn_softc *);
 static void hn_rx_drain(struct vmbus_channel *);
 static void hn_tx_resume(struct hn_softc *, int);
 static void hn_tx_ring_qflush(struct hn_tx_ring *);
 static int netvsc_detach(device_t dev);
 static void hn_link_status(struct hn_softc *);
 static int hn_sendpkt_rndis_sglist(struct hn_tx_ring *, struct hn_txdesc *);
 static int hn_sendpkt_rndis_chim(struct hn_tx_ring *, struct hn_txdesc *);
 static int hn_set_rxfilter(struct hn_softc *);
 
 static void hn_nvs_handle_notify(struct hn_softc *sc,
 		const struct vmbus_chanpkt_hdr *pkt);
 static void hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
 		const struct vmbus_chanpkt_hdr *pkt);
 static void hn_nvs_handle_rxbuf(struct hn_softc *sc, struct hn_rx_ring *rxr,
 		struct vmbus_channel *chan,
 		const struct vmbus_chanpkt_hdr *pkthdr);
 static void hn_nvs_ack_rxbuf(struct vmbus_channel *chan, uint64_t tid);
 
 static int hn_transmit(struct ifnet *, struct mbuf *);
 static void hn_xmit_qflush(struct ifnet *);
 static int hn_xmit(struct hn_tx_ring *, int);
 static void hn_xmit_txeof(struct hn_tx_ring *);
 static void hn_xmit_taskfunc(void *, int);
 static void hn_xmit_txeof_taskfunc(void *, int);
 
 static const uint8_t	hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
 };
 
 #if __FreeBSD_version >= 1100099
 /*
  * Apply the LRO aggregation length limit 'lenlim' to every RX ring
  * that is currently in use.
  */
 static void
 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
 {
 	int ring = 0;
 
 	while (ring < sc->hn_rx_ring_inuse) {
 		sc->hn_rx_ring[ring].hn_lro.lro_length_lim = lenlim;
 		ring++;
 	}
 }
 #endif
 
 /*
  * Build an NVS RNDIS message of the given RNDIS message type, with the
  * chimney index marked invalid and a chimney size of 0, and send it
  * together with the supplied scatter/gather list.
  *
  * Returns whatever hn_nvs_send_sglist() returns.
  */
 static __inline int
 hn_nvs_send_rndis_sglist1(struct vmbus_channel *chan, uint32_t rndis_mtype,
     struct hn_send_ctx *sndc, struct vmbus_gpa *gpa, int gpa_cnt)
 {
 	struct hn_nvs_rndis msg;
 	int ret;
 
 	msg.nvs_type = HN_NVS_TYPE_RNDIS;
 	msg.nvs_rndis_mtype = rndis_mtype;
 	/* No chimney buffer is referenced by this message. */
 	msg.nvs_chim_idx = HN_NVS_CHIM_IDX_INVALID;
 	msg.nvs_chim_sz = 0;
 
 	ret = hn_nvs_send_sglist(chan, gpa, gpa_cnt, &msg, sizeof(msg), sndc);
 	return (ret);
 }
 
 int
 hn_nvs_send_rndis_ctrl(struct vmbus_channel *chan,
     struct hn_send_ctx *sndc, struct vmbus_gpa *gpa, int gpa_cnt)
 {
 
 	return hn_nvs_send_rndis_sglist1(chan, HN_NVS_RNDIS_MTYPE_CTRL,
 	    sndc, gpa, gpa_cnt);
 }
 
 /*
  * Send the RNDIS data packet of 'txd' using the TX ring's
  * scatter/gather list.  The descriptor must not use a chimney buffer.
  */
 static int
 hn_sendpkt_rndis_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
 {
 	struct hn_send_ctx *sndc = &txd->send_ctx;
 
 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
 
 	return (hn_nvs_send_rndis_sglist1(txr->hn_chan,
 	    HN_NVS_RNDIS_MTYPE_DATA, sndc, txr->hn_gpa, txr->hn_gpa_cnt));
 }
 
 static int
 hn_sendpkt_rndis_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
 {
 	struct hn_nvs_rndis rndis;
 
 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
 	    txd->chim_size > 0, ("invalid rndis chim txd"));
 
 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
 	rndis.nvs_chim_idx = txd->chim_index;
 	rndis.nvs_chim_sz = txd->chim_size;
 
 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
 	    &rndis, sizeof(rndis), &txd->send_ctx));
 }
 
 /*
  * Compute the NDIS RX packet filter from the interface flags and
  * program it if it differs from the cached value.
  *
  * The softc lock must be held exclusively.  Returns 0 when nothing
  * had to be changed, otherwise the result of hn_rndis_set_rxfilter().
  */
 static int
 hn_set_rxfilter(struct hn_softc *sc)
 {
 	struct ifnet *ifp = sc->hn_ifp;
 	uint32_t filter;
 	int error;
 
 	HN_LOCK_ASSERT(sc);
 
 	if ((ifp->if_flags & IFF_PROMISC) == 0) {
 		filter = NDIS_PACKET_TYPE_DIRECTED;
 		if ((ifp->if_flags & IFF_BROADCAST) != 0)
 			filter |= NDIS_PACKET_TYPE_BROADCAST;
 #ifdef notyet
 		/*
 		 * See the comment in SIOCADDMULTI/SIOCDELMULTI.
 		 */
 		/* TODO: support multicast list */
 		if ((ifp->if_flags & IFF_ALLMULTI) ||
 		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
 #else
 		/* Always enable ALLMULTI */
 		filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
 #endif
 	} else {
 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
 	}
 
 	/* Nothing to do if the filter is already in effect. */
 	if (sc->hn_rx_filter == filter)
 		return (0);
 
 	error = hn_rndis_set_rxfilter(sc, filter);
 	if (error == 0)
 		sc->hn_rx_filter = filter;
 	return (error);
 }
 
 /*
  * Return the software TX queue depth: the hn_tx_swq_depth tunable,
  * but never less than the number of TX descriptors of the ring.
  */
 static int
 hn_get_txswq_depth(const struct hn_tx_ring *txr)
 {
 	int depth = hn_tx_swq_depth;
 
 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
 
 	if (depth < txr->hn_txdesc_cnt)
 		depth = txr->hn_txdesc_cnt;
 	return (depth);
 }
 
 /*
  * Reconfigure RSS by disabling it and then enabling it again.
  *
  * The softc lock must be held exclusively.  Returns ENXIO if the
  * synthetic parts are not attached, otherwise 0 or the error from
  * hn_rndis_conf_rss().
  */
 static int
 hn_rss_reconfig(struct hn_softc *sc)
 {
 	struct ifnet *ifp;
 	int error;
 
 	HN_LOCK_ASSERT(sc);
 
 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
 		return (ENXIO);
 	ifp = sc->hn_ifp;
 
 	/*
 	 * Step 1: turn RSS off.
 	 *
 	 * NOTE:
 	 * Direct reconfiguration by setting the UNCHG flags does
 	 * _not_ work properly.
 	 */
 	if (bootverbose)
 		if_printf(ifp, "disable RSS\n");
 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
 	if (error != 0) {
 		if_printf(ifp, "RSS disable failed\n");
 		return (error);
 	}
 
 	/*
 	 * Step 2: turn RSS back on, picking up the updated RSS key or
 	 * indirect table.
 	 */
 	if (bootverbose)
 		if_printf(ifp, "reconfig RSS\n");
 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
 	if (error != 0)
 		if_printf(ifp, "RSS reconfig failed\n");
 	return (error);
 }
 
 /*
  * Clamp every entry of the RSS indirect table that refers to a
  * channel index >= 'nchan' down to the last usable channel
  * (nchan - 1), logging each adjustment.
  */
 static void
 hn_rss_ind_fixup(struct hn_softc *sc, int nchan)
 {
 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
 	int idx;
 
 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
 
 	for (idx = 0; idx < NDIS_HASH_INDCNT; ++idx) {
 		/* Entries that already name a usable channel are kept. */
 		if (rss->rss_ind[idx] < nchan)
 			continue;
 
 		if_printf(sc->hn_ifp,
 		    "RSS indirect table %d fixup: %u -> %d\n",
 		    idx, rss->rss_ind[idx], nchan - 1);
 		rss->rss_ind[idx] = nchan - 1;
 	}
 }
 
 /*
  * ifmedia change callback.  Changing the media is not supported, so
  * this always fails with EOPNOTSUPP.
  */
 static int
 hn_ifmedia_upd(struct ifnet *ifp __unused)
 {
 	const int error = EOPNOTSUPP;
 
 	return (error);
 }
 
 /*
  * ifmedia status callback.  Reports an active 10G-T full-duplex
  * Ethernet link when the link-up flag is set, and "no media"
  * otherwise.
  */
 static void
 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
 {
 	struct hn_softc *sc = ifp->if_softc;
 
 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) != 0) {
 		ifmr->ifm_status = IFM_AVALID | IFM_ACTIVE;
 		ifmr->ifm_active = IFM_ETHER | IFM_10G_T | IFM_FDX;
 	} else {
 		ifmr->ifm_status = IFM_AVALID;
 		ifmr->ifm_active = IFM_ETHER | IFM_NONE;
 	}
 }
 
 /*
  * GUID matched by netvsc_probe():
  * {F8615163-DF3E-46c5-913F-F2D2F965ED0E}
  *
  * Note that the first three fields of the textual form appear
  * byte-reversed in the array below.
  */
 static const struct hyperv_guid g_net_vsc_device_type = {
 	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
 		0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
 };
 
 /*
  * Standard probe entry point.
  *
  * Claims the device with BUS_PROBE_DEFAULT when its GUID matches
  * g_net_vsc_device_type; returns ENXIO otherwise.
  */
 static int
 netvsc_probe(device_t dev)
 {
 	int mismatch;
 
 	mismatch = VMBUS_PROBE_GUID(device_get_parent(dev), dev,
 	    &g_net_vsc_device_type);
 	if (mismatch != 0)
 		return ENXIO;
 
 	device_set_desc(dev, "Hyper-V Network Interface");
 	return BUS_PROBE_DEFAULT;
 }
 
 /*
  * Standard attach entry point.
  *
  * Called when the driver is loaded.  It allocates needed resources,
  * and initializes the "hardware" and software.
  *
  * Returns 0 on success.  On failure, the partially constructed state
  * is torn down through netvsc_detach() and a non-zero errno value is
  * returned.
  */
 static int
 netvsc_attach(device_t dev)
 {
 	struct hn_softc *sc = device_get_softc(dev);
 	struct sysctl_oid_list *child;
 	struct sysctl_ctx_list *ctx;
 	uint8_t eaddr[ETHER_ADDR_LEN];
 	struct ifnet *ifp = NULL;
 	int error, ring_cnt, tx_ring_cnt;
 
 	sc->hn_dev = dev;
 	sc->hn_prichan = vmbus_get_channel(dev);
 	HN_LOCK_INIT(sc);
 
 	/*
 	 * Setup taskqueue for transmission.
 	 */
 	if (hn_tx_taskq == NULL) {
 		sc->hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
 		    taskqueue_thread_enqueue, &sc->hn_tx_taskq);
 		if (hn_bind_tx_taskq >= 0) {
 			int cpu = hn_bind_tx_taskq;
 			cpuset_t cpu_set;
 
 			if (cpu > mp_ncpus - 1)
 				cpu = mp_ncpus - 1;
 			CPU_SETOF(cpu, &cpu_set);
 			taskqueue_start_threads_cpuset(&sc->hn_tx_taskq, 1,
 			    PI_NET, &cpu_set, "%s tx",
 			    device_get_nameunit(dev));
 		} else {
 			taskqueue_start_threads(&sc->hn_tx_taskq, 1, PI_NET,
 			    "%s tx", device_get_nameunit(dev));
 		}
 	} else {
 		sc->hn_tx_taskq = hn_tx_taskq;
 	}
 
 	/*
 	 * Setup taskqueue for management tasks, e.g. link status.
 	 */
 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
 	    device_get_nameunit(dev));
 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
 	    hn_netchg_status_taskfunc, sc);
 
 	/*
 	 * Allocate ifnet and setup its name earlier, so that if_printf
 	 * can be used by functions, which will be called after
 	 * ether_ifattach().
 	 */
 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
 	ifp->if_softc = sc;
 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
 
 	/*
 	 * Initialize ifmedia earlier so that it can be unconditionally
 	 * destroyed, if error happened later on.
 	 */
 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
 
 	/*
 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
 	 * to use (tx_ring_cnt).
 	 *
 	 * NOTE:
 	 * The # of RX rings to use is same as the # of channels to use.
 	 */
 	ring_cnt = hn_chan_cnt;
 	if (ring_cnt <= 0) {
 		/* Default */
 		ring_cnt = mp_ncpus;
 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
 			ring_cnt = HN_RING_CNT_DEF_MAX;
 	} else if (ring_cnt > mp_ncpus) {
 		ring_cnt = mp_ncpus;
 	}
 
 	tx_ring_cnt = hn_tx_ring_cnt;
 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
 		tx_ring_cnt = ring_cnt;
 	if (hn_use_if_start) {
 		/* ifnet.if_start only needs one TX ring. */
 		tx_ring_cnt = 1;
 	}
 
 	/*
 	 * Set the leader CPU for channels.
 	 */
 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
 
 	/*
 	 * Create enough TX/RX rings, even if only limited number of
 	 * channels can be allocated.
 	 */
 	error = hn_create_tx_data(sc, tx_ring_cnt);
 	if (error)
 		goto failed;
 	error = hn_create_rx_data(sc, ring_cnt);
 	if (error)
 		goto failed;
 
 	/*
 	 * Create transaction context for NVS and RNDIS transactions.
 	 */
 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
 	if (sc->hn_xact == NULL) {
 		/*
 		 * 'error' still holds 0 from the successful ring setup
 		 * above; set it explicitly so that a torn-down device
 		 * is not reported as successfully attached.
 		 */
 		error = ENXIO;
 		goto failed;
 	}
 
 	/*
 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
 	 */
 	error = hn_synth_attach(sc, ETHERMTU);
 	if (error)
 		goto failed;
 
 	error = hn_rndis_get_eaddr(sc, eaddr);
 	if (error)
 		goto failed;
 
 #if __FreeBSD_version >= 1100099
 	if (sc->hn_rx_ring_inuse > 1) {
 		/*
 		 * Reduce TCP segment aggregation limit for multiple
 		 * RX rings to increase ACK timeliness.
 		 */
 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
 	}
 #endif
 
 	/*
 	 * Fixup TX stuffs after synthetic parts are attached.
 	 */
 	hn_fixup_tx_data(sc);
 
 	ctx = device_get_sysctl_ctx(dev);
 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
 	    &sc->hn_nvs_ver, 0, "NVS version");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    hn_ndis_version_sysctl, "A", "NDIS version");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    hn_caps_sysctl, "A", "capabilities");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    hn_hwassist_sysctl, "A", "hwassist");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    hn_rxfilter_sysctl, "A", "rxfilter");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
 	    hn_rss_key_sysctl, "IU", "RSS key");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
 
 	/*
 	 * Setup the ifmedia, which has been initialized earlier.
 	 */
 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
 	/* XXX ifmedia_set really should do this for us */
 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
 
 	/*
 	 * Setup the ifnet for this interface.
 	 */
 
+	ifp->if_baudrate = IF_Gbps(10);
 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
 	ifp->if_ioctl = hn_ioctl;
 	ifp->if_init = hn_init;
 	if (hn_use_if_start) {
 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
 
 		ifp->if_start = hn_start;
 		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
 		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
 		IFQ_SET_READY(&ifp->if_snd);
 	} else {
 		ifp->if_transmit = hn_transmit;
 		ifp->if_qflush = hn_xmit_qflush;
 	}
 
 	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
 #ifdef foo
 	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
 	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
 #endif
 	if (sc->hn_caps & HN_CAP_VLAN) {
 		/* XXX not sure about VLAN_MTU. */
 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
 	}
 
 	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
 	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
 		ifp->if_capabilities |= IFCAP_TXCSUM;
 	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
 		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
 	if (sc->hn_caps & HN_CAP_TSO4) {
 		ifp->if_capabilities |= IFCAP_TSO4;
 		ifp->if_hwassist |= CSUM_IP_TSO;
 	}
 	if (sc->hn_caps & HN_CAP_TSO6) {
 		ifp->if_capabilities |= IFCAP_TSO6;
 		ifp->if_hwassist |= CSUM_IP6_TSO;
 	}
 
 	/* Enable all available capabilities by default. */
 	ifp->if_capenable = ifp->if_capabilities;
 
 	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
 		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
 		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
 	}
 
 	ether_ifattach(ifp, eaddr);
 
 	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
 		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
 	}
 
 	/* Inform the upper layer about the long frame support. */
 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
 
 	/*
 	 * Kick off link status check.
 	 */
 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
 	hn_link_status_update(sc);
 
 	return (0);
 failed:
 	/* Undo the synthetic attach first; netvsc_detach() frees the rest. */
 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
 		hn_synth_detach(sc);
 	netvsc_detach(dev);
 	return (error);
 }
 
 /*
  * Detach entry point; also used by netvsc_attach() to unwind a
  * partially completed attach.
  *
  * When the device is fully attached, the interface is stopped, the
  * management tasks are suspended, the synthetic parts are detached and
  * the ifnet is detached from the Ethernet layer.  After that all
  * software resources are released unconditionally.  Always returns 0.
  */
 static int
 netvsc_detach(device_t dev)
 {
 	struct hn_softc *sc = device_get_softc(dev);
 	struct ifnet *ifp = sc->hn_ifp;
 
 	if (device_is_attached(dev)) {
 		HN_LOCK(sc);
 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) != 0) {
 			if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0)
 				hn_stop(sc);
 			/*
 			 * NOTE:
 			 * hn_stop() only suspends data, so management
 			 * stuffs have to be suspended manually here.
 			 */
 			hn_suspend_mgmt(sc);
 			hn_synth_detach(sc);
 		}
 		HN_UNLOCK(sc);
 		ether_ifdetach(ifp);
 	}
 
 	/* Release media list and the TX/RX ring data. */
 	ifmedia_removeall(&sc->hn_media);
 	hn_destroy_rx_data(sc);
 	hn_destroy_tx_data(sc);
 
 	/* The global TX taskqueue, when in use, is not ours to free. */
 	if (sc->hn_tx_taskq != hn_tx_taskq)
 		taskqueue_free(sc->hn_tx_taskq);
 	taskqueue_free(sc->hn_mgmt_taskq0);
 
 	if (sc->hn_xact != NULL)
 		vmbus_xact_ctx_destroy(sc->hn_xact);
 
 	if_free(ifp);
 
 	HN_LOCK_DESTROY(sc);
 	return (0);
 }
 
 /*
  * Standard shutdown entry point.
  *
  * Nothing is done at shutdown time; success is always reported.
  */
 static int
 netvsc_shutdown(device_t dev)
 {
 
 	return (0);
 }
 
 /*
  * Query the link status through RNDIS, record it in hn_link_flags and
  * propagate the resulting state to the network stack.  If the query
  * fails, nothing is changed.
  */
 static void
 hn_link_status(struct hn_softc *sc)
 {
 	uint32_t media_state;
 	int link_state;
 
 	if (hn_rndis_get_linkstatus(sc, &media_state) != 0) {
 		/* XXX what to do? */
 		return;
 	}
 
 	if (media_state != NDIS_MEDIA_STATE_CONNECTED)
 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
 	else
 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
 
 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) != 0)
 		link_state = LINK_STATE_UP;
 	else
 		link_state = LINK_STATE_DOWN;
 	if_link_state_change(sc->hn_ifp, link_state);
 }
 
 /*
  * Task handler for link status checks.  The check is skipped while a
  * network change is being processed (HN_LINK_FLAG_NETCHG set).
  */
 static void
 hn_link_taskfunc(void *xsc, int pending __unused)
 {
 	struct hn_softc *sc = xsc;
 
 	if ((sc->hn_link_flags & HN_LINK_FLAG_NETCHG) == 0)
 		hn_link_status(sc);
 }
 
 /*
  * Task handler run at the start of a network change.  It blocks link
  * status checks, reports the link as down, and schedules
  * hn_netchg_status on the management taskqueue 5 seconds later.
  */
 static void
 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
 {
 	struct hn_softc *sc = xsc;
 	const int delay = 5 * hz;
 
 	/* Prevent any link status checks from running. */
 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
 
 	/*
 	 * Fake up a [link down --> link up] state change; 5 seconds
 	 * delay is used, which closely simulates miibus reaction
 	 * upon link down event.
 	 */
 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status,
 	    delay);
 }
 
 /*
  * Timeout task handler that ends a network change: link status
  * checks are allowed again and the current link status is fetched.
  */
 static void
 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
 {
 	struct hn_softc *softc = xsc;
 
 	/* Re-allow link status checks. */
 	softc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
 
 	hn_link_status(softc);
 }
 
 /*
  * Schedule a link status check on the management taskqueue.  Does
  * nothing when hn_mgmt_taskq is NULL.
  */
 void
 hn_link_status_update(struct hn_softc *sc)
 {
 
 	if (sc->hn_mgmt_taskq == NULL)
 		return;
 	taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
 }
 
 /*
  * Schedule network change processing on the management taskqueue.
  * Does nothing when hn_mgmt_taskq is NULL.
  */
 void
 hn_network_change(struct hn_softc *sc)
 {
 
 	if (sc->hn_mgmt_taskq == NULL)
 		return;
 	taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
 }
 
 /*
  * Load the mbuf chain *m_head into the data DMA map of 'txd'.
  *
  * If the chain has too many segments (EFBIG), it is collapsed with
  * m_collapse() and the load is retried once; in that case *m_head is
  * updated to the collapsed chain.  On success the map is synced for
  * device reads and the descriptor is flagged HN_TXD_FLAG_DMAMAP.
  *
  * Returns 0 on success, ENOBUFS if collapsing failed, or the error
  * from bus_dmamap_load_mbuf_sg().
  */
 static __inline int
 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
 {
 	struct mbuf *chain = *m_head;
 	int error;
 
 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
 
 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
 	    chain, segs, nsegs, BUS_DMA_NOWAIT);
 	if (error == EFBIG) {
 		struct mbuf *collapsed;
 
 		collapsed = m_collapse(chain, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
 		if (collapsed == NULL)
 			return ENOBUFS;
 		chain = collapsed;
 		*m_head = chain;
 		txr->hn_tx_collapsed++;
 
 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
 		    txd->data_dmap, chain, segs, nsegs, BUS_DMA_NOWAIT);
 	}
 	if (error != 0)
 		return error;
 
 	bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
 	    BUS_DMASYNC_PREWRITE);
 	txd->flags |= HN_TXD_FLAG_DMAMAP;
 	return 0;
 }
 
 /*
  * Drop one reference on 'txd'.
  *
  * When the last reference goes away, the resources attached to the
  * descriptor (chimney buffer or DMA mapping, and the mbuf chain) are
  * released and the descriptor is put back on the TX ring's free list
  * (or buf_ring, if HN_USE_TXDESC_BUFRING is defined).
  *
  * Returns 1 if the descriptor was returned to the ring, 0 if other
  * references remain.
  */
 static __inline int
 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
 {
 
 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
 	    ("put an onlist txd %#x", txd->flags));
 
 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
 	/* atomic_fetchadd_int() yields the old value; 1 means last ref. */
 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
 		return 0;
 
 	/* A descriptor uses either a chimney buffer or a DMA map. */
 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
 		    ("chim txd uses dmamap"));
 		hn_chim_free(txr->hn_sc, txd->chim_index);
 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
 		bus_dmamap_sync(txr->hn_tx_data_dtag,
 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
 		bus_dmamap_unload(txr->hn_tx_data_dtag,
 		    txd->data_dmap);
 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
 	}
 
 	if (txd->m != NULL) {
 		m_freem(txd->m);
 		txd->m = NULL;
 	}
 
 	/* Mark as free before making it visible to hn_txdesc_get(). */
 	txd->flags |= HN_TXD_FLAG_ONLIST;
 #ifndef HN_USE_TXDESC_BUFRING
 	mtx_lock_spin(&txr->hn_txlist_spin);
 	KASSERT(txr->hn_txdesc_avail >= 0 &&
 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
 	txr->hn_txdesc_avail++;
 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
 	mtx_unlock_spin(&txr->hn_txlist_spin);
 #else
 	atomic_add_int(&txr->hn_txdesc_avail, 1);
 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
 #endif
 
 	return 1;
 }
 
 /*
  * Take a free TX descriptor from the ring's free list (or buf_ring).
  *
  * Returns NULL if none is available.  A returned descriptor has its
  * ONLIST flag cleared and holds exactly one reference.
  */
 static __inline struct hn_txdesc *
 hn_txdesc_get(struct hn_tx_ring *txr)
 {
 	struct hn_txdesc *txd;
 
 #ifndef HN_USE_TXDESC_BUFRING
 	/* Free list and hn_txdesc_avail are protected by the spin mutex. */
 	mtx_lock_spin(&txr->hn_txlist_spin);
 	txd = SLIST_FIRST(&txr->hn_txlist);
 	if (txd != NULL) {
 		KASSERT(txr->hn_txdesc_avail > 0,
 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
 		txr->hn_txdesc_avail--;
 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
 	}
 	mtx_unlock_spin(&txr->hn_txlist_spin);
 #else
 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
 #endif
 
 	if (txd != NULL) {
 #ifdef HN_USE_TXDESC_BUFRING
 		/* No list lock in this configuration; adjust atomically. */
 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
 #endif
 		/* A free descriptor must be idle: no mbuf, refs or mappings. */
 		KASSERT(txd->m == NULL && txd->refs == 0 &&
 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
 		txd->refs = 1;
 	}
 	return txd;
 }
 
 /*
  * Take an additional reference on 'txd'.  The caller must already
  * hold a reference; each hold is paired with a hn_txdesc_put().
  */
 static __inline void
 hn_txdesc_hold(struct hn_txdesc *txd)
 {
 
 	/* 0->1 transition will never work */
 	KASSERT(txd->refs > 0, ("invalid refs %d", txd->refs));
 	atomic_add_int(&txd->refs, 1);
 }
 
 /*
  * Report whether any TX descriptor of this ring is currently not on
  * the free list (or free buf_ring), i.e. is still in use.
  */
 static bool
 hn_tx_ring_pending(struct hn_tx_ring *txr)
 {
 	bool busy;
 
 #ifndef HN_USE_TXDESC_BUFRING
 	mtx_lock_spin(&txr->hn_txlist_spin);
 	busy = (txr->hn_txdesc_avail != txr->hn_txdesc_cnt);
 	mtx_unlock_spin(&txr->hn_txlist_spin);
 #else
 	busy = !buf_ring_full(txr->hn_txdesc_br);
 #endif
 	return (busy);
 }
 
 /*
  * Clear the ring's "has txeof" mark, then invoke the ring's txeof
  * method.
  */
 static __inline void
 hn_txeof(struct hn_tx_ring *txr)
 {
 
 	txr->hn_has_txeof = 0;
 	(*txr->hn_txeof)(txr);
 }
 
 /*
  * TX completion callback, installed by hn_send_ctx_init() in hn_encap().
  *
  * Drops the reference on the completed txd and marks the ring as
  * having txeof work.  Once HN_EARLY_TXEOF_THRESH completions have
  * accumulated, txeof is run early if the ring is marked oactive.
  */
 static void
 hn_tx_done(struct hn_send_ctx *sndc, struct hn_softc *sc,
     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
 {
 	struct hn_txdesc *txd = sndc->hn_cbarg;
 	struct hn_tx_ring *txr = txd->txr;
 
 	KASSERT(txr->hn_chan == chan,
 	    ("channel mismatch, on chan%u, should be chan%u",
 	     vmbus_chan_subidx(chan), vmbus_chan_subidx(txr->hn_chan)));
 
 	txr->hn_has_txeof = 1;
 	hn_txdesc_put(txr, txd);
 
 	/* Below the threshold: leave txeof to hn_chan_rollup(). */
 	if (++txr->hn_txdone_cnt < HN_EARLY_TXEOF_THRESH)
 		return;
 
 	txr->hn_txdone_cnt = 0;
 	if (txr->hn_oactive)
 		hn_txeof(txr);
 }
 
 /*
  * Per-channel rollup: flush all pending LRO work of 'rxr' and, if
  * 'txr' has txeof work recorded, reset its completion counter and run
  * txeof.
  */
 void
 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
 {
 #if defined(INET) || defined(INET6)
 	tcp_lro_flush_all(&rxr->hn_lro);
 #endif
 
 	/*
 	 * NOTE:
 	 * 'txr' could be NULL, if multiple channels and
 	 * ifnet.if_start method are enabled.
 	 */
 	if (txr != NULL && txr->hn_has_txeof) {
 		txr->hn_txdone_cnt = 0;
 		hn_txeof(txr);
 	}
 }
 
 /*
  * Convert an offset counted from the start of struct rndis_packet_msg
  * into one counted from the rm_dataoffset field.
  */
 static __inline uint32_t
 hn_rndis_pktmsg_offset(uint32_t ofs)
 {
 	const uint32_t base =
 	    __offsetof(struct rndis_packet_msg, rm_dataoffset);
 
 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
 	    ("invalid RNDIS packet msg offset %u", ofs));
 	return (ofs - base);
 }
 
 /*
  * Prepare 'txd' for sending the mbuf chain *m_head0.
  *
  * Builds the RNDIS packet message in txd->rndis_pkt, appends the
  * per-packet info needed (hash value, VLAN, LSO or TX checksum), and
  * then either copies message and data into a chimney buffer or loads
  * the chain into the DMA map and fills txr->hn_gpa[].  txr->hn_sendpkt
  * is set to the matching send routine.
  *
  * Returns 0 on success; *m_head0 may have been replaced by a collapsed
  * chain.  On failure returns the DMA load error and sets *m_head0 to
  * NULL.
  *
  * NOTE:
  * If this function fails, then both txd and m_head0 will be freed.
  */
 static int
 hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0)
 {
 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
 	int error, nsegs, i;
 	struct mbuf *m_head = *m_head0;
 	struct rndis_packet_msg *pkt;
 	uint32_t *pi_data;
 	int pktlen;
 
 	/*
 	 * Initialize the RNDIS packet message header.  Data and
 	 * per-packet info both start right after the header and the
 	 * per-packet info is empty for now.
 	 * NOTE(review): hn_rndis_pktinfo_append() below presumably
 	 * adjusts rm_len/rm_dataoffset/rm_pktinfolen as info is
 	 * appended -- confirm against that helper.
 	 */
 	pkt = txd->rndis_pkt;
 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
 	pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
 	pkt->rm_dataoffset = sizeof(*pkt);
 	pkt->rm_datalen = m_head->m_pkthdr.len;
 	pkt->rm_pktinfooffset = sizeof(*pkt);
 	pkt->rm_pktinfolen = 0;
 
 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
 		/*
 		 * Set the hash value for this packet, so that the host could
 		 * dispatch the TX done event for this packet back to this TX
 		 * ring's channel.
 		 */
 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
 		*pi_data = txr->hn_tx_idx;
 	}
 
 	/* Pass the out-of-band VLAN tag as VLAN per-packet info. */
 	if (m_head->m_flags & M_VLANTAG) {
 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
 		*pi_data = NDIS_VLAN_INFO_MAKE(
 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
 	}
 
 	/* TSO and plain checksum offload are mutually exclusive here. */
 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
 #if defined(INET6) || defined(INET)
 		struct ether_vlan_header *eh;
 		int ether_len;
 
 		/*
 		 * XXX need m_pullup and use mtodo
 		 */
 		eh = mtod(m_head, struct ether_vlan_header*);
 		if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN))
 			ether_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
 		else
 			ether_len = ETHER_HDR_LEN;
 
 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
 #ifdef INET
 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
 			struct ip *ip =
 			    (struct ip *)(m_head->m_data + ether_len);
 			unsigned long iph_len = ip->ip_hl << 2;
 			struct tcphdr *th =
 			    (struct tcphdr *)((caddr_t)ip + iph_len);
 
 			/*
 			 * Zero the IP length/checksum and seed the TCP
 			 * checksum with the pseudo header sum (computed
 			 * without the length).
 			 */
 			ip->ip_len = 0;
 			ip->ip_sum = 0;
 			th->th_sum = in_pseudo(ip->ip_src.s_addr,
 			    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
 			    m_head->m_pkthdr.tso_segsz);
 		}
 #endif
 #if defined(INET6) && defined(INET)
 		else
 #endif
 #ifdef INET6
 		{
 			struct ip6_hdr *ip6 = (struct ip6_hdr *)
 			    (m_head->m_data + ether_len);
 			struct tcphdr *th = (struct tcphdr *)(ip6 + 1);
 
 			/* Same preparation as above, for IPv6. */
 			ip6->ip6_plen = 0;
 			th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
 			    m_head->m_pkthdr.tso_segsz);
 		}
 #endif
 #endif	/* INET6 || INET */
 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
 		if (m_head->m_pkthdr.csum_flags &
 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
 		} else {
 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
 		}
 
 		if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
 			*pi_data |= NDIS_TXCSUM_INFO_TCPCS;
 		else if (m_head->m_pkthdr.csum_flags &
 		    (CSUM_IP_UDP | CSUM_IP6_UDP))
 			*pi_data |= NDIS_TXCSUM_INFO_UDPCS;
 	}
 
 	/*
 	 * Length of the RNDIS packet message plus per-packet info; must
 	 * be computed before the offsets are converted below.
 	 */
 	pktlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
 	/* Convert RNDIS packet message offsets */
 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
 
 	/*
 	 * Chimney send, if the packet could fit into one chimney buffer.
 	 */
 	if (pkt->rm_len < txr->hn_chim_size) {
 		txr->hn_tx_chimney_tried++;
 		txd->chim_index = hn_chim_alloc(txr->hn_sc);
 		if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
 			uint8_t *dest = txr->hn_sc->hn_chim +
 			    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
 
 			/* RNDIS message first, packet data right after it. */
 			memcpy(dest, pkt, pktlen);
 			dest += pktlen;
 			m_copydata(m_head, 0, m_head->m_pkthdr.len, dest);
 
 			txd->chim_size = pkt->rm_len;
 			txr->hn_gpa_cnt = 0;
 			txr->hn_tx_chimney++;
 			txr->hn_sendpkt = hn_sendpkt_rndis_chim;
 			goto done;
 		}
 		/* No chimney buffer available; fall back to the sglist. */
 	}
 
 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
 	if (error) {
 		int freed;
 
 		/*
 		 * This mbuf is not linked w/ the txd yet, so free it now.
 		 */
 		m_freem(m_head);
 		*m_head0 = NULL;
 
 		freed = hn_txdesc_put(txr, txd);
 		KASSERT(freed != 0,
 		    ("fail to free txd upon txdma error"));
 
 		txr->hn_txdma_failed++;
 		if_inc_counter(txr->hn_sc->hn_ifp, IFCOUNTER_OERRORS, 1);
 		return error;
 	}
 	/* The chain may have been collapsed; report the current head. */
 	*m_head0 = m_head;
 
 	/* +1 RNDIS packet message */
 	txr->hn_gpa_cnt = nsegs + 1;
 
 	/* send packet with page buffer */
 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
 	txr->hn_gpa[0].gpa_len = pktlen;
 
 	/*
 	 * Fill the page buffers with mbuf info after the page
 	 * buffer for RNDIS packet message.
 	 */
 	for (i = 0; i < nsegs; ++i) {
 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
 
 		gpa->gpa_page = atop(segs[i].ds_addr);
 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
 		gpa->gpa_len = segs[i].ds_len;
 	}
 
 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
 	txd->chim_size = 0;
 	txr->hn_sendpkt = hn_sendpkt_rndis_sglist;
 done:
 	/* From here on hn_txdesc_put() frees the mbuf with the txd. */
 	txd->m = m_head;
 
 	/* Set the completion routine */
 	hn_send_ctx_init(&txd->send_ctx, hn_tx_done, txd);
 
 	return 0;
 }
 
 /*
  * Send the packet prepared in 'txd' through txr->hn_sendpkt and, on
  * success, tap BPF and update the interface/ring counters.  A failed
  * send is retried once.
  *
  * Returns 0 on success, or the error of the last send attempt.
  *
  * NOTE:
  * If this function fails, then txd will be freed, but the mbuf
  * associated w/ the txd will _not_ be freed.
  */
 static int
 hn_send_pkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
 {
 	int error, send_failed = 0;
 
 again:
 	/*
 	 * Make sure that txd is not freed before ETHER_BPF_MTAP.
 	 */
 	hn_txdesc_hold(txd);
 	error = txr->hn_sendpkt(txr, txd);
 	if (!error) {
 		ETHER_BPF_MTAP(ifp, txd->m);
 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 		/* OBYTES/OMCASTS are only updated here w/o if_start. */
 		if (!hn_use_if_start) {
 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
 			    txd->m->m_pkthdr.len);
 			if (txd->m->m_flags & M_MCAST)
 				if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
 		}
 		txr->hn_pkts++;
 	}
 	/* Drop the extra reference taken above. */
 	hn_txdesc_put(txr, txd);
 
 	if (__predict_false(error)) {
 		int freed;
 
 		/*
 		 * This should "really rarely" happen.
 		 *
 		 * XXX Too many RX to be acked or too many sideband
 		 * commands to run?  Ask netvsc_channel_rollup()
 		 * to kick start later.
 		 */
 		txr->hn_has_txeof = 1;
 		if (!send_failed) {
 			txr->hn_send_failed++;
 			send_failed = 1;
 			/*
 			 * Try sending again after set hn_has_txeof;
 			 * in case that we missed the last
 			 * netvsc_channel_rollup().
 			 */
 			goto again;
 		}
 		if_printf(ifp, "send failed\n");
 
 		/*
 		 * Caller will perform further processing on the
 		 * associated mbuf, so don't free it in hn_txdesc_put();
 		 * only unload it from the DMA map in hn_txdesc_put(),
 		 * if it was loaded.
 		 */
 		txd->m = NULL;
 		freed = hn_txdesc_put(txr, txd);
 		KASSERT(freed != 0,
 		    ("fail to free txd upon send error"));
 
 		/* Counted once per failed attempt, i.e. twice in total. */
 		txr->hn_send_failed++;
 	}
 	return error;
 }
 
 /*
  * Drain the interface send queue through the first TX ring; used only
  * when the if_start method is enabled.  Must be called with
  * txr->hn_tx_lock held.
  *
  * If 'len' is positive, a packet longer than 'len' is put back on the
  * queue and 1 is returned, so that the caller can hand the remaining
  * work to the TX taskqueue.  In all other cases 0 is returned.
  */
 static int
 hn_start_locked(struct hn_tx_ring *txr, int len)
 {
 	struct hn_softc *sc = txr->hn_sc;
 	struct ifnet *ifp = sc->hn_ifp;
 
 	KASSERT(hn_use_if_start,
 	    ("hn_start_locked is called, when if_start is disabled"));
 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
 
 	if (__predict_false(txr->hn_suspended))
 		return 0;
 
 	/* Only transmit while RUNNING is set and OACTIVE is clear. */
 	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
 	    IFF_DRV_RUNNING)
 		return 0;
 
 	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
 		struct mbuf *m_head;
 		struct hn_txdesc *desc;
 
 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
 		if (m_head == NULL)
 			break;
 
 		if (len > 0 && m_head->m_pkthdr.len > len) {
 			/*
 			 * Too large for direct transmission; requeue it and
 			 * let the caller defer this packet (and the ones
 			 * behind it) to the tx taskqueue.
 			 */
 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
 			return 1;
 		}
 
 		desc = hn_txdesc_get(txr);
 		if (desc == NULL) {
 			/* Out of descriptors: requeue and mark OACTIVE. */
 			txr->hn_no_txdescs++;
 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
 			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
 			break;
 		}
 
 		/* On failure hn_encap() has freed both desc and m_head. */
 		if (hn_encap(txr, desc, &m_head) != 0)
 			continue;
 
 		if (__predict_false(hn_send_pkt(ifp, txr, desc) != 0)) {
 			/* desc is freed, m_head is not; requeue the packet. */
 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
 			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
 			break;
 		}
 	}
 	return 0;
 }
 
 /*
  * Append 'len' bytes starting at 'cp' to the mbuf chain 'm0',
  * growing the chain when the data does not fit into the trailing
  * space of its last mbuf.
  *
  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c;
  * the difference is that the additional mbufs are allocated with
  * MJUMPAGESIZE clusters and filled accordingly.
  *
  * If 'm0' has a packet header, its length is increased by the number
  * of bytes actually appended, even when the job is cut short by an
  * allocation failure.
  *
  * Return 1 if able to complete the job; otherwise 0.
  */
 static int
 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
 {
 	struct mbuf *tail, *mnew;
 	int left = len, room;
 
 	/* Walk to the last mbuf of the chain. */
 	tail = m0;
 	while (tail->m_next != NULL)
 		tail = tail->m_next;
 
 	/* Fill whatever trailing space the last mbuf still has. */
 	room = M_TRAILINGSPACE(tail);
 	if (room > 0) {
 		if (room > left)
 			room = left;
 		bcopy(cp, mtod(tail, caddr_t) + tail->m_len, room);
 		tail->m_len += room;
 		cp += room;
 		left -= room;
 	}
 
 	/* Put the rest into freshly allocated page-sized clusters. */
 	for (; left > 0; tail = mnew) {
 		mnew = m_getjcl(M_NOWAIT, tail->m_type, 0, MJUMPAGESIZE);
 		if (mnew == NULL)
 			break;
 		mnew->m_len = min(MJUMPAGESIZE, left);
 		bcopy(cp, mtod(mnew, caddr_t), mnew->m_len);
 		cp += mnew->m_len;
 		left -= mnew->m_len;
 		tail->m_next = mnew;
 	}
 
 	if (m0->m_flags & M_PKTHDR)
 		m0->m_pkthdr.len += len - left;
 
 	return (left == 0);
 }
 
 #if defined(INET) || defined(INET6)
 /*
  * Hand 'm' to LRO.  When hn_lro_mbufq_depth is non-zero (and the
  * kernel is new enough) the mbuf is queued with tcp_lro_queue_mbuf()
  * and 0 is returned; otherwise the result of tcp_lro_rx() is returned.
  */
 static __inline int
 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
 {
 #if __FreeBSD_version >= 1100095
 	if (hn_lro_mbufq_depth == 0)
 		return tcp_lro_rx(lc, m, 0);
 
 	tcp_lro_queue_mbuf(lc, m);
 	return 0;
 #else
 	return tcp_lro_rx(lc, m, 0);
 #endif
 }
 #endif
 
 /*
  * Called when we receive a data packet from the "wire" on the
  * specified device
  *
  * Copies 'dlen' bytes at 'data' into a newly allocated mbuf chain,
  * fills in checksum, VLAN and hash metadata from 'info', and passes
  * the chain to LRO or to the interface's if_input method.
  *
  * The packet is silently ignored if the interface is not running or
  * if it is larger than the configured MTU plus the Ethernet header.
  * If mbufs/clusters cannot be allocated, the packet is dropped and
  * IFCOUNTER_IQDROPS is incremented.  Always returns 0.
  *
  * Note:  This is no longer used as a callback
  */
 int
 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
     const struct hn_recvinfo *info)
 {
 	struct ifnet *ifp = rxr->hn_ifp;
 	struct mbuf *m_new;
 	int size, do_lro = 0, do_csum = 1;
 	int hash_type;
 
 	if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
 		return (0);
 
 	/*
 	 * Bail out if packet contains more data than configured MTU.
 	 */
 	if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) {
 		return (0);
 	} else if (dlen <= MHLEN) {
 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m_new == NULL) {
 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
 			return (0);
 		}
 		memcpy(mtod(m_new, void *), data, dlen);
 		m_new->m_pkthdr.len = m_new->m_len = dlen;
 		rxr->hn_small_pkts++;
 	} else {
 		/*
 		 * Get an mbuf with a cluster.  For packets 2K or less,
 		 * get a standard 2K cluster.  For anything larger, get a
 		 * 4K cluster.  Any buffers larger than 4K can cause problems
 		 * if looped around to the Hyper-V TX channel, so avoid them.
 		 */
 		size = MCLBYTES;
 		if (dlen > MCLBYTES) {
 			/* 4096 */
 			size = MJUMPAGESIZE;
 		}
 
 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
 		if (m_new == NULL) {
 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
 			return (0);
 		}
 
 		/*
 		 * hv_m_append() may need further clusters when the packet
 		 * does not fit into the first one and can run out of them
 		 * half way.  Never pass such a truncated packet up; drop
 		 * it like any other allocation failure.
 		 */
 		if (!hv_m_append(m_new, dlen, data)) {
 			m_freem(m_new);
 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
 			return (0);
 		}
 	}
 	m_new->m_pkthdr.rcvif = ifp;
 
 	if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
 		do_csum = 0;
 
 	/* receive side checksum offload */
 	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
 		/* IP csum offload */
 		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
 			m_new->m_pkthdr.csum_flags |=
 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
 			rxr->hn_csum_ip++;
 		}
 
 		/* TCP/UDP csum offload */
 		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
 			m_new->m_pkthdr.csum_flags |=
 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
 			m_new->m_pkthdr.csum_data = 0xffff;
 			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
 				rxr->hn_csum_tcp++;
 			else
 				rxr->hn_csum_udp++;
 		}
 
 		/* LRO only for packets whose IP and TCP checksums are OK. */
 		if ((info->csum_info &
 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
 			do_lro = 1;
 	} else {
 		/*
 		 * No checksum info from the host; look at the headers and
 		 * apply the per-ring hn_trust_hcsum settings.
 		 */
 		const struct ether_header *eh;
 		uint16_t etype;
 		int hoff;
 
 		hoff = sizeof(*eh);
 		if (m_new->m_len < hoff)
 			goto skip;
 		eh = mtod(m_new, struct ether_header *);
 		etype = ntohs(eh->ether_type);
 		if (etype == ETHERTYPE_VLAN) {
 			const struct ether_vlan_header *evl;
 
 			hoff = sizeof(*evl);
 			if (m_new->m_len < hoff)
 				goto skip;
 			evl = mtod(m_new, struct ether_vlan_header *);
 			etype = ntohs(evl->evl_proto);
 		}
 
 		if (etype == ETHERTYPE_IP) {
 			int pr;
 
 			pr = hn_check_iplen(m_new, hoff);
 			if (pr == IPPROTO_TCP) {
 				if (do_csum &&
 				    (rxr->hn_trust_hcsum &
 				     HN_TRUST_HCSUM_TCP)) {
 					rxr->hn_csum_trusted++;
 					m_new->m_pkthdr.csum_flags |=
 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
 					m_new->m_pkthdr.csum_data = 0xffff;
 				}
 				do_lro = 1;
 			} else if (pr == IPPROTO_UDP) {
 				if (do_csum &&
 				    (rxr->hn_trust_hcsum &
 				     HN_TRUST_HCSUM_UDP)) {
 					rxr->hn_csum_trusted++;
 					m_new->m_pkthdr.csum_flags |=
 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
 					m_new->m_pkthdr.csum_data = 0xffff;
 				}
 			} else if (pr != IPPROTO_DONE && do_csum &&
 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
 				rxr->hn_csum_trusted++;
 				m_new->m_pkthdr.csum_flags |=
 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
 			}
 		}
 	}
 skip:
 	/* Attach the VLAN tag supplied out of band, if any. */
 	if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
 		    NDIS_VLAN_INFO_ID(info->vlan_info),
 		    NDIS_VLAN_INFO_PRI(info->vlan_info),
 		    NDIS_VLAN_INFO_CFI(info->vlan_info));
 		m_new->m_flags |= M_VLANTAG;
 	}
 
 	/*
 	 * Use the supplied hash value as flowid if there is one;
 	 * otherwise fall back to the RX ring index.
 	 */
 	if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
 		rxr->hn_rss_pkts++;
 		m_new->m_pkthdr.flowid = info->hash_value;
 		hash_type = M_HASHTYPE_OPAQUE_HASH;
 		if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
 			uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
 
 			switch (type) {
 			case NDIS_HASH_IPV4:
 				hash_type = M_HASHTYPE_RSS_IPV4;
 				break;
 
 			case NDIS_HASH_TCP_IPV4:
 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
 				break;
 
 			case NDIS_HASH_IPV6:
 				hash_type = M_HASHTYPE_RSS_IPV6;
 				break;
 
 			case NDIS_HASH_IPV6_EX:
 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
 				break;
 
 			case NDIS_HASH_TCP_IPV6:
 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
 				break;
 
 			case NDIS_HASH_TCP_IPV6_EX:
 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
 				break;
 			}
 		}
 	} else {
 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
 		hash_type = M_HASHTYPE_OPAQUE;
 	}
 	M_HASHTYPE_SET(m_new, hash_type);
 
 	/*
 	 * Note:  Moved RX completion back to hv_nv_on_receive() so all
 	 * messages (not just data messages) will trigger a response.
 	 */
 
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 	rxr->hn_pkts++;
 
 	if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
 #if defined(INET) || defined(INET6)
 		struct lro_ctrl *lro = &rxr->hn_lro;
 
 		if (lro->lro_cnt) {
 			rxr->hn_lro_tried++;
 			if (hn_lro_rx(lro, m_new) == 0) {
 				/* DONE! */
 				return 0;
 			}
 		}
 #endif
 	}
 
 	/* We're not holding the lock here, so don't release it */
 	(*ifp->if_input)(ifp, m_new);
 
 	return (0);
 }
 
 /*
  * ifnet ioctl handler.
  *
  * Handles SIOCSIFMTU, SIOCSIFFLAGS, SIOCSIFCAP, SIOC{ADD,DEL}MULTI and
  * SIOC{S,G}IFMEDIA; every other command is passed to ether_ioctl().
  * Returns 0 or an errno value.
  */
 static int
 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct hn_softc *sc = ifp->if_softc;
 	struct ifreq *ifr = (struct ifreq *)data;
 	int mask, error = 0;
 
 	switch (cmd) {
 	case SIOCSIFMTU:
 		if (ifr->ifr_mtu > NETVSC_MAX_CONFIGURABLE_MTU) {
 			error = EINVAL;
 			break;
 		}
 
 		HN_LOCK(sc);
 
 		/* Nothing to do (and no error) w/o the synthetic parts. */
 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
 			HN_UNLOCK(sc);
 			break;
 		}
 
 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
 			/* Can't change MTU */
 			HN_UNLOCK(sc);
 			error = EOPNOTSUPP;
 			break;
 		}
 
 		/* MTU is unchanged; done. */
 		if (ifp->if_mtu == ifr->ifr_mtu) {
 			HN_UNLOCK(sc);
 			break;
 		}
 
 		/*
 		 * Suspend this interface before the synthetic parts
 		 * are ripped.
 		 */
 		hn_suspend(sc);
 
 		/*
 		 * Detach the synthetics parts, i.e. NVS and RNDIS.
 		 */
 		hn_synth_detach(sc);
 
 		/*
 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
 		 * with the new MTU setting.
 		 */
 		error = hn_synth_attach(sc, ifr->ifr_mtu);
 		if (error) {
 			/*
 			 * NOTE(review): this path returns without
 			 * calling hn_resume().
 			 */
 			HN_UNLOCK(sc);
 			break;
 		}
 
 		/*
 		 * Commit the requested MTU, after the synthetic parts
 		 * have been successfully attached.
 		 */
 		ifp->if_mtu = ifr->ifr_mtu;
 
 		/*
 		 * Make sure that various parameters based on MTU are
 		 * still valid, after the MTU change.
 		 */
 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
 			hn_set_chim_size(sc, sc->hn_chim_szmax);
 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
 #if __FreeBSD_version >= 1100099
 		if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
 		    HN_LRO_LENLIM_MIN(ifp))
 			hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
 #endif
 
 		/*
 		 * All done!  Resume the interface now.
 		 */
 		hn_resume(sc);
 
 		HN_UNLOCK(sc);
 		break;
 
 	case SIOCSIFFLAGS:
 		HN_LOCK(sc);
 
 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
 			HN_UNLOCK(sc);
 			break;
 		}
 
 		/*
 		 * UP: refresh the RX filter if already running, else
 		 * initialize.  !UP: stop the interface if it is running.
 		 */
 		if (ifp->if_flags & IFF_UP) {
 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
 				hn_set_rxfilter(sc);
 			else
 				hn_init_locked(sc);
 		} else {
 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
 				hn_stop(sc);
 		}
 		/* Save the flags that were just applied. */
 		sc->hn_if_flags = ifp->if_flags;
 
 		HN_UNLOCK(sc);
 		break;
 
 	case SIOCSIFCAP:
 		HN_LOCK(sc);
 		/* Bits set in 'mask' are the capabilities to be toggled. */
 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
 
 		if (mask & IFCAP_TXCSUM) {
 			ifp->if_capenable ^= IFCAP_TXCSUM;
 			if (ifp->if_capenable & IFCAP_TXCSUM)
 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
 			else
 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
 		}
 		if (mask & IFCAP_TXCSUM_IPV6) {
 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
 			else
 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
 		}
 
 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
 		if (mask & IFCAP_RXCSUM)
 			ifp->if_capenable ^= IFCAP_RXCSUM;
 #ifdef foo
 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
 		if (mask & IFCAP_RXCSUM_IPV6)
 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
 #endif
 
 		if (mask & IFCAP_LRO)
 			ifp->if_capenable ^= IFCAP_LRO;
 
 		if (mask & IFCAP_TSO4) {
 			ifp->if_capenable ^= IFCAP_TSO4;
 			if (ifp->if_capenable & IFCAP_TSO4)
 				ifp->if_hwassist |= CSUM_IP_TSO;
 			else
 				ifp->if_hwassist &= ~CSUM_IP_TSO;
 		}
 		if (mask & IFCAP_TSO6) {
 			ifp->if_capenable ^= IFCAP_TSO6;
 			if (ifp->if_capenable & IFCAP_TSO6)
 				ifp->if_hwassist |= CSUM_IP6_TSO;
 			else
 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
 		}
 
 		HN_UNLOCK(sc);
 		break;
 
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		/* Accepted, but a no-op unless 'notyet' is defined. */
 #ifdef notyet
 		/*
 		 * XXX
 		 * Multicast uses mutex, while RNDIS RX filter setting
 		 * sleeps.  We workaround this by always enabling
 		 * ALLMULTI.  ALLMULTI would actually always be on, even
 		 * if we supported the SIOCADDMULTI/SIOCDELMULTI, since
 		 * we don't support multicast address list configuration
 		 * for this driver.
 		 */
 		HN_LOCK(sc);
 
 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
 			HN_UNLOCK(sc);
 			break;
 		}
 		if (ifp->if_drv_flags & IFF_DRV_RUNNING)
 			hn_set_rxfilter(sc);
 
 		HN_UNLOCK(sc);
 #endif
 		break;
 
 	case SIOCSIFMEDIA:
 	case SIOCGIFMEDIA:
 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
 		break;
 
 	default:
 		error = ether_ioctl(ifp, cmd, data);
 		break;
 	}
 	return (error);
 }
 
 static void
 hn_stop(struct hn_softc *sc)
 {
 	struct ifnet *ifp = sc->hn_ifp;
 	int i;
 
 	HN_LOCK_ASSERT(sc);
 
 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
 	    ("synthetic parts were not attached"));
 
 	/* Clear RUNNING bit _before_ hn_suspend_data() */
 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
 	hn_suspend_data(sc);
 
 	/* Clear OACTIVE bit. */
 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
 		sc->hn_tx_ring[i].hn_oactive = 0;
 }
 
 /*
  * FreeBSD transmit entry point
  *
  * Transmits directly from the calling context if hn_sched_tx is not
  * set and the TX lock can be taken without blocking; otherwise, or
  * when hn_start_locked() asks for it, the work is handed to the TX
  * taskqueue.
  */
 static void
 hn_start(struct ifnet *ifp)
 {
 	struct hn_softc *sc = ifp->if_softc;
 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
 
 	if (!txr->hn_sched_tx && mtx_trylock(&txr->hn_tx_lock)) {
 		int resched;
 
 		resched = hn_start_locked(txr, txr->hn_direct_tx_size);
 		mtx_unlock(&txr->hn_tx_lock);
 		if (!resched)
 			return;
 	}
 
 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
 }
 
 /*
  * txeof handling for the first TX ring: clear IFF_DRV_OACTIVE and
  * restart transmission, either directly (if hn_sched_tx is not set
  * and the TX lock can be taken without blocking) or through the
  * txeof task.
  */
 static void
 hn_start_txeof(struct hn_tx_ring *txr)
 {
 	struct hn_softc *sc = txr->hn_sc;
 	struct ifnet *ifp = sc->hn_ifp;
 
 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
 
 	if (!txr->hn_sched_tx && mtx_trylock(&txr->hn_tx_lock)) {
 		int resched;
 
 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
 		resched = hn_start_locked(txr, txr->hn_direct_tx_size);
 		mtx_unlock(&txr->hn_tx_lock);
 		if (resched) {
 			taskqueue_enqueue(txr->hn_tx_taskq,
 			    &txr->hn_tx_task);
 		}
 		return;
 	}
 
 	/*
 	 * Release the OACTIVE earlier, with the hope, that
 	 * others could catch up.  The task will clear the
 	 * flag again with the hn_tx_lock to avoid possible
 	 * races.
 	 */
 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
 }
 
 static void
 hn_init_locked(struct hn_softc *sc)
 {
 	struct ifnet *ifp = sc->hn_ifp;
 	int i;
 
 	HN_LOCK_ASSERT(sc);
 
 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
 		return;
 
 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
 		return;
 
 	/* Configure RX filter */
 	hn_set_rxfilter(sc);
 
 	/* Clear OACTIVE bit. */
 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
 		sc->hn_tx_ring[i].hn_oactive = 0;
 
 	/* Clear TX 'suspended' bit. */
 	hn_tx_resume(sc, sc->hn_tx_ring_inuse);
 
 	/* Everything is ready; unleash! */
 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
 }
 
 /*
  * Locking wrapper around hn_init_locked(); 'xsc' is the softc.
  */
 static void
 hn_init(void *xsc)
 {
 	struct hn_softc *softc = xsc;
 
 	HN_LOCK(softc);
 	hn_init_locked(softc);
 	HN_UNLOCK(softc);
 }
 
 #ifdef LATER
 /*
  * Watchdog timeout handler: log the timeout, re-run hn_init() and
  * count an output error.  Only compiled when LATER is defined.
  */
 static void
 hn_watchdog(struct ifnet *ifp)
 {
 	void *softc = ifp->if_softc;
 
 	if_printf(ifp, "watchdog timeout -- resetting\n");
 	hn_init(softc);    /* XXX */
 	if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 }
 #endif
 
 #if __FreeBSD_version >= 1100099
 
 static int
 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct hn_softc *sc = arg1;
 	unsigned int lenlim;
 	int error;
 
 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
 	if (error || req->newptr == NULL)
 		return error;
 
 	HN_LOCK(sc);
 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
 	    lenlim > TCP_LRO_LENGTH_MAX) {
 		HN_UNLOCK(sc);
 		return EINVAL;
 	}
 	hn_set_lro_lenlim(sc, lenlim);
 	HN_UNLOCK(sc);
 
 	return 0;
 }
 
 static int
 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct hn_softc *sc = arg1;
 	int ackcnt, error, i;
 
 	/*
 	 * lro_ackcnt_lim is append count limit,
 	 * +1 to turn it into aggregation limit.
 	 */
 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
 	if (error || req->newptr == NULL)
 		return error;
 
 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
 		return EINVAL;
 
 	/*
 	 * Convert aggregation limit back to append
 	 * count limit.
 	 */
 	--ackcnt;
 	HN_LOCK(sc);
 	for (i = 0; i < sc->hn_rx_ring_inuse; ++i)
 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
 	HN_UNLOCK(sc);
 	return 0;
 }
 
 #endif
 
 static int
 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct hn_softc *sc = arg1;
 	int hcsum = arg2;
 	int on, error, i;
 
 	on = 0;
 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
 		on = 1;
 
 	error = sysctl_handle_int(oidp, &on, 0, req);
 	if (error || req->newptr == NULL)
 		return error;
 
 	HN_LOCK(sc);
 	for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
 
 		if (on)
 			rxr->hn_trust_hcsum |= hcsum;
 		else
 			rxr->hn_trust_hcsum &= ~hcsum;
 	}
 	HN_UNLOCK(sc);
 	return 0;
 }
 
 /*
  * Sysctl handler for the chimney sending size.
  *
  * Reports the value of the first TX ring.  A new value must be
  * positive and not larger than sc->hn_chim_szmax, otherwise EINVAL is
  * returned; a valid value is applied through hn_set_chim_size().
  */
 static int
 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct hn_softc *sc = arg1;
 	int chim_size, error;
 
 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
 	if (error || req->newptr == NULL)
 		return error;
 
 	/*
 	 * Validate under the softc lock.  The SIOCSIFMTU path
 	 * re-attaches the synthetic parts and re-checks the chimney
 	 * size against hn_chim_szmax while holding this lock, so the
 	 * limit has to be read and the size set within one locked
 	 * section.
 	 */
 	HN_LOCK(sc);
 	if (chim_size <= 0 || chim_size > sc->hn_chim_szmax) {
 		HN_UNLOCK(sc);
 		return EINVAL;
 	}
 	hn_set_chim_size(sc, chim_size);
 	HN_UNLOCK(sc);
 	return 0;
 }
 
 #if __FreeBSD_version < 1100095
 static int
 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct hn_softc *sc = arg1;
 	int ofs = arg2, i, error;
 	struct hn_rx_ring *rxr;
 	uint64_t stat;
 
 	stat = 0;
 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
 		rxr = &sc->hn_rx_ring[i];
 		stat += *((int *)((uint8_t *)rxr + ofs));
 	}
 
 	error = sysctl_handle_64(oidp, &stat, 0, req);
 	if (error || req->newptr == NULL)
 		return error;
 
 	/* Zero out this stat. */
 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
 		rxr = &sc->hn_rx_ring[i];
 		*((int *)((uint8_t *)rxr + ofs)) = 0;
 	}
 	return 0;
 }
 #else
 static int
 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct hn_softc *sc = arg1;
 	int ofs = arg2, i, error;
 	struct hn_rx_ring *rxr;
 	uint64_t stat;
 
 	stat = 0;
 	for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
 		rxr = &sc->hn_rx_ring[i];
 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
 	}
 
 	error = sysctl_handle_64(oidp, &stat, 0, req);
 	if (error || req->newptr == NULL)
 		return error;
 
 	/* Zero out this stat. */
 	for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
 		rxr = &sc->hn_rx_ring[i];
 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
 	}
 	return 0;
 }
 
 #endif
 
 static int
 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct hn_softc *sc = arg1;
 	int ofs = arg2, i, error;
 	struct hn_rx_ring *rxr;
 	u_long stat;
 
 	stat = 0;
 	for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
 		rxr = &sc->hn_rx_ring[i];
 		stat += *((u_long *)((uint8_t *)rxr + ofs));
 	}
 
 	error = sysctl_handle_long(oidp, &stat, 0, req);
 	if (error || req->newptr == NULL)
 		return error;
 
 	/* Zero out this stat. */
 	for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
 		rxr = &sc->hn_rx_ring[i];
 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
 	}
 	return 0;
 }
 
 static int
 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct hn_softc *sc = arg1;
 	int ofs = arg2, i, error;
 	struct hn_tx_ring *txr;
 	u_long stat;
 
 	stat = 0;
 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
 		txr = &sc->hn_tx_ring[i];
 		stat += *((u_long *)((uint8_t *)txr + ofs));
 	}
 
 	error = sysctl_handle_long(oidp, &stat, 0, req);
 	if (error || req->newptr == NULL)
 		return error;
 
 	/* Zero out this stat. */
 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
 		txr = &sc->hn_tx_ring[i];
 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
 	}
 	return 0;
 }
 
 static int
 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct hn_softc *sc = arg1;
 	int ofs = arg2, i, error, conf;
 	struct hn_tx_ring *txr;
 
 	txr = &sc->hn_tx_ring[0];
 	conf = *((int *)((uint8_t *)txr + ofs));
 
 	error = sysctl_handle_int(oidp, &conf, 0, req);
 	if (error || req->newptr == NULL)
 		return error;
 
 	HN_LOCK(sc);
 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
 		txr = &sc->hn_tx_ring[i];
 		*((int *)((uint8_t *)txr + ofs)) = conf;
 	}
 	HN_UNLOCK(sc);
 
 	return 0;
 }
 
 /*
  * Sysctl handler exposing the NDIS version (sc->hn_ndis_ver) as a
  * "major.minor" string.
  */
 static int
 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct hn_softc *sc = arg1;
 	char buf[16];
 
 	snprintf(buf, sizeof(buf), "%u.%u",
 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
 
 	return sysctl_handle_string(oidp, buf, sizeof(buf), req);
 }
 
 static int
 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct hn_softc *sc = arg1;
 	char caps_str[128];
 	uint32_t caps;
 
 	HN_LOCK(sc);
 	caps = sc->hn_caps;
 	HN_UNLOCK(sc);
 	snprintf(caps_str, sizeof(caps_str), "%b", caps,
 	    "\020"
 	    "\001VLAN"
 	    "\002MTU"
 	    "\003IPCS"
 	    "\004TCP4CS"
 	    "\005TCP6CS"
 	    "\006UDP4CS"
 	    "\007UDP6CS"
 	    "\010TSO4"
 	    "\011TSO6"
 	    "\012HASHVAL");
 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
 }
 
 static int
 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct hn_softc *sc = arg1;
 	char assist_str[128];
 	uint32_t hwassist;
 
 	HN_LOCK(sc);
 	hwassist = sc->hn_ifp->if_hwassist;
 	HN_UNLOCK(sc);
 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
 }
 
 static int
 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct hn_softc *sc = arg1;
 	char filter_str[128];
 	uint32_t filter;
 
 	HN_LOCK(sc);
 	filter = sc->hn_rx_filter;
 	HN_UNLOCK(sc);
 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
 	    NDIS_PACKET_TYPES);
 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
 }
 
 static int
 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct hn_softc *sc = arg1;
 	int error;
 
 	HN_LOCK(sc);
 
 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
 	if (error || req->newptr == NULL)
 		goto back;
 
 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
 	if (error)
 		goto back;
 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
 
 	if (sc->hn_rx_ring_inuse > 1) {
 		error = hn_rss_reconfig(sc);
 	} else {
 		/* Not RSS capable, at least for now; just save the RSS key. */
 		error = 0;
 	}
 back:
 	HN_UNLOCK(sc);
 	return (error);
 }
 
 static int
 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct hn_softc *sc = arg1;
 	int error;
 
 	HN_LOCK(sc);
 
 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
 	if (error || req->newptr == NULL)
 		goto back;
 
 	/*
 	 * Don't allow RSS indirect table change, if this interface is not
 	 * RSS capable currently.
 	 */
 	if (sc->hn_rx_ring_inuse == 1) {
 		error = EOPNOTSUPP;
 		goto back;
 	}
 
 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
 	if (error)
 		goto back;
 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
 
 	hn_rss_ind_fixup(sc, sc->hn_rx_ring_inuse);
 	error = hn_rss_reconfig(sc);
 back:
 	HN_UNLOCK(sc);
 	return (error);
 }
 
 /*
  * Sanity check the IPv4 packet found at offset hoff in mbuf chain m.
  *
  * Returns the IP protocol number when the IP header (and, for TCP and
  * UDP, the transport header) is consistent with the lengths recorded in
  * the packet and lies entirely in the first mbuf.  Returns IPPROTO_DONE
  * for anything else, including IP fragments.
  */
 static int
 hn_check_iplen(const struct mbuf *m, int hoff)
 {
 	const struct tcphdr *tcp;
 	const struct ip *iph;
 	int min_len, ip_hdrlen, ip_totlen;
 	int tcp_hdrlen;				/* TCP data offset */
 
 	/*
 	 * The packet must hold at least a fixed-size IP header, and that
 	 * header must sit completely in the first mbuf.
 	 */
 	min_len = hoff + sizeof(struct ip);
 	if (m->m_pkthdr.len < min_len || m->m_len < min_len)
 		return (IPPROTO_DONE);
 
 	iph = mtodo(m, hoff);
 
 	/*
 	 * The stated IP header length must not be below the minimum, and
 	 * the full header (options included) must be in the first mbuf.
 	 */
 	ip_hdrlen = iph->ip_hl << 2;
 	if (ip_hdrlen < sizeof(struct ip) || m->m_len < hoff + ip_hdrlen)
 		return (IPPROTO_DONE);
 
 	/*
 	 * The buffers must carry at least as much data as the IP header
 	 * says the packet contains.
 	 */
 	ip_totlen = ntohs(iph->ip_len);
 	if (m->m_pkthdr.len < hoff + ip_totlen)
 		return (IPPROTO_DONE);
 
 	/* IP fragments are ignored. */
 	if (ntohs(iph->ip_off) & (IP_OFFMASK | IP_MF))
 		return (IPPROTO_DONE);
 
 	/*
 	 * The TCP or UDP header must be covered by the IP length and be
 	 * entirely contained in the first mbuf.
 	 */
 	if (iph->ip_p == IPPROTO_TCP) {
 		if (ip_totlen < ip_hdrlen + sizeof(struct tcphdr) ||
 		    m->m_len < hoff + ip_hdrlen + sizeof(struct tcphdr))
 			return (IPPROTO_DONE);
 
 		tcp = (const struct tcphdr *)((const uint8_t *)iph + ip_hdrlen);
 		tcp_hdrlen = tcp->th_off << 2;
 		if (tcp_hdrlen < sizeof(struct tcphdr) ||
 		    tcp_hdrlen + ip_hdrlen > ip_totlen)
 			return (IPPROTO_DONE);
 		if (m->m_len < hoff + ip_hdrlen + tcp_hdrlen)
 			return (IPPROTO_DONE);
 	} else if (iph->ip_p == IPPROTO_UDP) {
 		if (ip_totlen < ip_hdrlen + sizeof(struct udphdr) ||
 		    m->m_len < hoff + ip_hdrlen + sizeof(struct udphdr))
 			return (IPPROTO_DONE);
 	} else if (ip_totlen < ip_hdrlen) {
 		return (IPPROTO_DONE);
 	}
 
 	return (iph->ip_p);
 }
 
 /*
  * Allocate and initialize the RX side of the softc.
  *
  * Allocates the RXBUF shared by all channels, an array of ring_cnt RX
  * rings, and for each ring a bufring and a read buffer; initializes LRO
  * per ring and registers the RX related sysctl nodes under the device's
  * sysctl tree.  All created rings are initially marked as in use.
  *
  * Returns 0 on success or ENOMEM if the RXBUF or a bufring can not be
  * allocated.  Nothing already allocated is released here on failure;
  * NOTE(review): presumably the caller cleans up through
  * hn_destroy_rx_data() -- confirm against the caller.
  */
 static int
 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
 {
 	struct sysctl_oid_list *child;
 	struct sysctl_ctx_list *ctx;
 	device_t dev = sc->hn_dev;
 #if defined(INET) || defined(INET6)
 #if __FreeBSD_version >= 1100095
 	int lroent_cnt;
 #endif
 #endif
 	int i;
 
 	/*
 	 * Create RXBUF for reception.
 	 *
 	 * NOTE:
 	 * - It is shared by all channels.
 	 * - A large enough buffer is allocated, certain version of NVSes
 	 *   may further limit the usable space.
 	 */
 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
 	    PAGE_SIZE, 0, NETVSC_RECEIVE_BUFFER_SIZE, &sc->hn_rxbuf_dma,
 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
 	if (sc->hn_rxbuf == NULL) {
 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
 		return (ENOMEM);
 	}
 
 	/* Every created ring starts out in use. */
 	sc->hn_rx_ring_cnt = ring_cnt;
 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
 
 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
 	    M_NETVSC, M_WAITOK | M_ZERO);
 
 #if defined(INET) || defined(INET6)
 #if __FreeBSD_version >= 1100095
 	/* Use the tunable entry count, but never less than TCP_LRO_ENTRIES. */
 	lroent_cnt = hn_lro_entry_count;
 	if (lroent_cnt < TCP_LRO_ENTRIES)
 		lroent_cnt = TCP_LRO_ENTRIES;
 	if (bootverbose)
 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
 #endif
 #endif	/* INET || INET6 */
 
 	ctx = device_get_sysctl_ctx(dev);
 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
 
 	/* Create dev.hn.UNIT.rx sysctl tree */
 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
 
 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
 
 		/*
 		 * One allocation of twice the ring buffer size per RX ring;
 		 * hn_chan_attach() hands it to the channel as a TX part and
 		 * an RX part of NETVSC_DEVICE_RING_BUFFER_SIZE each.
 		 */
 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
 		    PAGE_SIZE, 0,
 		    NETVSC_DEVICE_RING_BUFFER_SIZE +
 		    NETVSC_DEVICE_RING_BUFFER_SIZE,
 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
 		if (rxr->hn_br == NULL) {
 			device_printf(dev, "allocate bufring failed\n");
 			return (ENOMEM);
 		}
 
 		/* Seed the per-ring host checksum trust bits from the tunables. */
 		if (hn_trust_hosttcp)
 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
 		if (hn_trust_hostudp)
 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
 		if (hn_trust_hostip)
 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
 		rxr->hn_ifp = sc->hn_ifp;
 		/* Pair the RX ring with the TX ring of the same index, if any. */
 		if (i < sc->hn_tx_ring_cnt)
 			rxr->hn_txr = &sc->hn_tx_ring[i];
 		rxr->hn_rdbuf = malloc(NETVSC_PACKET_SIZE, M_NETVSC, M_WAITOK);
 		rxr->hn_rx_idx = i;
 		rxr->hn_rxbuf = sc->hn_rxbuf;
 
 		/*
 		 * Initialize LRO.
 		 */
 #if defined(INET) || defined(INET6)
 #if __FreeBSD_version >= 1100095
 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
 		    hn_lro_mbufq_depth);
 #else
 		tcp_lro_init(&rxr->hn_lro);
 		rxr->hn_lro.ifp = sc->hn_ifp;
 #endif
 #if __FreeBSD_version >= 1100099
 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
 #endif
 #endif	/* INET || INET6 */
 
 		/* Per-ring sysctls are only added if the "rx" node exists. */
 		if (sc->hn_rx_sysctl_tree != NULL) {
 			char name[16];
 
 			/*
 			 * Create per RX ring sysctl tree:
 			 * dev.hn.UNIT.rx.RINGID
 			 */
 			snprintf(name, sizeof(name), "%d", i);
 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
 
 			if (rxr->hn_rx_sysctl_tree != NULL) {
 				SYSCTL_ADD_ULONG(ctx,
 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
 				    OID_AUTO, "packets", CTLFLAG_RW,
 				    &rxr->hn_pkts, "# of packets received");
 				SYSCTL_ADD_ULONG(ctx,
 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
 				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
 				    &rxr->hn_rss_pkts,
 				    "# of packets w/ RSS info received");
 			}
 		}
 	}
 
 	/*
 	 * Device wide RX sysctls.  The statistics handlers receive the
 	 * offset of the field within struct hn_rx_ring as arg2.
 	 */
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
 #if __FreeBSD_version < 1100095
 	    hn_rx_stat_int_sysctl,
 #else
 	    hn_rx_stat_u64_sysctl,
 #endif
 	    "LU", "LRO queued");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
 #if __FreeBSD_version < 1100095
 	    hn_rx_stat_int_sysctl,
 #else
 	    hn_rx_stat_u64_sysctl,
 #endif
 	    "LU", "LRO flushed");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
 #if __FreeBSD_version >= 1100099
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
 	    hn_lro_lenlim_sysctl, "IU",
 	    "Max # of data bytes to be aggregated by LRO");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
 	    hn_lro_ackcnt_sysctl, "I",
 	    "Max # of ACKs to be aggregated by LRO");
 #endif
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
 	    hn_trust_hcsum_sysctl, "I",
 	    "Trust tcp segement verification on host side, "
 	    "when csum info is missing");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
 	    hn_trust_hcsum_sysctl, "I",
 	    "Trust udp datagram verification on host side, "
 	    "when csum info is missing");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
 	    hn_trust_hcsum_sysctl, "I",
 	    "Trust ip packet verification on host side, "
 	    "when csum info is missing");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
 	    hn_rx_stat_ulong_sysctl, "LU",
 	    "# of packets that we trust host's csum verification");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
 
 	return (0);
 }
 
 /*
  * Release the RX resources held by the softc: the shared RXBUF, and for
  * every created RX ring that owns a bufring, the bufring itself, its LRO
  * state and its read buffer.  Finally the ring array is freed and the
  * ring counters are reset to 0.
  */
 static void
 hn_destroy_rx_data(struct hn_softc *sc)
 {
 	struct hn_rx_ring *ring;
 	int idx;
 
 	if (sc->hn_rxbuf != NULL) {
 		hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
 		sc->hn_rxbuf = NULL;
 	}
 
 	/* Without any created ring there is nothing more to release. */
 	if (sc->hn_rx_ring_cnt == 0)
 		return;
 
 	for (idx = 0; idx < sc->hn_rx_ring_cnt; ++idx) {
 		ring = &sc->hn_rx_ring[idx];
 
 		/* A ring without a bufring is left untouched. */
 		if (ring->hn_br != NULL) {
 			hyperv_dmamem_free(&ring->hn_br_dma, ring->hn_br);
 			ring->hn_br = NULL;
 
 #if defined(INET) || defined(INET6)
 			tcp_lro_free(&ring->hn_lro);
 #endif
 			free(ring->hn_rdbuf, M_NETVSC);
 		}
 	}
 
 	free(sc->hn_rx_ring, M_NETVSC);
 	sc->hn_rx_ring = NULL;
 	sc->hn_rx_ring_cnt = 0;
 	sc->hn_rx_ring_inuse = 0;
 }
 
 /*
  * Initialize TX ring number id of the softc.
  *
  * Sets up the ring's locks, its TX descriptor array and free list (or
  * buf_ring), the transmit tasks matching the selected transmit method
  * (hn_use_if_start), the DMA tags, the per-descriptor RNDIS packet
  * message and data DMA map, and the per-ring sysctl nodes.
  *
  * Returns 0 on success or the error reported by the failing bus_dma
  * call.  On failure only the resources of the descriptor being set up
  * are released here; NOTE(review): descriptors already on the list, the
  * DMA tags and the locks are presumably released by the caller through
  * hn_destroy_tx_ring() -- confirm against the caller.
  */
 static int
 hn_create_tx_ring(struct hn_softc *sc, int id)
 {
 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
 	device_t dev = sc->hn_dev;
 	bus_dma_tag_t parent_dtag;
 	int error, i;
 
 	txr->hn_sc = sc;
 	txr->hn_tx_idx = id;
 
 #ifndef HN_USE_TXDESC_BUFRING
 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
 #endif
 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
 
 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
 	    M_NETVSC, M_WAITOK | M_ZERO);
 #ifndef HN_USE_TXDESC_BUFRING
 	SLIST_INIT(&txr->hn_txlist);
 #else
 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_NETVSC,
 	    M_WAITOK, &txr->hn_tx_lock);
 #endif
 
 	txr->hn_tx_taskq = sc->hn_tx_taskq;
 
 	/*
 	 * Pick the txeof callback and the tasks for the transmit method in
 	 * use; the mbuf buf_ring is only needed when if_start is not used.
 	 */
 	if (hn_use_if_start) {
 		txr->hn_txeof = hn_start_txeof;
 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
 	} else {
 		int br_depth;
 
 		txr->hn_txeof = hn_xmit_txeof;
 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
 
 		br_depth = hn_get_txswq_depth(txr);
 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_NETVSC,
 		    M_WAITOK, &txr->hn_tx_lock);
 	}
 
 	txr->hn_direct_tx_size = hn_direct_tx_size;
 
 	/*
 	 * Always schedule transmission instead of trying to do direct
 	 * transmission.  This one gives the best performance so far.
 	 */
 	txr->hn_sched_tx = 1;
 
 	parent_dtag = bus_get_dma_tag(dev);
 
 	/* DMA tag for RNDIS packet messages. */
 	error = bus_dma_tag_create(parent_dtag, /* parent */
 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
 	    BUS_SPACE_MAXADDR,		/* lowaddr */
 	    BUS_SPACE_MAXADDR,		/* highaddr */
 	    NULL, NULL,			/* filter, filterarg */
 	    HN_RNDIS_PKT_LEN,		/* maxsize */
 	    1,				/* nsegments */
 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
 	    0,				/* flags */
 	    NULL,			/* lockfunc */
 	    NULL,			/* lockfuncarg */
 	    &txr->hn_tx_rndis_dtag);
 	if (error) {
 		device_printf(dev, "failed to create rndis dmatag\n");
 		return error;
 	}
 
 	/* DMA tag for data. */
 	error = bus_dma_tag_create(parent_dtag, /* parent */
 	    1,				/* alignment */
 	    HN_TX_DATA_BOUNDARY,	/* boundary */
 	    BUS_SPACE_MAXADDR,		/* lowaddr */
 	    BUS_SPACE_MAXADDR,		/* highaddr */
 	    NULL, NULL,			/* filter, filterarg */
 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
 	    0,				/* flags */
 	    NULL,			/* lockfunc */
 	    NULL,			/* lockfuncarg */
 	    &txr->hn_tx_data_dtag);
 	if (error) {
 		device_printf(dev, "failed to create data dmatag\n");
 		return error;
 	}
 
 	/* Set up every TX descriptor and place it on the free list. */
 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
 
 		txd->txr = txr;
 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
 
 		/*
 		 * Allocate and load RNDIS packet message.
 		 */
         	error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
 		    (void **)&txd->rndis_pkt,
 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
 		    &txd->rndis_pkt_dmap);
 		if (error) {
 			device_printf(dev,
 			    "failed to allocate rndis_packet_msg, %d\n", i);
 			return error;
 		}
 
 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
 		    txd->rndis_pkt_dmap,
 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
 		    BUS_DMA_NOWAIT);
 		if (error) {
 			device_printf(dev,
 			    "failed to load rndis_packet_msg, %d\n", i);
 			/* Undo the allocation made for this descriptor. */
 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
 			return error;
 		}
 
 		/* DMA map for TX data. */
 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
 		    &txd->data_dmap);
 		if (error) {
 			device_printf(dev,
 			    "failed to allocate tx data dmamap\n");
 			/* Undo the load and allocation of this descriptor. */
 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
 			    txd->rndis_pkt_dmap);
 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
 			return error;
 		}
 
 		/* All set, put it to list */
 		txd->flags |= HN_TXD_FLAG_ONLIST;
 #ifndef HN_USE_TXDESC_BUFRING
 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
 #else
 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
 #endif
 	}
 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
 
 	/* Per-ring sysctls are only added if the "tx" node exists. */
 	if (sc->hn_tx_sysctl_tree != NULL) {
 		struct sysctl_oid_list *child;
 		struct sysctl_ctx_list *ctx;
 		char name[16];
 
 		/*
 		 * Create per TX ring sysctl tree:
 		 * dev.hn.UNIT.tx.RINGID
 		 */
 		ctx = device_get_sysctl_ctx(dev);
 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
 
 		snprintf(name, sizeof(name), "%d", id);
 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
 
 		if (txr->hn_tx_sysctl_tree != NULL) {
 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
 
 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
 			    "# of available TX descs");
 			/* hn_oactive is only exported when if_start is not used. */
 			if (!hn_use_if_start) {
 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
 				    CTLFLAG_RD, &txr->hn_oactive, 0,
 				    "over active");
 			}
 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
 			    CTLFLAG_RW, &txr->hn_pkts,
 			    "# of packets transmitted");
 		}
 	}
 
 	return 0;
 }
 
 /*
  * Release the DMA resources of one TX descriptor: unload and free its
  * RNDIS packet message and destroy its data DMA map.  The descriptor
  * must neither hold an mbuf nor be DMA mapped.
  */
 static void
 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
 {
 	struct hn_tx_ring *ring;
 
 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
 
 	ring = txd->txr;
 
 	/* RNDIS packet message: unload the map, then free the memory. */
 	bus_dmamap_unload(ring->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
 	bus_dmamem_free(ring->hn_tx_rndis_dtag, txd->rndis_pkt,
 	    txd->rndis_pkt_dmap);
 
 	/* Map used for the TX data. */
 	bus_dmamap_destroy(ring->hn_tx_data_dtag, txd->data_dmap);
 }
 
 /*
  * Tear down one TX ring: release the DMA resources of every descriptor
  * found on the free list (or descriptor buf_ring), destroy the DMA tags,
  * free the descriptor array and the mbuf buf_ring, and destroy the
  * ring's locks.  A ring without a descriptor array is left untouched.
  */
 static void
 hn_destroy_tx_ring(struct hn_tx_ring *txr)
 {
 	struct hn_txdesc *desc;
 
 	if (txr->hn_txdesc == NULL)
 		return;
 
 #ifndef HN_USE_TXDESC_BUFRING
 	for (;;) {
 		desc = SLIST_FIRST(&txr->hn_txlist);
 		if (desc == NULL)
 			break;
 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
 		hn_txdesc_dmamap_destroy(desc);
 	}
 #else
 	mtx_lock(&txr->hn_tx_lock);
 	for (;;) {
 		desc = buf_ring_dequeue_sc(txr->hn_txdesc_br);
 		if (desc == NULL)
 			break;
 		hn_txdesc_dmamap_destroy(desc);
 	}
 	mtx_unlock(&txr->hn_tx_lock);
 #endif
 
 	/* The tags go after the maps and memory created from them. */
 	if (txr->hn_tx_data_dtag != NULL)
 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
 	if (txr->hn_tx_rndis_dtag != NULL)
 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
 
 #ifdef HN_USE_TXDESC_BUFRING
 	buf_ring_free(txr->hn_txdesc_br, M_NETVSC);
 #endif
 
 	free(txr->hn_txdesc, M_NETVSC);
 	txr->hn_txdesc = NULL;
 
 	if (txr->hn_mbuf_br != NULL)
 		buf_ring_free(txr->hn_mbuf_br, M_NETVSC);
 
 #ifndef HN_USE_TXDESC_BUFRING
 	mtx_destroy(&txr->hn_txlist_spin);
 #endif
 	mtx_destroy(&txr->hn_tx_lock);
 }
 
 /*
  * Allocate and initialize the TX side of the softc.
  *
  * Allocates the chimney sending buffer shared by all channels and an
  * array of ring_cnt TX rings, initializes each ring through
  * hn_create_tx_ring() and registers the TX related sysctl nodes under
  * the device's sysctl tree.  All created rings are initially marked as
  * in use.
  *
  * Returns 0 on success, ENOMEM if the chimney sending buffer can not be
  * allocated, or the error returned by hn_create_tx_ring().  Nothing
  * already allocated is released here on failure; NOTE(review):
  * presumably the caller cleans up through hn_destroy_tx_data() --
  * confirm against the caller.
  */
 static int
 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
 {
 	struct sysctl_oid_list *child;
 	struct sysctl_ctx_list *ctx;
 	int i;
 
 	/*
 	 * Create TXBUF for chimney sending.
 	 *
 	 * NOTE: It is shared by all channels.
 	 */
 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
 	    PAGE_SIZE, 0, NETVSC_SEND_BUFFER_SIZE, &sc->hn_chim_dma,
 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
 	if (sc->hn_chim == NULL) {
 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
 		return (ENOMEM);
 	}
 
 	/* Every created ring starts out in use. */
 	sc->hn_tx_ring_cnt = ring_cnt;
 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
 
 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
 	    M_NETVSC, M_WAITOK | M_ZERO);
 
 	ctx = device_get_sysctl_ctx(sc->hn_dev);
 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
 
 	/* Create dev.hn.UNIT.tx sysctl tree */
 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
 
 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
 		int error;
 
 		error = hn_create_tx_ring(sc, i);
 		if (error)
 			return error;
 	}
 
 	/*
 	 * Device wide TX sysctls.  The statistics and configuration
 	 * handlers receive the offset of the field within struct
 	 * hn_tx_ring as arg2.
 	 */
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_tx_ring, hn_send_failed),
 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
 	/* Every ring gets the same descriptor count; export ring 0's. */
 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
 	    "# of total TX descs");
 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
 	    "Chimney send packet size upper boundary");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
 	    hn_tx_conf_int_sysctl, "I",
 	    "Size of the packet for direct transmission");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
 	    hn_tx_conf_int_sysctl, "I",
 	    "Always schedule transmission "
 	    "instead of doing direct transmission");
 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
 
 	return 0;
 }
 
 static void
 hn_set_chim_size(struct hn_softc *sc, int chim_size)
 {
 	int i;
 
 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
 }
 
 /*
  * Set the interface's if_hw_tsomax.
  *
  * The requested tso_maxlen is clamped to the range
  * [hn_ndis_tso_sgmin * mtu, IP_MAXPACKET], further limited by
  * hn_ndis_tso_szmax, and then reduced by the size of an Ethernet header
  * with VLAN encapsulation.  Nothing is done if the interface supports
  * neither TSO4 nor TSO6.
  */
 static void
 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
 {
 	struct ifnet *ifp = sc->hn_ifp;
 	int lower, limit;
 
 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
 		return;
 
 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
 	lower = sc->hn_ndis_tso_sgmin * mtu;
 
 	KASSERT(sc->hn_ndis_tso_szmax >= lower &&
 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
 
 	/* Clamp the request, then apply the NDIS upper limit. */
 	limit = tso_maxlen;
 	if (limit < lower)
 		limit = lower;
 	else if (limit > IP_MAXPACKET)
 		limit = IP_MAXPACKET;
 	if (limit > sc->hn_ndis_tso_szmax)
 		limit = sc->hn_ndis_tso_szmax;
 
 	ifp->if_hw_tsomax = limit - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
 	if (bootverbose)
 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
 }
 
 static void
 hn_fixup_tx_data(struct hn_softc *sc)
 {
 	uint64_t csum_assist;
 	int i;
 
 	hn_set_chim_size(sc, sc->hn_chim_szmax);
 	if (hn_tx_chimney_size > 0 &&
 	    hn_tx_chimney_size < sc->hn_chim_szmax)
 		hn_set_chim_size(sc, hn_tx_chimney_size);
 
 	csum_assist = 0;
 	if (sc->hn_caps & HN_CAP_IPCS)
 		csum_assist |= CSUM_IP;
 	if (sc->hn_caps & HN_CAP_TCP4CS)
 		csum_assist |= CSUM_IP_TCP;
 	if (sc->hn_caps & HN_CAP_UDP4CS)
 		csum_assist |= CSUM_IP_UDP;
 #ifdef notyet
 	if (sc->hn_caps & HN_CAP_TCP6CS)
 		csum_assist |= CSUM_IP6_TCP;
 	if (sc->hn_caps & HN_CAP_UDP6CS)
 		csum_assist |= CSUM_IP6_UDP;
 #endif
 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
 
 	if (sc->hn_caps & HN_CAP_HASHVAL) {
 		/*
 		 * Support HASHVAL pktinfo on TX path.
 		 */
 		if (bootverbose)
 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
 	}
 }
 
 /*
  * Release the TX resources held by the softc: the shared chimney
  * sending buffer, every created TX ring, and the ring array itself.
  * The ring counters are reset to 0.
  */
 static void
 hn_destroy_tx_data(struct hn_softc *sc)
 {
 	int ring;
 
 	if (sc->hn_chim != NULL) {
 		hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
 		sc->hn_chim = NULL;
 	}
 
 	/* Without any created ring there is nothing more to release. */
 	if (sc->hn_tx_ring_cnt == 0)
 		return;
 
 	for (ring = 0; ring < sc->hn_tx_ring_cnt; ++ring) {
 		struct hn_tx_ring *txr = &sc->hn_tx_ring[ring];
 
 		hn_destroy_tx_ring(txr);
 	}
 
 	free(sc->hn_tx_ring, M_NETVSC);
 	sc->hn_tx_ring = NULL;
 	sc->hn_tx_ring_cnt = 0;
 	sc->hn_tx_ring_inuse = 0;
 }
 
 /*
  * Task handler: run hn_start_locked() on the given TX ring, without a
  * length limit, while holding the ring's TX lock.
  */
 static void
 hn_start_taskfunc(void *xtxr, int pending __unused)
 {
 	struct hn_tx_ring *ring;
 
 	ring = xtxr;
 
 	mtx_lock(&ring->hn_tx_lock);
 	hn_start_locked(ring, 0);
 	mtx_unlock(&ring->hn_tx_lock);
 }
 
 /*
  * Task handler: clear IFF_DRV_OACTIVE on the interface and run
  * hn_start_locked() on the given TX ring, without a length limit, all
  * while holding the ring's TX lock.
  */
 static void
 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
 {
 	struct hn_tx_ring *ring;
 
 	ring = xtxr;
 
 	mtx_lock(&ring->hn_tx_lock);
 	atomic_clear_int(&ring->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
 	hn_start_locked(ring, 0);
 	mtx_unlock(&ring->hn_tx_lock);
 }
 
 /*
  * Drain the TX ring's mbuf buf_ring, sending each packet to the host.
  *
  * The caller must hold the ring's hn_tx_lock, and if_start must not be
  * in use.  When len is greater than 0, a packet longer than len is not
  * sent from here: it is put back and 1 is returned so that the caller
  * dispatches the sending to the TX taskqueue.
  *
  * Returns 1 in the case described above and 0 otherwise, i.e. also when
  * the ring is suspended, the interface is not running, the ring is
  * already marked oactive, or sending stopped because of a descriptor
  * shortage or a send failure (in which case the ring is marked
  * oactive).
  */
 static int
 hn_xmit(struct hn_tx_ring *txr, int len)
 {
 	struct hn_softc *sc = txr->hn_sc;
 	struct ifnet *ifp = sc->hn_ifp;
 	struct mbuf *m_head;
 
 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
 	KASSERT(hn_use_if_start == 0,
 	    ("hn_xmit is called, when if_start is enabled"));
 
 	if (__predict_false(txr->hn_suspended))
 		return 0;
 
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
 		return 0;
 
 	/*
 	 * The head packet is only peeked at; it is removed from the
 	 * buf_ring with drbr_advance() once it has been consumed, or handed
 	 * back with drbr_putback() when it has to stay queued.
 	 */
 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
 		struct hn_txdesc *txd;
 		int error;
 
 		if (len > 0 && m_head->m_pkthdr.len > len) {
 			/*
 			 * This sending could be time consuming; let callers
 			 * dispatch this packet sending (and sending of any
 			 * following up packets) to tx taskqueue.
 			 */
 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
 			return 1;
 		}
 
 		txd = hn_txdesc_get(txr);
 		if (txd == NULL) {
 			/* Out of TX descriptors: keep the packet queued. */
 			txr->hn_no_txdescs++;
 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
 			txr->hn_oactive = 1;
 			break;
 		}
 
 		error = hn_encap(txr, txd, &m_head);
 		if (error) {
 			/* Both txd and m_head are freed; discard */
 			drbr_advance(ifp, txr->hn_mbuf_br);
 			continue;
 		}
 
 		error = hn_send_pkt(ifp, txr, txd);
 		if (__predict_false(error)) {
 			/* txd is freed, but m_head is not */
 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
 			txr->hn_oactive = 1;
 			break;
 		}
 
 		/* Sent */
 		drbr_advance(ifp, txr->hn_mbuf_br);
 	}
 	return 0;
 }
 
 static int
 hn_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	struct hn_softc *sc = ifp->if_softc;
 	struct hn_tx_ring *txr;
 	int error, idx = 0;
 
 	/*
 	 * Select the TX ring based on flowid
 	 */
 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
 		idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
 	txr = &sc->hn_tx_ring[idx];
 
 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
 	if (error) {
 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
 		return error;
 	}
 
 	if (txr->hn_oactive)
 		return 0;
 
 	if (txr->hn_sched_tx)
 		goto do_sched;
 
 	if (mtx_trylock(&txr->hn_tx_lock)) {
 		int sched;
 
 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
 		mtx_unlock(&txr->hn_tx_lock);
 		if (!sched)
 			return 0;
 	}
 do_sched:
 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
 	return 0;
 }
 
 /*
  * Free every mbuf still queued on the TX ring's mbuf buf_ring, holding
  * the ring's TX lock while doing so.
  */
 static void
 hn_tx_ring_qflush(struct hn_tx_ring *txr)
 {
 	struct mbuf *pkt;
 
 	mtx_lock(&txr->hn_tx_lock);
 	for (pkt = buf_ring_dequeue_sc(txr->hn_mbuf_br); pkt != NULL;
 	    pkt = buf_ring_dequeue_sc(txr->hn_mbuf_br))
 		m_freem(pkt);
 	mtx_unlock(&txr->hn_tx_lock);
 }
 
 /*
  * Flush the mbuf buf_ring of every in-use TX ring, then let if_qflush()
  * flush the interface's own queue.
  */
 static void
 hn_xmit_qflush(struct ifnet *ifp)
 {
 	struct hn_softc *sc = ifp->if_softc;
 	int ring;
 
 	for (ring = 0; ring < sc->hn_tx_ring_inuse; ++ring) {
 		struct hn_tx_ring *txr = &sc->hn_tx_ring[ring];
 
 		hn_tx_ring_qflush(txr);
 	}
 	if_qflush(ifp);
 }
 
 /*
  * TX completion handler used when if_start is not in use (installed as
  * the ring's hn_txeof by hn_create_tx_ring()).
  *
  * Clears the ring's oactive mark and restarts transmission: directly
  * through hn_xmit() if the ring is not set to always schedule and its
  * TX lock can be taken without blocking, otherwise by enqueueing the
  * ring's txeof task.  If the direct attempt asks for it, the ring's TX
  * task is enqueued to continue the sending.
  */
 static void
 hn_xmit_txeof(struct hn_tx_ring *txr)
 {
 
 	if (txr->hn_sched_tx)
 		goto do_sched;
 
 	if (mtx_trylock(&txr->hn_tx_lock)) {
 		int sched;
 
 		txr->hn_oactive = 0;
 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
 		mtx_unlock(&txr->hn_tx_lock);
 		if (sched) {
 			taskqueue_enqueue(txr->hn_tx_taskq,
 			    &txr->hn_tx_task);
 		}
 	} else {
 		/* Also reached by the goto above when scheduling is forced. */
 do_sched:
 		/*
 		 * Release the oactive earlier, with the hope, that
 		 * others could catch up.  The task will clear the
 		 * oactive again with the hn_tx_lock to avoid possible
 		 * races.
 		 */
 		txr->hn_oactive = 0;
 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
 	}
 }
 
 /*
  * Task handler: run hn_xmit() on the given TX ring, without a length
  * limit, while holding the ring's TX lock.
  */
 static void
 hn_xmit_taskfunc(void *xtxr, int pending __unused)
 {
 	struct hn_tx_ring *ring;
 
 	ring = xtxr;
 
 	mtx_lock(&ring->hn_tx_lock);
 	hn_xmit(ring, 0);
 	mtx_unlock(&ring->hn_tx_lock);
 }
 
 /*
  * Task handler: clear the ring's oactive mark and run hn_xmit() on the
  * given TX ring, without a length limit, all while holding the ring's
  * TX lock.
  */
 static void
 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
 {
 	struct hn_tx_ring *ring;
 
 	ring = xtxr;
 
 	mtx_lock(&ring->hn_tx_lock);
 	ring->hn_oactive = 0;
 	hn_xmit(ring, 0);
 	mtx_unlock(&ring->hn_tx_lock);
 }
 
 /*
  * Attach a vmbus channel to the RX ring, and the TX ring if there is
  * one, whose index equals the channel's sub-index, then open the
  * channel on the RX ring's bufring.
  *
  * The rings are marked attached before the channel is opened; if the
  * open fails the marks are removed again.
  *
  * Returns 0 on success or the error from vmbus_chan_open_br().
  */
 static int
 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
 {
 	struct vmbus_chan_br cbr;
 	struct hn_rx_ring *rxr;
 	struct hn_tx_ring *txr = NULL;
 	int idx, error;
 
 	idx = vmbus_chan_subidx(chan);
 
 	/*
 	 * Link this channel to RX/TX ring.
 	 */
 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
 	    ("invalid channel index %d, should >= 0 && < %d",
 	     idx, sc->hn_rx_ring_inuse));
 	rxr = &sc->hn_rx_ring[idx];
 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
 	    ("RX ring %d already attached", idx));
 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
 
 	if (bootverbose) {
 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
 		    idx, vmbus_chan_id(chan));
 	}
 
 	/* There may be fewer in-use TX rings than RX rings. */
 	if (idx < sc->hn_tx_ring_inuse) {
 		txr = &sc->hn_tx_ring[idx];
 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
 		    ("TX ring %d already attached", idx));
 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
 
 		txr->hn_chan = chan;
 		if (bootverbose) {
 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
 			    idx, vmbus_chan_id(chan));
 		}
 	}
 
 	/* Bind this channel to a proper CPU. */
 	vmbus_chan_cpu_set(chan, (sc->hn_cpu + idx) % mp_ncpus);
 
 	/*
 	 * Open this channel
 	 *
 	 * The RX ring's bufring is handed over as a TX part and an RX
 	 * part of NETVSC_DEVICE_RING_BUFFER_SIZE each.
 	 */
 	cbr.cbr = rxr->hn_br;
 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
 	cbr.cbr_txsz = NETVSC_DEVICE_RING_BUFFER_SIZE;
 	cbr.cbr_rxsz = NETVSC_DEVICE_RING_BUFFER_SIZE;
 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
 	if (error) {
 		if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
 		    vmbus_chan_id(chan), error);
 		/* Undo the attach marks set above. */
 		rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
 		if (txr != NULL)
 			txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
 	}
 	return (error);
 }
 
 /*
  * Detach a vmbus channel from the RX ring, and the TX ring if there is
  * one, whose index equals the channel's sub-index, then close the
  * channel.
  */
 static void
 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
 {
 	struct hn_rx_ring *rxr;
 	int idx;
 
 	idx = vmbus_chan_subidx(chan);
 
 	/*
 	 * Unlink this channel from RX/TX ring.
 	 */
 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
 	    ("invalid channel index %d, should >= 0 && < %d",
 	     idx, sc->hn_rx_ring_inuse));
 	rxr = &sc->hn_rx_ring[idx];
 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
 	    ("RX ring %d is not attached", idx));
 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
 
 	/* There may be fewer in-use TX rings than RX rings. */
 	if (idx < sc->hn_tx_ring_inuse) {
 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
 
 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
 		    ("TX ring %d is not attached", idx));
 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
 	}
 
 	/*
 	 * Close this channel.
 	 *
 	 * NOTE:
 	 * Channel closing does _not_ destroy the target channel.
 	 */
 	vmbus_chan_close(chan);
 }
 
 /*
  * Attach every sub-channel of the primary channel; the number of
  * sub-channels is hn_rx_ring_inuse - 1.  Attaching stops at the first
  * failure.
  *
  * Returns 0 on success (including when there are no sub-channels), or the
  * error reported by hn_chan_attach().
  */
 static int
 hn_attach_subchans(struct hn_softc *sc)
 {
 	const int nsub = sc->hn_rx_ring_inuse - 1;
 	struct vmbus_channel **chans;
 	int n, rc;
 
 	/* Primary channel only; nothing to do. */
 	if (nsub == 0)
 		return (0);
 
 	chans = vmbus_subchan_get(sc->hn_prichan, nsub);
 	rc = 0;
 	for (n = 0; n < nsub && rc == 0; ++n)
 		rc = hn_chan_attach(sc, chans[n]);
 	vmbus_subchan_rel(chans, nsub);
 
 	if (rc != 0) {
 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", rc);
 		return (rc);
 	}
 	if (bootverbose) {
 		if_printf(sc->hn_ifp, "%d sub-channels attached\n",
 		    nsub);
 	}
 	return (0);
 }
 
 /*
  * Detach all channels in use: first every sub-channel, then the primary
  * channel, and finally wait for the sub-channels to be drained.
  *
  * With INVARIANTS, verify afterwards that no RX or TX ring is still marked
  * as attached.
  */
 static void
 hn_detach_allchans(struct hn_softc *sc)
 {
 	struct vmbus_channel **subchans;
 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
 	int i;
 
 	/* No sub-channels in use; only the primary channel is detached. */
 	if (subchan_cnt == 0)
 		goto back;
 
 	/* Detach the sub-channels. */
 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
 	for (i = 0; i < subchan_cnt; ++i)
 		hn_chan_detach(sc, subchans[i]);
 	vmbus_subchan_rel(subchans, subchan_cnt);
 
 back:
 	/*
 	 * Detach the primary channel, _after_ all sub-channels
 	 * are detached.
 	 */
 	hn_chan_detach(sc, sc->hn_prichan);
 
 	/* Wait for sub-channels to be destroyed, if any. */
 	vmbus_subchan_drain(sc->hn_prichan);
 
 #ifdef INVARIANTS
 	/* Check all allocated rings, not only the ones that were in use. */
 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
 		    HN_RX_FLAG_ATTACHED) == 0,
 		    ("%dth RX ring is still attached", i));
 	}
 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
 		    HN_TX_FLAG_ATTACHED) == 0,
 		    ("%dth TX ring is still attached", i));
 	}
 #endif
 }
 
 /*
  * Try to allocate sub-channels for multi-ring operation.
  *
  * On entry *nsubch is the number of sub-channels wanted; on return it is
  * the number actually allocated, which is 0 whenever only the primary
  * channel can be used.  Every failure is treated as benign, so this
  * function always returns 0.
  */
 static int
 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
 {
 	struct vmbus_channel **subchans;
 	int want, offered;
 
 	want = *nsubch + 1;
 	if (want == 1) {
 		/* Multiple RX/TX rings are not requested. */
 		goto single;
 	}
 
 	/*
 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
 	 * table entries.  Lack of RSS is not an error.
 	 */
 	if (hn_rndis_query_rsscaps(sc, &offered) != 0)
 		goto single;
 	if (bootverbose) {
 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
 		    offered, want);
 	}
 
 	/* Never ask for more channels than the RX rings offered. */
 	if (offered < want)
 		want = offered;
 	if (want == 1) {
 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
 		goto single;
 	}
 
 	/*
 	 * Allocate sub-channels from NVS; fall back to the primary channel
 	 * alone if that fails or yields nothing.
 	 */
 	*nsubch = want - 1;
 	if (hn_nvs_alloc_subchans(sc, nsubch) != 0 || *nsubch == 0)
 		goto single;
 
 	/*
 	 * Wait for all sub-channels to become ready before moving on.
 	 */
 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
 	vmbus_subchan_rel(subchans, *nsubch);
 	return (0);
 
 single:
 	/* Only the primary channel will be used. */
 	*nsubch = 0;
 	return (0);
 }
 
 /*
  * Attach the synthetic parts: the primary channel, NVS, RNDIS, and then any
  * sub-channels, configuring the RSS key and indirect table when more than
  * one channel is usable.  HN_FLAG_SYNTH_ATTACHED is set on success.
  *
  * Returns 0 on success or an error number.
  *
  * NOTE(review): the error returns below do not undo the steps that already
  * succeeded (e.g. the primary channel stays attached if hn_nvs_attach()
  * fails), and sc->hn_caps is only restored on the caps-mismatch path --
  * confirm that callers handle the cleanup.
  */
 static int
 hn_synth_attach(struct hn_softc *sc, int mtu)
 {
 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
 	int error, nsubch, nchan, i;
 	uint32_t old_caps;
 
 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
 	    ("synthetic parts were attached"));
 
 	/* Save capabilities for later verification. */
 	old_caps = sc->hn_caps;
 	sc->hn_caps = 0;
 
 	/*
 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
 	 */
 	error = hn_chan_attach(sc, sc->hn_prichan);
 	if (error)
 		return (error);
 
 	/*
 	 * Attach NVS.
 	 */
 	error = hn_nvs_attach(sc, mtu);
 	if (error)
 		return (error);
 
 	/*
 	 * Attach RNDIS _after_ NVS is attached.
 	 */
 	error = hn_rndis_attach(sc, mtu);
 	if (error)
 		return (error);
 
 	/*
 	 * Make sure capabilities are not changed.
 	 */
 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
 		    old_caps, sc->hn_caps);
 		/* Restore old capabilities and abort. */
 		sc->hn_caps = old_caps;
 		return ENXIO;
 	}
 
 	/*
 	 * Allocate sub-channels for multi-TX/RX rings.
 	 *
 	 * NOTE:
 	 * The # of RX rings that can be used is equivalent to the # of
 	 * channels to be requested.
 	 */
 	nsubch = sc->hn_rx_ring_cnt - 1;
 	error = hn_synth_alloc_subchans(sc, &nsubch);
 	if (error)
 		return (error);
 
 	/* nsubch now holds the # of sub-channels actually allocated. */
 	nchan = nsubch + 1;
 	if (nchan == 1) {
 		/* Only the primary channel can be used; done */
 		goto back;
 	}
 
 	/*
 	 * Configure RSS key and indirect table _after_ all sub-channels
 	 * are allocated.
 	 */
 
 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
 		/*
 		 * RSS key is not set yet; set it to the default RSS key.
 		 */
 		if (bootverbose)
 			if_printf(sc->hn_ifp, "setup default RSS key\n");
 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
 	}
 
 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
 		/*
 		 * RSS indirect table is not set yet; set it up in round-
 		 * robin fashion.
 		 */
 		if (bootverbose) {
 			if_printf(sc->hn_ifp, "setup default RSS indirect "
 			    "table\n");
 		}
 		/* TODO: Take ndis_rss_caps.ndis_nind into account. */
 		for (i = 0; i < NDIS_HASH_INDCNT; ++i)
 			rss->rss_ind[i] = i % nchan;
 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
 	} else {
 		/*
 		 * # of usable channels may be changed, so we have to
 		 * make sure that all entries in RSS indirect table
 		 * are valid.
 		 */
 		hn_rss_ind_fixup(sc, nchan);
 	}
 
 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
 	if (error) {
 		/*
 		 * Failed to configure RSS key or indirect table; only
 		 * the primary channel can be used.
 		 */
 		nchan = 1;
 	}
 back:
 	/*
 	 * Set the # of TX/RX rings that could be used according to
 	 * the # of channels that NVS offered.
 	 */
 	hn_set_ring_inuse(sc, nchan);
 
 	/*
 	 * Attach the sub-channels, if any.
 	 */
 	error = hn_attach_subchans(sc);
 	if (error)
 		return (error);
 
 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
 	return (0);
 }
 
 /*
  * Detach the synthetic parts in the reverse order of hn_synth_attach():
  * RNDIS, then NVS, then all of the channels, and clear
  * HN_FLAG_SYNTH_ATTACHED.
  *
  * NOTE:
  * The interface must have been suspended through hn_suspend(), before
  * this function gets called.
  */
 static void
 hn_synth_detach(struct hn_softc *sc)
 {
 	HN_LOCK_ASSERT(sc);
 
 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
 	    ("synthetic parts were not attached"));
 
 	/* Detach the RNDIS first. */
 	hn_rndis_detach(sc);
 
 	/* Detach NVS. */
 	hn_nvs_detach(sc);
 
 	/* Detach all of the channels. */
 	hn_detach_allchans(sc);
 
 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
 }
 
 /*
  * Record how many rings are in use.  The RX count becomes ring_cnt; the TX
  * count becomes ring_cnt as well, capped at the number of TX rings that
  * exist (hn_tx_ring_cnt).
  */
 static void
 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
 {
 	int tx_inuse;
 
 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
 	    ("invalid ring count %d", ring_cnt));
 
 	/* tx_inuse = min(hn_tx_ring_cnt, ring_cnt) */
 	tx_inuse = sc->hn_tx_ring_cnt;
 	if (tx_inuse > ring_cnt)
 		tx_inuse = ring_cnt;
 
 	sc->hn_tx_ring_inuse = tx_inuse;
 	sc->hn_rx_ring_inuse = ring_cnt;
 
 	if (bootverbose) {
 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
 	}
 }
 
 /*
  * Sleep in one-tick steps until both the RX and the TX side of the channel
  * report empty, then drain the channel's interrupt processing.
  */
 static void
 hn_rx_drain(struct vmbus_channel *chan)
 {
 
 	for (;;) {
 		if (vmbus_chan_rx_empty(chan) && vmbus_chan_tx_empty(chan))
 			break;
 		pause("waitch", 1);
 	}
 	vmbus_chan_intr_drain(chan);
 }
 
 /*
  * Quiesce the data path: mark every in-use TX ring suspended and wait for
  * its pending sends and tasks, clear the RX filter, give the host some time
  * to flush, and finally drain the bufrings and interrupts of all channels
  * in use.
  */
 static void
 hn_suspend_data(struct hn_softc *sc)
 {
 	struct vmbus_channel **subch = NULL;
 	int i, nsubch;
 
 	HN_LOCK_ASSERT(sc);
 
 	/*
 	 * Suspend TX.
 	 */
 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
 
 		mtx_lock(&txr->hn_tx_lock);
 		txr->hn_suspended = 1;
 		mtx_unlock(&txr->hn_tx_lock);
 		/* No one is able to send more packets now. */
 
 		/* Wait for all pending sends to finish. */
 		while (hn_tx_ring_pending(txr))
 			pause("hnwtx", 1 /* 1 tick */);
 
 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
 	}
 
 	/*
 	 * Disable RX by clearing RX filter.
 	 */
 	hn_rndis_set_rxfilter(sc, 0);
 	sc->hn_rx_filter = 0;
 
 	/*
 	 * Give RNDIS enough time to flush all pending data packets.
 	 */
 	pause("waitrx", (200 * hz) / 1000);	/* 200ms expressed in ticks */
 
 	/*
 	 * Drain RX/TX bufrings and interrupts.
 	 */
 	nsubch = sc->hn_rx_ring_inuse - 1;
 	if (nsubch > 0)
 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
 
 	/* Sub-channels are drained first, the primary channel last. */
 	if (subch != NULL) {
 		for (i = 0; i < nsubch; ++i)
 			hn_rx_drain(subch[i]);
 	}
 	hn_rx_drain(sc->hn_prichan);
 
 	if (subch != NULL)
 		vmbus_subchan_rel(subch, nsubch);
 }
 
 /*
  * Task handler used by hn_suspend_mgmt(): clear hn_mgmt_taskq of the softc
  * passed as the task argument.
  */
 static void
 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
 {
 	struct hn_softc *sc = xsc;
 
 	sc->hn_mgmt_taskq = NULL;
 }
 
 /*
  * Suspend management processing: clear hn_mgmt_taskq by running a task on
  * the primary channel, then drain the management tasks from
  * hn_mgmt_taskq0.
  */
 static void
 hn_suspend_mgmt(struct hn_softc *sc)
 {
 	struct task task;
 
 	HN_LOCK_ASSERT(sc);
 
 	/*
 	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
 	 * through hn_mgmt_taskq.
 	 */
 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
 	/*
 	 * NOTE(review): 'task' lives on this stack frame, so this presumably
 	 * relies on vmbus_chan_run_task() returning only after the task has
 	 * run -- confirm.
 	 */
 	vmbus_chan_run_task(sc->hn_prichan, &task);
 
 	/*
 	 * Make sure that all pending management tasks are completed.
 	 */
 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
 }
 
 /*
  * Suspend the interface: the data path is suspended only when the
  * interface is marked IFF_DRV_RUNNING; management is always suspended.
  */
 static void
 hn_suspend(struct hn_softc *sc)
 {
 	const int running =
 	    (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) != 0;
 
 	if (running)
 		hn_suspend_data(sc);
 	hn_suspend_mgmt(sc);
 }
 
 /*
  * Clear the suspended mark on the first tx_ring_cnt TX rings, taking each
  * ring's TX lock while doing so.
  */
 static void
 hn_tx_resume(struct hn_softc *sc, int tx_ring_cnt)
 {
 	int ring = 0;
 
 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
 	    ("invalid TX ring count %d", tx_ring_cnt));
 
 	while (ring < tx_ring_cnt) {
 		struct hn_tx_ring *txr = &sc->hn_tx_ring[ring++];
 
 		mtx_lock(&txr->hn_tx_lock);
 		txr->hn_suspended = 0;
 		mtx_unlock(&txr->hn_tx_lock);
 	}
 }
 
 /*
  * Resume the data path: restore the RX filter, clear the suspended mark on
  * every TX ring, flush the queues of TX rings that are no longer in use
  * (unless if_start is used), and schedule the txeof task on each in-use TX
  * ring.
  */
 static void
 hn_resume_data(struct hn_softc *sc)
 {
 	int i;
 
 	HN_LOCK_ASSERT(sc);
 
 	/*
 	 * Re-enable RX.
 	 */
 	hn_set_rxfilter(sc);
 
 	/*
 	 * Make sure to clear suspend status on "all" TX rings,
 	 * since hn_tx_ring_inuse can be changed after
 	 * hn_suspend_data().
 	 */
 	hn_tx_resume(sc, sc->hn_tx_ring_cnt);
 
 	if (!hn_use_if_start) {
 		/*
 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
 		 * reduced.
 		 */
 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
 	}
 
 	/*
 	 * Kick start TX.
 	 */
 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
 
 		/*
 		 * Use txeof task, so that any pending oactive can be
 		 * cleared properly.
 		 */
 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
 	}
 }
 
 /*
  * Resume management processing: make hn_mgmt_taskq point at hn_mgmt_taskq0
  * again, then either run network change detection (if one was pending) or
  * the more lightweight link status update.
  */
 static void
 hn_resume_mgmt(struct hn_softc *sc)
 {
 
 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
 
 	if ((sc->hn_link_flags & HN_LINK_FLAG_NETCHG) == 0) {
 		/* No network change was pending; just check link status. */
 		hn_link_status_update(sc);
 		return;
 	}
 
 	/* Kick off the pending network change detection. */
 	hn_network_change(sc);
 }
 
 /*
  * Resume the interface: the data path is resumed only when the interface
  * is marked IFF_DRV_RUNNING; management is always resumed.
  */
 static void
 hn_resume(struct hn_softc *sc)
 {
 	const int running =
 	    (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) != 0;
 
 	if (running)
 		hn_resume_data(sc);
 	hn_resume_mgmt(sc);
 }
 
 /*
  * Handle an in-band NVS notification.  Packets too short to hold an NVS
  * header are reported as invalid; HN_NVS_TYPE_TXTBL_NOTE is silently
  * ignored; any other type is only logged.
  */
 static void
 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
 {
 	const struct hn_nvs_hdr *nvs;
 
 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*nvs)) {
 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
 		return;
 	}
 
 	nvs = VMBUS_CHANPKT_CONST_DATA(pkt);
 	/* TXTBL_NOTE is useless to us; everything else gets logged. */
 	if (nvs->nvs_type != HN_NVS_TYPE_TXTBL_NOTE) {
 		if_printf(sc->hn_ifp, "got notify, nvs type %u\n",
 		    nvs->nvs_type);
 	}
 }
 
 /*
  * Handle a completion packet: the packet's transaction id is interpreted
  * as a pointer to the hn_send_ctx of the original send, whose callback is
  * invoked with the packet's data and data length.
  */
 static void
 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
     const struct vmbus_chanpkt_hdr *pkt)
 {
 	struct hn_send_ctx *sndc;
 
 	/* Recover the send context pointer carried in the transaction id. */
 	sndc = (struct hn_send_ctx *)(uintptr_t)pkt->cph_xactid;
 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
 	    VMBUS_CHANPKT_DATALEN(pkt));
 	/*
 	 * NOTE:
 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
 	 * its callback.
 	 */
 }
 
 /*
  * Handle an RXBUF channel packet: validate the NVS header and the rxbuf
  * channel packet header, hand every described range of the receive buffer
  * to hv_rf_on_receive(), and finally acknowledge the packet to the host.
  *
  * A malformed packet is logged and dropped without being acknowledged.  A
  * single range that does not fit into the receive buffer is logged and
  * skipped; the remaining ranges are still processed.
  */
 static void
 hn_nvs_handle_rxbuf(struct hn_softc *sc, struct hn_rx_ring *rxr,
     struct vmbus_channel *chan, const struct vmbus_chanpkt_hdr *pkthdr)
 {
 	const struct vmbus_chanpkt_rxbuf *pkt;
 	const struct hn_nvs_hdr *nvs_hdr;
 	int count, i, hlen;
 
 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
 		return;
 	}
 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
 
 	/* Make sure that this is a RNDIS message. */
 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
 		    nvs_hdr->nvs_type);
 		return;
 	}
 
 	/* The channel packet header must be large enough for an rxbuf pkt. */
 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
 	if (__predict_false(hlen < sizeof(*pkt))) {
 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
 		return;
 	}
 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
 
 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
 		    pkt->cp_rxbuf_id);
 		return;
 	}
 
 	/* All 'count' range descriptors must fit into the header. */
 	count = pkt->cp_rxbuf_cnt;
 	if (__predict_false(hlen <
 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
 		return;
 	}
 
 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
 	for (i = 0; i < count; ++i) {
 		int ofs, len;
 
 		ofs = pkt->cp_rxbuf[i].rb_ofs;
 		len = pkt->cp_rxbuf[i].rb_len;
 		/*
 		 * The range must lie entirely within the receive buffer.
 		 * Negative values are rejected explicitly and the upper
 		 * bound is tested by subtraction, so that "ofs + len" can
 		 * neither overflow nor go negative and slip past the check.
 		 */
 		if (__predict_false(ofs < 0 || len < 0 ||
 		    ofs > NETVSC_RECEIVE_BUFFER_SIZE ||
 		    len > NETVSC_RECEIVE_BUFFER_SIZE - ofs)) {
 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
 			    "ofs %d, len %d\n", i, ofs, len);
 			continue;
 		}
 		hv_rf_on_receive(sc, rxr, rxr->hn_rxbuf + ofs, len);
 	}
 
 	/*
 	 * Moved completion call back here so that all received
 	 * messages (not just data messages) will trigger a response
 	 * message back to the host.
 	 */
 	hn_nvs_ack_rxbuf(chan, pkt->cp_hdr.cph_xactid);
 }
 
 /*
  * Net VSC on receive completion
  *
  * Send a receive completion packet (RNDIS ack with OK status) for
  * transaction 'tid' to the RNDIS device (ie NetVsp).  When the send
  * reports EAGAIN it is retried up to 3 more times, 100us apart; any other
  * result, or running out of retries, ends the attempt silently.
  */
 static void
 hn_nvs_ack_rxbuf(struct vmbus_channel *chan, uint64_t tid)
 {
 	struct hn_nvs_rndis_ack ack;
 	int attempt, error;
 
 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
 	ack.nvs_status = HN_NVS_STATUS_OK;
 
 	for (attempt = 1; ; ++attempt) {
 		/* Send the completion */
 		error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
 		    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
 
 		/*
 		 * Done on success or on a hard error; also give up once
 		 * the initial send and 3 retries all found no room.
 		 */
 		if (error != EAGAIN || attempt >= 4)
 			break;
 
 		/* no more room... wait a bit before retrying */
 		DELAY(100);
 	}
 }
 
 /*
  * Channel callback, registered through vmbus_chan_open_br() with the RX
  * ring as its argument.  Receive packets from the channel until no more
  * are available and dispatch each one by its channel packet type, then
  * call hv_rf_channel_rollup() for this RX ring and its TX ring.
  *
  * Reception starts with the ring's own buffer (rxr->hn_rdbuf) and an
  * assumed length of NETVSC_PACKET_SIZE.  When a packet does not fit
  * (ENOBUFS), a temporary buffer of the size reported in bytes_rxed is
  * malloc'ed instead.  "bufferlen > NETVSC_PACKET_SIZE" is what marks
  * 'buffer' as such a temporary, heap-allocated buffer.
  */
 static void
 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
 {
 	struct hn_rx_ring *rxr = xrxr;
 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
 	void *buffer;
 	int bufferlen = NETVSC_PACKET_SIZE;
 
 	buffer = rxr->hn_rdbuf;
 	do {
 		struct vmbus_chanpkt_hdr *pkt = buffer;
 		uint32_t bytes_rxed;
 		int ret;
 
 		/* In: size of the buffer; read back below on ENOBUFS. */
 		bytes_rxed = bufferlen;
 		ret = vmbus_chan_recv_pkt(chan, pkt, &bytes_rxed);
 		if (ret == 0) {
 			switch (pkt->cph_type) {
 			case VMBUS_CHANPKT_TYPE_COMP:
 				hn_nvs_handle_comp(sc, chan, pkt);
 				break;
 			case VMBUS_CHANPKT_TYPE_RXBUF:
 				hn_nvs_handle_rxbuf(sc, rxr, chan, pkt);
 				break;
 			case VMBUS_CHANPKT_TYPE_INBAND:
 				hn_nvs_handle_notify(sc, pkt);
 				break;
 			default:
 				if_printf(rxr->hn_ifp,
 				    "unknown chan pkt %u\n",
 				    pkt->cph_type);
 				break;
 			}
 		} else if (ret == ENOBUFS) {
 			/* Handle large packet */
 			/* Release the previous temporary buffer, if any. */
 			if (bufferlen > NETVSC_PACKET_SIZE) {
 				free(buffer, M_NETVSC);
 				buffer = NULL;
 			}
 
 			/* alloc new buffer */
 			buffer = malloc(bytes_rxed, M_NETVSC, M_NOWAIT);
 			if (buffer == NULL) {
 				if_printf(rxr->hn_ifp,
 				    "hv_cb malloc buffer failed, len=%u\n",
 				    bytes_rxed);
 				/* Nothing to free after the loop. */
 				bufferlen = 0;
 				break;
 			}
 			/* Retry the receive with the larger buffer. */
 			bufferlen = bytes_rxed;
 		} else {
 			/* No more packets */
 			break;
 		}
 	} while (1);
 
 	/* Free the temporary buffer, if one is still held. */
 	if (bufferlen > NETVSC_PACKET_SIZE)
 		free(buffer, M_NETVSC);
 
 	hv_rf_channel_rollup(rxr, rxr->hn_txr);
 }
 
 /*
  * Create the shared TX taskqueue and start its single thread.  Nothing is
  * done unless hn_share_tx_taskq is set.  When hn_bind_tx_taskq is
  * non-negative the thread is bound to that CPU, clamped to the last CPU
  * (mp_ncpus - 1).
  */
 static void
 hn_tx_taskq_create(void *arg __unused)
 {
 	cpuset_t cpu_set;
 	int cpu;
 
 	if (!hn_share_tx_taskq)
 		return;
 
 	hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
 	    taskqueue_thread_enqueue, &hn_tx_taskq);
 
 	if (hn_bind_tx_taskq < 0) {
 		/* No CPU binding requested. */
 		taskqueue_start_threads(&hn_tx_taskq, 1, PI_NET, "hn tx");
 		return;
 	}
 
 	cpu = (hn_bind_tx_taskq > mp_ncpus - 1) ?
 	    mp_ncpus - 1 : hn_bind_tx_taskq;
 	CPU_SETOF(cpu, &cpu_set);
 	taskqueue_start_threads_cpuset(&hn_tx_taskq, 1, PI_NET,
 	    &cpu_set, "hn tx");
 }
 SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_FIRST,
     hn_tx_taskq_create, NULL);
 
 /*
  * Free the shared TX taskqueue, if one was created.
  */
 static void
 hn_tx_taskq_destroy(void *arg __unused)
 {
 	if (hn_tx_taskq == NULL)
 		return;
 	taskqueue_free(hn_tx_taskq);
 }
 SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_FIRST,
     hn_tx_taskq_destroy, NULL);
 
 /*
  * Device method table: maps the generic device interface onto the netvsc
  * probe/attach/detach/shutdown handlers.
  */
 static device_method_t netvsc_methods[] = {
         /* Device interface */
         DEVMETHOD(device_probe,         netvsc_probe),
         DEVMETHOD(device_attach,        netvsc_attach),
         DEVMETHOD(device_detach,        netvsc_detach),
         DEVMETHOD(device_shutdown,      netvsc_shutdown),
 
         /* Table terminator */
         { 0, 0 }
 };
 
 /* Driver description: name, method table, and softc size. */
 static driver_t netvsc_driver = {
         NETVSC_DEVNAME,
         netvsc_methods,
         sizeof(struct hn_softc)
 };
 
 static devclass_t netvsc_devclass;
 
 /* Register the driver as "hn" on the vmbus bus; it depends on vmbus. */
 DRIVER_MODULE(hn, vmbus, netvsc_driver, netvsc_devclass, 0, 0);
 MODULE_VERSION(hn, 1);
 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
Index: user/alc/PQ_LAUNDRY/sys/i386/i386/trap.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/i386/i386/trap.c	(revision 307895)
+++ user/alc/PQ_LAUNDRY/sys/i386/i386/trap.c	(revision 307896)
@@ -1,1133 +1,1123 @@
 /*-
  * Copyright (C) 1994, David Greenman
  * Copyright (c) 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the University of Utah, and William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)trap.c	7.4 (Berkeley) 5/13/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * 386 Trap and System call handling
  */
 
 #include "opt_clock.h"
 #include "opt_cpu.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_isa.h"
 #include "opt_kdb.h"
 #include "opt_npx.h"
 #include "opt_stack.h"
 #include "opt_trap.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 #include <sys/pioctl.h>
 #include <sys/ptrace.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/syscall.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/uio.h>
 #include <sys/vmmeter.h>
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 PMC_SOFT_DEFINE( , , page_fault, all);
 PMC_SOFT_DEFINE( , , page_fault, read);
 PMC_SOFT_DEFINE( , , page_fault, write);
 #endif
 #include <security/audit/audit.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_extern.h>
 
 #include <machine/cpu.h>
 #include <machine/intr_machdep.h>
 #include <x86/mca.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #ifdef SMP
 #include <machine/smp.h>
 #endif
 #include <machine/stack.h>
 #include <machine/tss.h>
 #include <machine/vm86.h>
 
 #ifdef POWERFAIL_NMI
 #include <sys/syslog.h>
 #include <machine/clock.h>
 #endif
 
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
 #endif
 
 /* Trap and system call entry points; each takes the saved trap frame. */
 extern void trap(struct trapframe *frame);
 extern void syscall(struct trapframe *frame);
 
 /* File-local helpers used by trap(). */
 static int trap_pfault(struct trapframe *, int, vm_offset_t);
 static void trap_fatal(struct trapframe *, vm_offset_t);
 void dblfault_handler(void);
 
 extern inthand_t IDTVEC(lcall_syscall);
 
 /*
  * Human-readable trap descriptions, indexed by trap number (T_* value, as
  * noted next to each entry).  Unused trap numbers map to an empty string.
  * MAX_TRAP_MSG equals the last index in the table (32).
  */
 #define MAX_TRAP_MSG		32
 static char *trap_msg[] = {
 	"",					/*  0 unused */
 	"privileged instruction fault",		/*  1 T_PRIVINFLT */
 	"",					/*  2 unused */
 	"breakpoint instruction fault",		/*  3 T_BPTFLT */
 	"",					/*  4 unused */
 	"",					/*  5 unused */
 	"arithmetic trap",			/*  6 T_ARITHTRAP */
 	"",					/*  7 unused */
 	"",					/*  8 unused */
 	"general protection fault",		/*  9 T_PROTFLT */
 	"trace trap",				/* 10 T_TRCTRAP */
 	"",					/* 11 unused */
 	"page fault",				/* 12 T_PAGEFLT */
 	"",					/* 13 unused */
 	"alignment fault",			/* 14 T_ALIGNFLT */
 	"",					/* 15 unused */
 	"",					/* 16 unused */
 	"",					/* 17 unused */
 	"integer divide fault",			/* 18 T_DIVIDE */
 	"non-maskable interrupt trap",		/* 19 T_NMI */
 	"overflow trap",			/* 20 T_OFLOW */
 	"FPU bounds check fault",		/* 21 T_BOUND */
 	"FPU device not available",		/* 22 T_DNA */
 	"double fault",				/* 23 T_DOUBLEFLT */
 	"FPU operand fetch fault",		/* 24 T_FPOPFLT */
 	"invalid TSS fault",			/* 25 T_TSSFLT */
 	"segment not present fault",		/* 26 T_SEGNPFLT */
 	"stack fault",				/* 27 T_STKFLT */
 	"machine check trap",			/* 28 T_MCHK */
 	"SIMD floating-point exception",	/* 29 T_XMMFLT */
 	"reserved (unknown) fault",		/* 30 T_RESERVED */
 	"",					/* 31 unused (reserved) */
 	"DTrace pid return trap",               /* 32 T_DTRACE_RET */
 };
 
 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
 int has_f00f_bug = 0;		/* Initialized so that it can be patched. */
 #endif
 
-#ifdef KDB
-static int kdb_on_nmi = 1;
-SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RWTUN,
-	&kdb_on_nmi, 0, "Go to KDB on NMI");
-#endif
-static int panic_on_nmi = 1;
-SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RWTUN,
-	&panic_on_nmi, 0, "Panic on NMI");
 /*
  * machdep.prot_fault_translation: selects the signal that trap() delivers
  * for a user page fault that trap_pfault() did not report as SIGSEGV:
  *   0     - autodetect: SIGSEGV/SEGV_ACCERR for FreeBSD ABI processes with
  *           p_osrel >= P_OSREL_SIGSEGV, SIGBUS/BUS_PAGE_FAULT otherwise;
  *   1     - always SIGBUS/BUS_PAGE_FAULT (compat mode);
  *   other - always SIGSEGV/SEGV_ACCERR.
  */
 static int prot_fault_translation = 0;
 SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RW,
 	&prot_fault_translation, 0, "Select signal to deliver on protection fault");
 /* machdep.uprintf_signal: see the sysctl description string below. */
 static int uprintf_signal;
 SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RW,
     &uprintf_signal, 0,
     "Print debugging information on trap signal to ctty");
 
 /*
  * Exception, fault, and trap interface to the FreeBSD kernel.
  * This common code is called from assembly language IDT gate entry
  * routines that prepare a suitable stack frame, and restore this
  * frame after the exception has been processed.
  */
 
 void
 trap(struct trapframe *frame)
 {
 #ifdef KDTRACE_HOOKS
 	struct reg regs;
 #endif
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 #ifdef KDB
 	register_t dr6;
 #endif
 	int i = 0, ucode = 0;
 	u_int type;
 	register_t addr = 0;
 	vm_offset_t eva;
 	ksiginfo_t ksi;
 #ifdef POWERFAIL_NMI
 	static int lastalert = 0;
 #endif
 
 	PCPU_INC(cnt.v_trap);
 	type = frame->tf_trapno;
 
 #ifdef SMP
 	/* Handler for NMI IPIs used for stopping CPUs. */
 	if (type == T_NMI) {
 	         if (ipi_nmi_handler() == 0)
 	                   goto out;
 	}
 #endif /* SMP */
 
 #ifdef KDB
 	if (kdb_active) {
 		kdb_reenter();
 		goto out;
 	}
 #endif
 
 	if (type == T_RESERVED) {
 		trap_fatal(frame, 0);
 		goto out;
 	}
 
 	if (type == T_NMI) {
 #ifdef HWPMC_HOOKS
 		/*
 		 * CPU PMCs interrupt using an NMI so we check for that first.
 		 * If the HWPMC module is active, 'pmc_hook' will point to
 		 * the function to be called.  A non-zero return value from the
 		 * hook means that the NMI was consumed by it and that we can
 		 * return immediately.
 		 */
 		if (pmc_intr != NULL &&
 		    (*pmc_intr)(PCPU_GET(cpuid), frame) != 0)
 			goto out;
 #endif
 
 #ifdef STACK
 		if (stack_nmi_handler(frame) != 0)
 			goto out;
 #endif
 	}
 
 	if (type == T_MCHK) {
 		mca_intr();
 		goto out;
 	}
 
 #ifdef KDTRACE_HOOKS
 	/*
 	 * A trap can occur while DTrace executes a probe. Before
 	 * executing the probe, DTrace blocks re-scheduling and sets
 	 * a flag in its per-cpu flags to indicate that it doesn't
 	 * want to fault. On returning from the probe, the no-fault
 	 * flag is cleared and finally re-scheduling is enabled.
 	 */
 	if ((type == T_PROTFLT || type == T_PAGEFLT) &&
 	    dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, type))
 		goto out;
 #endif
 
 	if ((frame->tf_eflags & PSL_I) == 0) {
 		/*
 		 * Buggy application or kernel code has disabled
 		 * interrupts and then trapped.  Enabling interrupts
 		 * now is wrong, but it is better than running with
 		 * interrupts disabled until they are accidentally
 		 * enabled later.
 		 */
 		if (TRAPF_USERMODE(frame) &&
 		    (curpcb->pcb_flags & PCB_VM86CALL) == 0)
 			uprintf(
 			    "pid %ld (%s): trap %d with interrupts disabled\n",
 			    (long)curproc->p_pid, curthread->td_name, type);
 		else if (type != T_NMI && type != T_BPTFLT &&
 		    type != T_TRCTRAP &&
 		    frame->tf_eip != (int)cpu_switch_load_gs) {
 			/*
 			 * XXX not quite right, since this may be for a
 			 * multiple fault in user mode.
 			 */
 			printf("kernel trap %d with interrupts disabled\n",
 			    type);
 			/*
 			 * Page faults need interrupts disabled until later,
 			 * and we shouldn't enable interrupts while holding
 			 * a spin lock.
 			 */
 			if (type != T_PAGEFLT &&
 			    td->td_md.md_spinlock_count == 0)
 				enable_intr();
 		}
 	}
 	eva = 0;
 	if (type == T_PAGEFLT) {
 		/*
 		 * For some Cyrix CPUs, %cr2 is clobbered by
 		 * interrupts.  This problem is worked around by using
 		 * an interrupt gate for the pagefault handler.  We
 		 * are finally ready to read %cr2 and conditionally
 		 * reenable interrupts.  If we hold a spin lock, then
 		 * we must not reenable interrupts.  This might be a
 		 * spurious page fault.
 		 */
 		eva = rcr2();
 		if (td->td_md.md_spinlock_count == 0)
 			enable_intr();
 	}
 
         if (TRAPF_USERMODE(frame) && (curpcb->pcb_flags & PCB_VM86CALL) == 0) {
 		/* user trap */
 
 		td->td_pticks = 0;
 		td->td_frame = frame;
 		addr = frame->tf_eip;
 		if (td->td_cowgen != p->p_cowgen)
 			thread_cow_update(td);
 
 		switch (type) {
 		case T_PRIVINFLT:	/* privileged instruction fault */
 			i = SIGILL;
 			ucode = ILL_PRVOPC;
 			break;
 
 		case T_BPTFLT:		/* bpt instruction fault */
 		case T_TRCTRAP:		/* trace trap */
 			enable_intr();
 #ifdef KDTRACE_HOOKS
 			if (type == T_BPTFLT) {
 				fill_frame_regs(frame, &regs);
 				if (dtrace_pid_probe_ptr != NULL &&
 				    dtrace_pid_probe_ptr(&regs) == 0)
 					goto out;
 			}
 #endif
 user_trctrap_out:
 			frame->tf_eflags &= ~PSL_T;
 			i = SIGTRAP;
 			ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT);
 			break;
 
 		case T_ARITHTRAP:	/* arithmetic trap */
 #ifdef DEV_NPX
 			ucode = npxtrap_x87();
 			if (ucode == -1)
 				goto userout;
 #else
 			ucode = 0;
 #endif
 			i = SIGFPE;
 			break;
 
 			/*
 			 * The following two traps can happen in
 			 * vm86 mode, and, if so, we want to handle
 			 * them specially.
 			 */
 		case T_PROTFLT:		/* general protection fault */
 		case T_STKFLT:		/* stack fault */
 			if (frame->tf_eflags & PSL_VM) {
 				i = vm86_emulate((struct vm86frame *)frame);
 				if (i == SIGTRAP) {
 					type = T_TRCTRAP;
 					load_dr6(rdr6() | 0x4000);
 					goto user_trctrap_out;
 				}
 				if (i == 0)
 					goto user;
 				break;
 			}
 			i = SIGBUS;
 			ucode = (type == T_PROTFLT) ? BUS_OBJERR : BUS_ADRERR;
 			break;
 		case T_SEGNPFLT:	/* segment not present fault */
 			i = SIGBUS;
 			ucode = BUS_ADRERR;
 			break;
 		case T_TSSFLT:		/* invalid TSS fault */
 			i = SIGBUS;
 			ucode = BUS_OBJERR;
 			break;
 		case T_ALIGNFLT:
 			i = SIGBUS;
 			ucode = BUS_ADRALN;
 			break;
 		case T_DOUBLEFLT:	/* double fault */
 		default:
 			i = SIGBUS;
 			ucode = BUS_OBJERR;
 			break;
 
 		case T_PAGEFLT:		/* page fault */
 
 			i = trap_pfault(frame, TRUE, eva);
 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
 			if (i == -2) {
 				/*
 				 * The f00f hack workaround has triggered, so
 				 * treat the fault as an illegal instruction 
 				 * (T_PRIVINFLT) instead of a page fault.
 				 */
 				type = frame->tf_trapno = T_PRIVINFLT;
 
 				/* Proceed as in that case. */
 				ucode = ILL_PRVOPC;
 				i = SIGILL;
 				break;
 			}
 #endif
 			if (i == -1)
 				goto userout;
 			if (i == 0)
 				goto user;
 
 			if (i == SIGSEGV)
 				ucode = SEGV_MAPERR;
 			else {
 				if (prot_fault_translation == 0) {
 					/*
 					 * Autodetect.
 					 * This check also covers the images
 					 * without the ABI-tag ELF note.
 					 */
 					if (SV_CURPROC_ABI() == SV_ABI_FREEBSD
 					    && p->p_osrel >= P_OSREL_SIGSEGV) {
 						i = SIGSEGV;
 						ucode = SEGV_ACCERR;
 					} else {
 						i = SIGBUS;
 						ucode = BUS_PAGE_FAULT;
 					}
 				} else if (prot_fault_translation == 1) {
 					/*
 					 * Always compat mode.
 					 */
 					i = SIGBUS;
 					ucode = BUS_PAGE_FAULT;
 				} else {
 					/*
 					 * Always SIGSEGV mode.
 					 */
 					i = SIGSEGV;
 					ucode = SEGV_ACCERR;
 				}
 			}
 			addr = eva;
 			break;
 
 		case T_DIVIDE:		/* integer divide fault */
 			ucode = FPE_INTDIV;
 			i = SIGFPE;
 			break;
 
 #ifdef DEV_ISA
 		case T_NMI:
 #ifdef POWERFAIL_NMI
 #ifndef TIMER_FREQ
 #  define TIMER_FREQ 1193182
 #endif
 			if (time_second - lastalert > 10) {
 				log(LOG_WARNING, "NMI: power fail\n");
 				sysbeep(880, hz);
 				lastalert = time_second;
 			}
 			goto userout;
 #else /* !POWERFAIL_NMI */
-			nmi_handle_intr(type, frame, true);
+			nmi_handle_intr(type, frame);
 			break;
 #endif /* POWERFAIL_NMI */
 #endif /* DEV_ISA */
 
 		case T_OFLOW:		/* integer overflow fault */
 			ucode = FPE_INTOVF;
 			i = SIGFPE;
 			break;
 
 		case T_BOUND:		/* bounds check fault */
 			ucode = FPE_FLTSUB;
 			i = SIGFPE;
 			break;
 
 		case T_DNA:
 #ifdef DEV_NPX
 			KASSERT(PCB_USER_FPU(td->td_pcb),
 			    ("kernel FPU ctx has leaked"));
 			/* transparent fault (due to context switch "late") */
 			if (npxdna())
 				goto userout;
 #endif
 			uprintf("pid %d killed due to lack of floating point\n",
 				p->p_pid);
 			i = SIGKILL;
 			ucode = 0;
 			break;
 
 		case T_FPOPFLT:		/* FPU operand fetch fault */
 			ucode = ILL_COPROC;
 			i = SIGILL;
 			break;
 
 		case T_XMMFLT:		/* SIMD floating-point exception */
 #if defined(DEV_NPX) && !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
 			ucode = npxtrap_sse();
 			if (ucode == -1)
 				goto userout;
 #else
 			ucode = 0;
 #endif
 			i = SIGFPE;
 			break;
 #ifdef KDTRACE_HOOKS
 		case T_DTRACE_RET:
 			enable_intr();
 			fill_frame_regs(frame, &regs);
 			if (dtrace_return_probe_ptr != NULL &&
 			    dtrace_return_probe_ptr(&regs) == 0)
 				goto out;
 			break;
 #endif
 		}
 	} else {
 		/* kernel trap */
 
 		KASSERT(cold || td->td_ucred != NULL,
 		    ("kernel trap doesn't have ucred"));
 		switch (type) {
 		case T_PAGEFLT:			/* page fault */
 			(void) trap_pfault(frame, FALSE, eva);
 			goto out;
 
 		case T_DNA:
 #ifdef DEV_NPX
 			if (PCB_USER_FPU(td->td_pcb))
 				panic("Unregistered use of FPU in kernel");
 			if (npxdna())
 				goto out;
 #endif
 			break;
 
 		case T_ARITHTRAP:	/* arithmetic trap */
 		case T_XMMFLT:		/* SIMD floating-point exception */
 		case T_FPOPFLT:		/* FPU operand fetch fault */
 			/*
 			 * XXXKIB for now disable any FPU traps in kernel
 			 * handler registration seems to be overkill
 			 */
 			trap_fatal(frame, 0);
 			goto out;
 
 			/*
 			 * The following two traps can happen in
 			 * vm86 mode, and, if so, we want to handle
 			 * them specially.
 			 */
 		case T_PROTFLT:		/* general protection fault */
 		case T_STKFLT:		/* stack fault */
 			if (frame->tf_eflags & PSL_VM) {
 				i = vm86_emulate((struct vm86frame *)frame);
 				if (i == SIGTRAP) {
 					type = T_TRCTRAP;
 					load_dr6(rdr6() | 0x4000);
 					goto kernel_trctrap;
 				}
 				if (i != 0)
 					/*
 					 * returns to original process
 					 */
 					vm86_trap((struct vm86frame *)frame);
 				goto out;
 			}
 			if (type == T_STKFLT)
 				break;
 
 			/* FALL THROUGH */
 
 		case T_SEGNPFLT:	/* segment not present fault */
 			if (curpcb->pcb_flags & PCB_VM86CALL)
 				break;
 
 			/*
 			 * Invalid %fs's and %gs's can be created using
 			 * procfs or PT_SETREGS or by invalidating the
 			 * underlying LDT entry.  This causes a fault
 			 * in kernel mode when the kernel attempts to
 			 * switch contexts.  Lose the bad context
 			 * (XXX) so that we can continue, and generate
 			 * a signal.
 			 */
 			if (frame->tf_eip == (int)cpu_switch_load_gs) {
 				curpcb->pcb_gs = 0;
 #if 0				
 				PROC_LOCK(p);
 				kern_psignal(p, SIGBUS);
 				PROC_UNLOCK(p);
 #endif				
 				goto out;
 			}
 
 			if (td->td_intr_nesting_level != 0)
 				break;
 
 			/*
 			 * Invalid segment selectors and out of bounds
 			 * %eip's and %esp's can be set up in user mode.
 			 * This causes a fault in kernel mode when the
 			 * kernel tries to return to user mode.  We want
 			 * to get this fault so that we can fix the
 			 * problem here and not have to check all the
 			 * selectors and pointers when the user changes
 			 * them.
 			 */
 			if (frame->tf_eip == (int)doreti_iret) {
 				frame->tf_eip = (int)doreti_iret_fault;
 				goto out;
 			}
 			if (frame->tf_eip == (int)doreti_popl_ds) {
 				frame->tf_eip = (int)doreti_popl_ds_fault;
 				goto out;
 			}
 			if (frame->tf_eip == (int)doreti_popl_es) {
 				frame->tf_eip = (int)doreti_popl_es_fault;
 				goto out;
 			}
 			if (frame->tf_eip == (int)doreti_popl_fs) {
 				frame->tf_eip = (int)doreti_popl_fs_fault;
 				goto out;
 			}
 			if (curpcb->pcb_onfault != NULL) {
 				frame->tf_eip =
 				    (int)curpcb->pcb_onfault;
 				goto out;
 			}
 			break;
 
 		case T_TSSFLT:
 			/*
 			 * PSL_NT can be set in user mode and isn't cleared
 			 * automatically when the kernel is entered.  This
 			 * causes a TSS fault when the kernel attempts to
 			 * `iret' because the TSS link is uninitialized.  We
 			 * want to get this fault so that we can fix the
 			 * problem here and not every time the kernel is
 			 * entered.
 			 */
 			if (frame->tf_eflags & PSL_NT) {
 				frame->tf_eflags &= ~PSL_NT;
 				goto out;
 			}
 			break;
 
 		case T_TRCTRAP:	 /* trace trap */
 kernel_trctrap:
 			if (frame->tf_eip == (int)IDTVEC(lcall_syscall)) {
 				/*
 				 * We've just entered system mode via the
 				 * syscall lcall.  Continue single stepping
 				 * silently until the syscall handler has
 				 * saved the flags.
 				 */
 				goto out;
 			}
 			if (frame->tf_eip == (int)IDTVEC(lcall_syscall) + 1) {
 				/*
 				 * The syscall handler has now saved the
 				 * flags.  Stop single stepping it.
 				 */
 				frame->tf_eflags &= ~PSL_T;
 				goto out;
 			}
 			/*
 			 * Ignore debug register trace traps due to
 			 * accesses in the user's address space, which
 			 * can happen under several conditions such as
 			 * if a user sets a watchpoint on a buffer and
 			 * then passes that buffer to a system call.
 			 * We still want to get TRCTRAPS for addresses
 			 * in kernel space because that is useful when
 			 * debugging the kernel.
 			 */
 			if (user_dbreg_trap() && 
 			   !(curpcb->pcb_flags & PCB_VM86CALL)) {
 				/*
 				 * Reset breakpoint bits because the
 				 * processor doesn't
 				 */
 				load_dr6(rdr6() & ~0xf);
 				goto out;
 			}
 			/*
 			 * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
 			 */
 		case T_BPTFLT:
 			/*
 			 * If KDB is enabled, let it handle the debugger trap.
 			 * Otherwise, debugger traps "can't happen".
 			 */
 #ifdef KDB
 			/* XXX %dr6 is not quite reentrant. */
 			dr6 = rdr6();
 			load_dr6(dr6 & ~0x4000);
 			if (kdb_trap(type, dr6, frame))
 				goto out;
 #endif
 			break;
 
 #ifdef DEV_ISA
 		case T_NMI:
 #ifdef POWERFAIL_NMI
 			if (time_second - lastalert > 10) {
 				log(LOG_WARNING, "NMI: power fail\n");
 				sysbeep(880, hz);
 				lastalert = time_second;
 			}
 			goto out;
 #else /* !POWERFAIL_NMI */
-			if (nmi_handle_intr(type, frame, false) ||
-			    !panic_on_nmi)
-				goto out;
-			/* FALLTHROUGH */
+			nmi_handle_intr(type, frame);
+			goto out;
 #endif /* POWERFAIL_NMI */
 #endif /* DEV_ISA */
 		}
 
 		trap_fatal(frame, eva);
 		goto out;
 	}
 
 	/* Translate fault for emulators (e.g. Linux) */
 	if (*p->p_sysent->sv_transtrap)
 		i = (*p->p_sysent->sv_transtrap)(i, type);
 
 	ksiginfo_init_trap(&ksi);
 	ksi.ksi_signo = i;
 	ksi.ksi_code = ucode;
 	ksi.ksi_addr = (void *)addr;
 	ksi.ksi_trapno = type;
 	if (uprintf_signal) {
 		uprintf("pid %d comm %s: signal %d err %x code %d type %d "
 		    "addr 0x%x esp 0x%08x eip 0x%08x "
 		    "<%02x %02x %02x %02x %02x %02x %02x %02x>\n",
 		    p->p_pid, p->p_comm, i, frame->tf_err, ucode, type, addr,
 		    frame->tf_esp, frame->tf_eip,
 		    fubyte((void *)(frame->tf_eip + 0)),
 		    fubyte((void *)(frame->tf_eip + 1)),
 		    fubyte((void *)(frame->tf_eip + 2)),
 		    fubyte((void *)(frame->tf_eip + 3)),
 		    fubyte((void *)(frame->tf_eip + 4)),
 		    fubyte((void *)(frame->tf_eip + 5)),
 		    fubyte((void *)(frame->tf_eip + 6)),
 		    fubyte((void *)(frame->tf_eip + 7)));
 	}
 	KASSERT((read_eflags() & PSL_I) != 0, ("interrupts disabled"));
 	trapsignal(td, &ksi);
 
 #ifdef DEBUG
 	if (type <= MAX_TRAP_MSG) {
 		uprintf("fatal process exception: %s",
 			trap_msg[type]);
 		if ((type == T_PAGEFLT) || (type == T_PROTFLT))
 			uprintf(", fault VA = 0x%lx", (u_long)eva);
 		uprintf("\n");
 	}
 #endif
 
 user:
 	userret(td, frame);
 	KASSERT(PCB_USER_FPU(td->td_pcb),
 	    ("Return from trap with kernel FPU ctx leaked"));
 userout:
 out:
 	return;
 }
 
 /*
  * Handle a page fault at address eva.  The kernel-mode caller passes FALSE
  * for usermode.
  *
  * Return values, as produced by the code below:
  *	 0	the fault was resolved by vm_fault(), was treated as spurious
  *		inside a "no faulting" section, or execution was redirected
  *		to the pcb_onfault handler;
  *	-1	trap_fatal() was called for an unrecoverable kernel fault;
  *	-2	the faulting address is the IDT entry used by the Pentium
  *		F00F workaround (only with I586_CPU and without NO_F00F_HACK);
  *	SIGBUS or SIGSEGV
  *		a user-mode fault could not be resolved: SIGBUS when rv is
  *		KERN_PROTECTION_FAILURE, SIGSEGV otherwise (including the
  *		case where vm_fault() was never called and rv is still 0).
  */
 static int
 trap_pfault(frame, usermode, eva)
 	struct trapframe *frame;
 	int usermode;
 	vm_offset_t eva;
 {
 	vm_offset_t va;
 	vm_map_t map;
 	int rv = 0;
 	vm_prot_t ftype;
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 
 	if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) {
 		/*
 		 * Due to both processor errata and lazy TLB invalidation when
 		 * access restrictions are removed from virtual pages, memory
 		 * accesses that are allowed by the physical mapping layer may
 		 * nonetheless cause one spurious page fault per virtual page.
 		 * When the thread is executing a "no faulting" section that
 		 * is bracketed by vm_fault_{disable,enable}_pagefaults(),
 		 * every page fault is treated as a spurious page fault,
 		 * unless it accesses the same virtual address as the most
 		 * recent page fault within the same "no faulting" section.
 		 */
 		if (td->td_md.md_spurflt_addr != eva ||
 		    (td->td_pflags & TDP_RESETSPUR) != 0) {
 			/*
 			 * Do nothing to the TLB.  A stale TLB entry is
 			 * flushed automatically by a page fault.
 			 */
 			td->td_md.md_spurflt_addr = eva;
 			td->td_pflags &= ~TDP_RESETSPUR;
 			return (0);
 		}
 	} else {
 		/*
 		 * If we get a page fault while in a critical section, then
 		 * it is most likely a fatal kernel page fault.  The kernel
 		 * is already going to panic trying to get a sleep lock to
 		 * do the VM lookup, so just consider it a fatal trap so the
 		 * kernel can print out a useful trap message and even get
 		 * to the debugger.
 		 *
 		 * If we get a page fault while holding a non-sleepable
 		 * lock, then it is most likely a fatal kernel page fault.
 		 * If WITNESS is enabled, then it's going to whine about
 		 * bogus LORs with various VM locks, so just skip to the
 		 * fatal trap handling directly.
 		 */
 		if (td->td_critnest != 0 ||
 		    WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
 		    "Kernel page fault") != 0) {
 			trap_fatal(frame, eva);
 			return (-1);
 		}
 	}
 	va = trunc_page(eva);
 	if (va >= KERNBASE) {
 		/*
 		 * Don't allow user-mode faults in kernel address space.
 		 * An exception:  if the faulting address is the invalid
 		 * instruction entry in the IDT, then the Intel Pentium
 		 * F00F bug workaround was triggered, and we need to
 		 * treat it as an illegal instruction, and not a page
 		 * fault.
 		 */
 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
 		if ((eva == (unsigned int)&idt[6]) && has_f00f_bug)
 			return (-2);
 #endif
 		if (usermode)
 			goto nogo;
 
 		map = kernel_map;
 	} else {
 		map = &p->p_vmspace->vm_map;
 
 		/*
 		 * When accessing a user-space address, kernel must be
 		 * ready to accept the page fault, and provide a
 		 * handling routine.  Since accessing the address
 		 * without the handler is a bug, do not try to handle
 		 * it normally, and panic immediately.
 		 */
 		if (!usermode && (td->td_intr_nesting_level != 0 ||
 		    curpcb->pcb_onfault == NULL)) {
 			trap_fatal(frame, eva);
 			return (-1);
 		}
 	}
 
 	/*
 	 * Derive the access type from the hardware error code: write, then
 	 * instruction fetch, otherwise read.
 	 *
 	 * PGEX_I is defined only if the execute disable bit capability is
 	 * supported and enabled.
 	 */
 	if (frame->tf_err & PGEX_W)
 		ftype = VM_PROT_WRITE;
 #if defined(PAE) || defined(PAE_TABLES)
 	else if ((frame->tf_err & PGEX_I) && pg_nx != 0)
 		ftype = VM_PROT_EXECUTE;
 #endif
 	else
 		ftype = VM_PROT_READ;
 
 	/* Fault in the page. */
 	rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
 	if (rv == KERN_SUCCESS) {
 #ifdef HWPMC_HOOKS
 		/* Report resolved read and write faults to the PMC soft events. */
 		if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
 			PMC_SOFT_CALL_TF( , , page_fault, all, frame);
 			if (ftype == VM_PROT_READ)
 				PMC_SOFT_CALL_TF( , , page_fault, read,
 				    frame);
 			else
 				PMC_SOFT_CALL_TF( , , page_fault, write,
 				    frame);
 		}
 #endif
 		return (0);
 	}
 nogo:
 	if (!usermode) {
 		/*
 		 * Unresolved kernel-mode fault: resume at the registered
 		 * pcb_onfault handler when one exists and we are not inside
 		 * a nested interrupt; otherwise the fault is fatal.
 		 */
 		if (td->td_intr_nesting_level == 0 &&
 		    curpcb->pcb_onfault != NULL) {
 			frame->tf_eip = (int)curpcb->pcb_onfault;
 			return (0);
 		}
 		trap_fatal(frame, eva);
 		return (-1);
 	}
 	/* Unresolved user-mode fault: hand a signal number back to the caller. */
 	return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
 }
 
 /*
  * Report a fatal trap.  Prints the trap number and name, the mode the trap
  * was taken in, the fault address and decoded error code (page faults only),
  * the instruction, stack and frame pointers, the code segment descriptor and
  * the interesting eflags bits, then panics.
  *
  * With KDB configured and either debugger_on_panic or kdb_active set, the
  * debugger is entered first; if kdb_trap() returns non-zero this function
  * returns to its caller instead of panicking.
  *
  * frame: trap frame of the faulting context.
  * eva:   faulting virtual address; printed only for T_PAGEFLT.
  */
 static void
 trap_fatal(frame, eva)
 	struct trapframe *frame;
 	vm_offset_t eva;
 {
 	int code, ss, esp;
 	u_int type;
 	struct soft_segment_descriptor softseg;
 	char *msg;
 
 	code = frame->tf_err;
 	type = frame->tf_trapno;
 	/* Decode the GDT descriptor selected by the faulting %cs. */
 	sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg);
 
 	if (type <= MAX_TRAP_MSG)
 		msg = trap_msg[type];
 	else
 		msg = "UNKNOWN";
 	printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg,
 	    frame->tf_eflags & PSL_VM ? "vm86" :
 	    ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
 #ifdef SMP
 	/* two separate prints in case of a trap on an unmapped page */
 	printf("cpuid = %d; ", PCPU_GET(cpuid));
 	printf("apic id = %02x\n", PCPU_GET(apic_id));
 #endif
 	if (type == T_PAGEFLT) {
 		printf("fault virtual address	= 0x%x\n", eva);
 		printf("fault code		= %s %s, %s\n",
 			code & PGEX_U ? "user" : "supervisor",
 			code & PGEX_W ? "write" : "read",
 			code & PGEX_P ? "protection violation" : "page not present");
 	}
 	printf("instruction pointer	= 0x%x:0x%x\n",
 	       frame->tf_cs & 0xffff, frame->tf_eip);
 	/*
 	 * Use the saved %ss:%esp when the frame has them; otherwise report
 	 * the kernel data selector and the address of the tf_esp slot.
 	 */
         if (TF_HAS_STACKREGS(frame)) {
 		ss = frame->tf_ss & 0xffff;
 		esp = frame->tf_esp;
 	} else {
 		ss = GSEL(GDATA_SEL, SEL_KPL);
 		esp = (int)&frame->tf_esp;
 	}
 	printf("stack pointer	        = 0x%x:0x%x\n", ss, esp);
 	printf("frame pointer	        = 0x%x:0x%x\n", ss, frame->tf_ebp);
 	printf("code segment		= base 0x%x, limit 0x%x, type 0x%x\n",
 	       softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
 	printf("			= DPL %d, pres %d, def32 %d, gran %d\n",
 	       softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32,
 	       softseg.ssd_gran);
 	printf("processor eflags	= ");
 	if (frame->tf_eflags & PSL_T)
 		printf("trace trap, ");
 	if (frame->tf_eflags & PSL_I)
 		printf("interrupt enabled, ");
 	if (frame->tf_eflags & PSL_NT)
 		printf("nested task, ");
 	if (frame->tf_eflags & PSL_RF)
 		printf("resume, ");
 	if (frame->tf_eflags & PSL_VM)
 		printf("vm86, ");
 	printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12);
 	printf("current process		= %d (%s)\n",
 	    curproc->p_pid, curthread->td_name);
 
 #ifdef KDB
 	if (debugger_on_panic || kdb_active) {
 		frame->tf_err = eva;	/* smuggle fault address to ddb */
 		if (kdb_trap(type, 0, frame)) {
 			frame->tf_err = code;	/* restore error code */
 			return;
 		}
 		frame->tf_err = code;		/* restore error code */
 	}
 #endif
 	printf("trap number		= %d\n", type);
 	if (type <= MAX_TRAP_MSG)
 		panic("%s", trap_msg[type]);
 	else
 		panic("unknown/reserved trap");
 }
 
 /*
  * Double fault handler. Called when a fault occurs while writing
  * a frame for a trap/exception onto the stack. This usually occurs
  * when the stack overflows (such is the case with infinite recursion,
  * for example).
  *
  * XXX Note that the current PTD gets replaced by IdlePTD when the
  * task switch occurs. This means that the stack that was active at
  * the time of the double fault is not available at <kstack> unless
  * the machine was idle when the double fault occurred. The downside
  * of this is that "trace <ebp>" in ddb won't work.
  */
 void
 dblfault_handler()
 {
 #ifdef KDTRACE_HOOKS
 	/* Invoke the DTrace double-trap hook first, if one is registered. */
 	if (dtrace_doubletrap_func != NULL)
 		(*dtrace_doubletrap_func)();
 #endif
 	/* Report the register values recorded in the per-CPU common TSS. */
 	printf("\nFatal double fault:\n");
 	printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip));
 	printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp));
 	printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp));
 #ifdef SMP
 	/* two separate prints in case of a trap on an unmapped page */
 	printf("cpuid = %d; ", PCPU_GET(cpuid));
 	printf("apic id = %02x\n", PCPU_GET(apic_id));
 #endif
 	/* A double fault is never recoverable here. */
 	panic("double fault");
 }
 
 /*
  * Decode the system call number and copy in its arguments for thread td.
  *
  * The number is taken from %eax and the arguments from the user stack just
  * above the word at %esp.  For the indirect entry points SYS_syscall and
  * SYS___syscall the real number is the first user argument; it occupies an
  * int slot for the former and a quad slot for the latter.
  *
  * On success sa->code, sa->callp, sa->narg and sa->args are filled in, the
  * thread's return values are preset (0 and the saved %edx) and 0 is
  * returned.  Returns EFAULT if the indirect number cannot be fetched, or
  * the copyin() error if the arguments cannot be copied.
  */
 int
 cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
 {
 	struct trapframe *frame;
 	struct proc *p;
 	caddr_t params;
 	size_t skip;
 	long tmp;
 	int error;
 
 	frame = td->td_frame;
 	p = td->td_proc;
 
 	sa->code = frame->tf_eax;
 	params = (caddr_t)frame->tf_esp + sizeof(int);
 
 	if (sa->code == SYS_syscall || sa->code == SYS___syscall) {
 		/*
 		 * Indirect call: the code is the first argument, followed
 		 * by the actual arguments.  __syscall passes the code as a
 		 * quad so that the remaining arguments stay quad aligned.
 		 */
 		skip = (sa->code == SYS_syscall) ? sizeof(int) :
 		    sizeof(quad_t);
 		if (fueword(params, &tmp) == -1)
 			return (EFAULT);
 		sa->code = tmp;
 		params += skip;
 	}
 
 	if (p->p_sysent->sv_mask != 0)
 		sa->code &= p->p_sysent->sv_mask;
 
 	/* Out-of-range codes are routed to entry 0 of the table. */
 	sa->callp = &p->p_sysent->sv_table[
 	    sa->code >= p->p_sysent->sv_size ? 0 : sa->code];
 	sa->narg = sa->callp->sy_narg;
 
 	error = 0;
 	if (params != NULL && sa->narg != 0)
 		error = copyin(params, (caddr_t)sa->args,
 		    (u_int)(sa->narg * sizeof(int)));
 	if (error != 0)
 		return (error);
 
 	td->td_retval[0] = 0;
 	td->td_retval[1] = frame->tf_edx;
 	return (0);
 }
 
 #include "../../kern/subr_syscall.c"
 
 /*
  * syscall - C-level handler for a system call request.  The trap frame
  * layout is reused, so a system call is handled much like a trap.
  *
  * The eflags value is sampled before the call is dispatched so that a
  * single-stepped (PSL_T, non-vm86) system call can be reported with
  * SIGTRAP/TRAP_TRACE afterwards.
  */
 void
 syscall(struct trapframe *frame)
 {
 	struct syscall_args sa;
 	struct thread *td;
 	ksiginfo_t ksi;
 	register_t orig_tf_eflags;
 	int error;
 
 #ifdef DIAGNOSTIC
 	if (!(TRAPF_USERMODE(frame)) ||
 	    (curpcb->pcb_flags & PCB_VM86CALL) != 0) {
 		panic("syscall");
 		/* NOT REACHED */
 	}
 #endif
 	orig_tf_eflags = frame->tf_eflags;
 
 	td = curthread;
 	td->td_frame = frame;
 
 	error = syscallenter(td, &sa);
 
 	/*
 	 * Traced syscall: stop single stepping and post the trace trap.
 	 */
 	if ((orig_tf_eflags & PSL_T) != 0 && (orig_tf_eflags & PSL_VM) == 0) {
 		frame->tf_eflags &= ~PSL_T;
 		ksiginfo_init_trap(&ksi);
 		ksi.ksi_signo = SIGTRAP;
 		ksi.ksi_code = TRAP_TRACE;
 		ksi.ksi_addr = (void *)frame->tf_eip;
 		trapsignal(td, &ksi);
 	}
 
 	/* The handler must leave the user FPU context in place. */
 	KASSERT(PCB_USER_FPU(td->td_pcb),
 	    ("System call %s returning with kernel FPU ctx leaked",
 	     syscallname(td->td_proc, sa.code)));
 	KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td),
 	    ("System call %s returning with mangled pcb_save",
 	     syscallname(td->td_proc, sa.code)));
 
 	syscallret(td, error, &sa);
 }
Index: user/alc/PQ_LAUNDRY/sys/kern/subr_prf.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/kern/subr_prf.c	(revision 307895)
+++ user/alc/PQ_LAUNDRY/sys/kern/subr_prf.c	(revision 307896)
@@ -1,1219 +1,1223 @@
 /*-
  * Copyright (c) 1986, 1988, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)subr_prf.c	8.3 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #ifdef _KERNEL
 #include "opt_ddb.h"
 #include "opt_printf.h"
 #endif  /* _KERNEL */
 
 #include <sys/param.h>
 #ifdef _KERNEL
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/kdb.h>
 #include <sys/mutex.h>
 #include <sys/sx.h>
 #include <sys/kernel.h>
 #include <sys/msgbuf.h>
 #include <sys/malloc.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/stddef.h>
 #include <sys/sysctl.h>
 #include <sys/tty.h>
 #include <sys/syslog.h>
 #include <sys/cons.h>
 #include <sys/uio.h>
 #endif
 #include <sys/ctype.h>
 #include <sys/sbuf.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 /*
  * Note that stdarg.h and the ANSI style va_start macro are used for both
  * ANSI and traditional C compilers.
  */
+#ifdef _KERNEL
 #include <machine/stdarg.h>
+#else
+#include <stdarg.h>
+#endif
 
 #ifdef _KERNEL
 
 #define TOCONS	0x01
 #define TOTTY	0x02
 #define TOLOG	0x04
 
 /* Max number conversion buffer length: a u_quad_t in base 2, plus NUL byte. */
 #define MAXNBUF	(sizeof(intmax_t) * NBBY + 1)
 
 /*
  * Per-call output state handed to putchar() through kvprintf()'s opaque
  * argument.
  */
 struct putchar_arg {
 	int	flags;		/* destination bits: TOCONS, TOTTY, TOLOG */
 	int	pri;		/* priority passed on to the message log */
 	struct	tty *tty;	/* terminal written to when TOTTY is set */
 	char	*p_bufr;	/* staging buffer, or NULL for unbuffered */
 	size_t	n_bufr;		/* size of p_bufr; reloads remain on flush */
 	char	*p_next;	/* next free position in p_bufr */
 	size_t	remain;		/* space left in p_bufr */
 };
 
 /* Output cursor advanced by snprintf_func(). */
 struct snprintf_arg {
 	char	*str;		/* where the next character is stored */
 	size_t	remain;		/* bytes left, counting the terminating NUL */
 };
 
 extern	int log_open;
 
 static void  msglogchar(int c, int pri);
 static void  msglogstr(char *str, int pri, int filter_cr);
 static void  putchar(int ch, void *arg);
 static char *ksprintn(char *nbuf, uintmax_t num, int base, int *len, int upper);
 static void  snprintf_func(int ch, void *arg);
 
 static int msgbufmapped;		/* Set when safe to use msgbuf */
 int msgbuftrigger;
 
 static int log_console_output = 1;
 SYSCTL_INT(_kern, OID_AUTO, log_console_output, CTLFLAG_RWTUN,
     &log_console_output, 0, "Duplicate console output to the syslog");
 
 /*
  * See the comment in log_console() below for more explanation of this.
  */
 static int log_console_add_linefeed;
 SYSCTL_INT(_kern, OID_AUTO, log_console_add_linefeed, CTLFLAG_RWTUN,
     &log_console_add_linefeed, 0, "log_console() adds extra newlines");
 
 static int always_console_output;
 SYSCTL_INT(_kern, OID_AUTO, always_console_output, CTLFLAG_RWTUN,
     &always_console_output, 0, "Always output to console despite TIOCCONS");
 
 /*
  * Warn that a system table is full.
  *
  * tab names the table; it is logged at LOG_ERR followed by
  * ": table is full".
  */
 void
 tablefull(const char *tab)
 {
 
 	log(LOG_ERR, "%s: table is full\n", tab);
 }
 
 /*
  * Print a formatted message on the controlling terminal of the current
  * process.
  *
  * Returns 0 without printing when called from an idle thread, when the
  * process has no controlling terminal (P_CONTROLT clear) or when the
  * session has no tty; otherwise returns the value of kvprintf().
  */
 int
 uprintf(const char *fmt, ...)
 {
 	struct putchar_arg pca;
 	struct thread *td;
 	struct proc *p;
 	va_list ap;
 	int retval;
 
 	td = curthread;
 	if (TD_IS_IDLETHREAD(td))
 		return (0);
 
 	/* Look up the session tty under the process and session locks. */
 	sx_slock(&proctree_lock);
 	p = td->td_proc;
 	PROC_LOCK(p);
 	if ((p->p_flag & P_CONTROLT) != 0) {
 		SESS_LOCK(p->p_session);
 		pca.tty = p->p_session->s_ttyp;
 		SESS_UNLOCK(p->p_session);
 	} else
 		pca.tty = NULL;
 	PROC_UNLOCK(p);
 	if (pca.tty == NULL) {
 		sx_sunlock(&proctree_lock);
 		return (0);
 	}
 
 	pca.flags = TOTTY;
 	pca.p_bufr = NULL;
 
 	/* Acquire the tty lock before letting go of proctree_lock. */
 	tty_lock(pca.tty);
 	sx_sunlock(&proctree_lock);
 	va_start(ap, fmt);
 	retval = kvprintf(fmt, putchar, &pca, 10, ap);
 	va_end(ap);
 	tty_unlock(pca.tty);
 	return (retval);
 }
 
 /*
  * tprintf and vtprintf print on the controlling terminal associated with the
  * given process's session, possibly to the log as well.
  *
  * tprintf is the variadic front end: it collects its arguments and forwards
  * them to vtprintf().
  */
 void
 tprintf(struct proc *p, int pri, const char *fmt, ...)
 {
 	va_list ap;
 
 	va_start(ap, fmt);
 	vtprintf(p, pri, fmt, ap);
 	va_end(ap);
 }
 
 /*
  * va_list form of tprintf().
  *
  * Output goes to the log when pri is not -1, and to the session's terminal
  * when p is non-NULL, has a controlling terminal and tty_checkoutq()
  * accepts the tty.  The session is held across the print and released
  * afterwards; msgbuftrigger is always set on the way out.
  */
 void
 vtprintf(struct proc *p, int pri, const char *fmt, va_list ap)
 {
 	struct putchar_arg pca;
 	struct session *sess;
 	struct tty *tp;
 
 	sess = NULL;
 	tp = NULL;
 
 	sx_slock(&proctree_lock);
 	pca.flags = (pri != -1) ? TOLOG : 0;
 	if (p != NULL) {
 		PROC_LOCK(p);
 		if ((p->p_flag & P_CONTROLT) == 0 ||
 		    p->p_session->s_ttyvp == NULL) {
 			PROC_UNLOCK(p);
 		} else {
 			/* Pin the session before dropping the proc lock. */
 			sess = p->p_session;
 			sess_hold(sess);
 			PROC_UNLOCK(p);
 			tp = sess->s_ttyp;
 			if (tp == NULL || !tty_checkoutq(tp))
 				tp = NULL;
 			else
 				pca.flags |= TOTTY;
 		}
 	}
 	pca.pri = pri;
 	pca.tty = tp;
 	pca.p_bufr = NULL;
 
 	/* Take the tty lock (if any) before releasing proctree_lock. */
 	if (tp != NULL)
 		tty_lock(tp);
 	sx_sunlock(&proctree_lock);
 	kvprintf(fmt, putchar, &pca, 10, ap);
 	if (tp != NULL)
 		tty_unlock(tp);
 	if (sess != NULL)
 		sess_release(sess);
 	msgbuftrigger = 1;
 }
 
 /*
  * Ttyprintf formats a message straight onto tty tp.  It is meant for the
  * tty driver, or for callers that know the underlying tty cannot be
  * revoke(2)'d away; everything else should use tprintf.
  *
  * Returns the value of kvprintf().
  */
 int
 ttyprintf(struct tty *tp, const char *fmt, ...)
 {
 	struct putchar_arg pca;
 	va_list ap;
 	int retval;
 
 	/* Terminal only, unbuffered. */
 	pca.p_bufr = NULL;
 	pca.flags = TOTTY;
 	pca.tty = tp;
 
 	va_start(ap, fmt);
 	retval = kvprintf(fmt, putchar, &pca, 10, ap);
 	va_end(ap);
 	return (retval);
 }
 
 /*
  * Common back end for the console/log printers.
  *
  * level is the log priority and flags selects the destinations (TOCONS
  * and/or TOLOG).  When PRINTF_BUFR_SIZE is defined the output is staged in
  * an on-stack buffer and whatever is still pending after formatting is
  * written out here; otherwise each character is emitted as it is produced.
  *
  * Returns the value of kvprintf().
  */
 static int
 _vprintf(int level, int flags, const char *fmt, va_list ap)
 {
 #ifdef PRINTF_BUFR_SIZE
 	char bufr[PRINTF_BUFR_SIZE];
 #endif
 	struct putchar_arg pca;
 	int retval;
 
 	pca.flags = flags;
 	pca.pri = level;
 	pca.tty = NULL;
 #ifdef PRINTF_BUFR_SIZE
 	/* Start with an empty, NUL-terminated staging buffer. */
 	pca.p_bufr = bufr;
 	pca.p_next = pca.p_bufr;
 	pca.n_bufr = sizeof(bufr);
 	pca.remain = sizeof(bufr);
 	*pca.p_next = '\0';
 #else
 	/* Don't buffer console output. */
 	pca.p_bufr = NULL;
 #endif
 
 	retval = kvprintf(fmt, putchar, &pca, 10, ap);
 
 #ifdef PRINTF_BUFR_SIZE
 	/* Push out anything the last flush in putbuf() left behind. */
 	if (*pca.p_bufr != '\0') {
 		if ((pca.flags & TOLOG) != 0)
 			msglogstr(pca.p_bufr, level, /*filter_cr*/1);
 		if ((pca.flags & TOCONS) != 0)
 			cnputs(pca.p_bufr);
 	}
 #endif
 
 	return (retval);
 }
 
 /*
  * Log writes to the log buffer, and guarantees not to sleep (so can be
  * called by interrupt routines).  If there is no process reading the
  * log yet, it writes to the console also.
  *
  * This is the variadic front end; the work is done by vlog().
  */
 void
 log(int level, const char *fmt, ...)
 {
 	va_list ap;
 
 	va_start(ap, fmt);
 	vlog(level, fmt, ap);
 	va_end(ap);
 }
 
 /*
  * va_list form of log().  Always writes to the log (TOLOG) and additionally
  * to the console (TOCONS) while log_open is zero, then sets msgbuftrigger.
  */
 void
 vlog(int level, const char *fmt, va_list ap)
 {
 
 	(void)_vprintf(level, log_open ? TOLOG : TOCONS | TOLOG, fmt, ap);
 	msgbuftrigger = 1;
 }
 
 #define CONSCHUNK 128
 
 /*
  * Copy the data described by uio into the message log at priority
  * LOG_INFO | LOG_CONSOLE.  Does nothing when log_console_output is zero.
  *
  * The caller's uio is not consumed: a clone is walked in chunks of at most
  * CONSCHUNK - 1 bytes so that each chunk can be NUL-terminated.
  */
 void
 log_console(struct uio *uio)
 {
 	char *consbuffer;
 	int c, nl, pri;
 
 	if (log_console_output == 0)
 		return;
 
 	pri = LOG_INFO | LOG_CONSOLE;
 	uio = cloneuio(uio);
 	consbuffer = malloc(CONSCHUNK, M_TEMP, M_WAITOK);
 
 	for (nl = 0; uio->uio_resid > 0;) {
 		c = imin(uio->uio_resid, CONSCHUNK - 1);
 		if (uiomove(consbuffer, c, uio) != 0)
 			break;
 		/* Terminate the chunk and note whether it ends a line. */
 		consbuffer[c] = '\0';
 		nl = (consbuffer[c - 1] == '\n');
 		msglogstr(consbuffer, pri, /*filter_cr*/ 1);
 	}
 
 	/*
 	 * Historically a console write that did not end in a line feed had
 	 * one appended here.  That makes the message buffer differ from the
 	 * system console, which gets no extra line feed.
 	 *
 	 * Many programs and rc scripts finish an operation by writing a
 	 * line feed, or a period and a line feed.  The console shows
 	 *
 	 * Updating motd:.
 	 *
 	 * while 'dmesg -a' shows
 	 *
 	 * Updating motd:
 	 * .
 	 *
 	 * Rather than detect that case, the extra newline is no longer
 	 * inserted by default; set the kern.log_console_add_linefeed
 	 * sysctl/tunable to get the old behavior back.
 	 */
 	if (nl == 0 && log_console_add_linefeed != 0) {
 		consbuffer[0] = '\n';
 		consbuffer[1] = '\0';
 		msglogstr(consbuffer, pri, /*filter_cr*/ 1);
 	}
 	msgbuftrigger = 1;
 	free(uio, M_IOV);
 	free(consbuffer, M_TEMP);
 }
 
 /*
  * Kernel printf.  Collects the variadic arguments and forwards them to
  * vprintf(), whose return value is passed back unchanged.
  */
 int
 printf(const char *fmt, ...)
 {
 	va_list ap;
 	int retval;
 
 	va_start(ap, fmt);
 	retval = vprintf(fmt, ap);
 	va_end(ap);
 
 	return (retval);
 }
 
 /*
  * va_list form of printf().  Output goes to both the console and the log
  * (TOCONS | TOLOG) with a priority of -1.  msgbuftrigger is raised unless
  * a panic is in progress.
  */
 int
 vprintf(const char *fmt, va_list ap)
 {
 	int retval;
 
 	retval = _vprintf(-1, TOCONS | TOLOG, fmt, ap);
 	if (panicstr == NULL)
 		msgbuftrigger = 1;
 	return (retval);
 }
 
 /*
  * Emit one character to the console and/or log on behalf of putchar().
  *
  * Without a staging buffer the character is written immediately.  With
  * one, the character is appended (keeping the buffer NUL-terminated) and
  * the buffer is flushed on a newline or when only two bytes remain.
  */
 static void
 putbuf(int c, struct putchar_arg *ap)
 {
 	/* No buffer supplied: write straight through. */
 	if (ap->p_bufr == NULL) {
 		if (ap->flags & TOCONS)
 			cnputc(c);
 		if (ap->flags & TOLOG)
 			msglogchar(c, ap->pri);
 		return;
 	}
 
 	/* Append the character and keep the buffer terminated. */
 	*ap->p_next++ = c;
 	*ap->p_next = '\0';
 	ap->remain--;
 
 	if (c == '\n' || ap->remain == 2) {
 		if (ap->flags & TOLOG)
 			msglogstr(ap->p_bufr, ap->pri, /*filter_cr*/1);
 
 		if (ap->flags & TOCONS) {
 			/* Feed the redirected console, unless panicking. */
 			if (panicstr == NULL && constty != NULL)
 				msgbuf_addstr(&consmsgbuf, -1,
 				    ap->p_bufr, /*filter_cr*/ 0);
 
 			if (constty == NULL || always_console_output)
 				cnputs(ap->p_bufr);
 		}
 
 		/* Reset the buffer to empty. */
 		ap->p_next = ap->p_bufr;
 		ap->remain = ap->n_bufr;
 		*ap->p_next = '\0';
 	}
 
 	/*
 	 * The buffer is filled one character at a time, so the flush above
 	 * should always trigger at remain == 2 (or earlier, on a newline).
 	 * Getting here with less room means the buffer is unreasonably
 	 * small, e.g. PRINTF_BUFR_SIZE was set to 1.
 	 */
 	KASSERT(ap->remain > 2, ("Bad buffer logic, remain = %zd",
 	    ap->remain));
 }
 
 /*
  * kvprintf() output callback: send one character to the user's terminal
  * and/or to the console and message log, as selected by the flags in the
  * putchar_arg passed through arg.  NUL characters are never sent to the
  * console or log.
  */
 static void
 putchar(int c, void *arg)
 {
 	struct putchar_arg *ap = arg;
 
 	/* Don't use the tty code after a panic or while in ddb. */
 	if (kdb_active) {
 		if (c != '\0')
 			cnputc(c);
 		return;
 	}
 
 	if ((ap->flags & TOTTY) != 0 && ap->tty != NULL && panicstr == NULL)
 		tty_putchar(ap->tty, c);
 
 	if (c != '\0' && (ap->flags & (TOCONS | TOLOG)) != 0)
 		putbuf(c, ap);
 }
 
 /*
  * Scaled down version of sprintf(3).
  *
  * The output is stored directly through buf with no length limit, so the
  * caller must supply a buffer large enough for the formatted text plus the
  * terminating NUL.  Returns the number of characters stored, not counting
  * the NUL.
  */
 int
 sprintf(char *buf, const char *cfmt, ...)
 {
 	int retval;
 	va_list ap;
 
 	va_start(ap, cfmt);
 	retval = kvprintf(cfmt, NULL, (void *)buf, 10, ap);
 	buf[retval] = '\0';
 	va_end(ap);
 	return (retval);
 }
 
 /*
  * Scaled down version of vsprintf(3).
  *
  * Like sprintf(), this writes through buf without any bound; buf must be
  * large enough for the formatted text plus the terminating NUL.
  */
 int
 vsprintf(char *buf, const char *cfmt, va_list ap)
 {
 	int retval;
 
 	retval = kvprintf(cfmt, NULL, (void *)buf, 10, ap);
 	buf[retval] = '\0';
 	return (retval);
 }
 
 /*
  * Scaled down version of snprintf(3).
  *
  * Variadic front end for vsnprintf(); returns its result unchanged.
  */
 int
 snprintf(char *str, size_t size, const char *format, ...)
 {
 	int retval;
 	va_list ap;
 
 	va_start(ap, format);
 	retval = vsnprintf(str, size, format, ap);
 	va_end(ap);
 	return(retval);
 }
 
 /*
  * Scaled down version of vsnprintf(3).
  *
  * At most size - 1 characters are stored in str, followed by a NUL when
  * size is non-zero.  The return value is whatever kvprintf() reports.
  */
 int
 vsnprintf(char *str, size_t size, const char *format, va_list ap)
 {
 	struct snprintf_arg info;
 	int retval;
 
 	info.remain = size;
 	info.str = str;
 	retval = kvprintf(format, snprintf_func, &info, 10, ap);
 
 	/* snprintf_func() always leaves room for the terminator. */
 	if (info.remain != 0)
 		*info.str = '\0';
 	return (retval);
 }
 
 /*
  * Kernel variant of vsnprintf(3) that also takes the radix handed to
  * kvprintf().
  *
  * At most size - 1 characters are stored in str, followed by a NUL when
  * size is non-zero.  The return value is whatever kvprintf() reports.
  */
 int
 vsnrprintf(char *str, size_t size, int radix, const char *format, va_list ap)
 {
 	struct snprintf_arg info;
 	int retval;
 
 	info.remain = size;
 	info.str = str;
 	retval = kvprintf(format, snprintf_func, &info, radix, ap);
 
 	/* snprintf_func() always leaves room for the terminator. */
 	if (info.remain != 0)
 		*info.str = '\0';
 	return (retval);
 }
 
 /*
  * kvprintf() output callback for the bounded printers: store ch while at
  * least two bytes remain, so the last byte stays free for the NUL.
  * Characters that do not fit are dropped.
  */
 static void
 snprintf_func(int ch, void *arg)
 {
 	struct snprintf_arg *const info = arg;
 
 	if (info->remain < 2)
 		return;
 	*info->str = ch;
 	info->str++;
 	info->remain--;
 }
 
 /*
  * Convert num to ASCII digits in the given base (base <= 36), storing them
  * in nbuf in reverse order behind a leading NUL.  The returned pointer
  * addresses the last byte written, which is the most significant digit;
  * reading backwards from it yields the number and ends at the NUL.
  *
  * If lenp is non-NULL it receives the digit count.  Letters are upper-cased
  * when upper is non-zero.  nbuf must hold at least MAXNBUF bytes.
  */
 static char *
 ksprintn(char *nbuf, uintmax_t num, int base, int *lenp, int upper)
 {
 	char *cur;
 	char digit;
 
 	cur = nbuf;
 	*cur = '\0';
 	do {
 		digit = hex2ascii(num % base);
 		if (upper)
 			digit = toupper(digit);
 		cur++;
 		*cur = digit;
 		num /= base;
 	} while (num != 0);
 
 	if (lenp != NULL)
 		*lenp = cur - nbuf;
 	return (cur);
 }
 
 /*
  * Scaled down version of printf(3).
  *
  * Formats `fmt' using the arguments in `ap'.  When `func' is non-NULL every
  * output character is handed to it together with `arg'; otherwise `arg' is
  * treated as a char buffer and the characters are stored there, without
  * bounds checking and without a terminating NUL.  `radix' is the base used
  * by the %r conversion; values outside 2..36 fall back to 10.  A NULL `fmt'
  * is replaced by a fixed "(fmt null)" message.
  *
  * Returns the number of characters emitted.
  *
  * An unrecognized conversion is copied to the output verbatim, and from
  * then on the remainder of the format string is emitted literally.
  *
  * Two additional formats:
  *
  * The format %b is supported to decode error registers.
  * Its usage is:
  *
  *	printf("reg=%b\n", regval, "<base><arg>*");
  *
  * where <base> is the output base expressed as a control character, e.g.
  * \10 gives octal; \20 gives hex.  Each arg is a sequence of characters,
  * the first of which gives the bit number to be inspected (origin 1), and
  * the next characters (up to a control character, i.e. a character <= 32),
  * give the name of the register.  Thus:
  *
  *	kvprintf("reg=%b\n", 3, "\10\2BITTWO\1BITONE");
  *
  * would produce output:
  *
  *	reg=3<BITTWO,BITONE>
  *
  * XXX:  %D  -- Hexdump, takes pointer and separator string:
  *		("%6D", ptr, ":")   -> XX:XX:XX:XX:XX:XX
  *		("%*D", len, ptr, " ") -> XX XX XX XX ...
  *	 A width of zero is treated as 16 bytes.
  */
 int
 kvprintf(char const *fmt, void (*func)(int, void*), void *arg, int radix, va_list ap)
 {
 /* Emit one character through the callback or into the buffer, and count it. */
 #define PCHAR(c) {int cc=(c); if (func) (*func)(cc,arg); else *d++ = cc; retval++; }
 	char nbuf[MAXNBUF];
 	char *d;
 	const char *p, *percent, *q;
 	u_char *up;
 	int ch, n;
 	uintmax_t num;
 	int base, lflag, qflag, tmp, width, ladjust, sharpflag, neg, sign, dot;
 	int cflag, hflag, jflag, tflag, zflag;
 	int dwidth, upper;
 	char padc;
 	int stop = 0, retval = 0;
 
 	num = 0;
 	if (!func)
 		d = (char *) arg;
 	else
 		d = NULL;
 
 	if (fmt == NULL)
 		fmt = "(fmt null)\n";
 
 	if (radix < 2 || radix > 36)
 		radix = 10;
 
 	for (;;) {
 		padc = ' ';
 		width = 0;
 		/*
 		 * Copy literal text up to the next '%'.  Once `stop' is set
 		 * '%' is no longer special and everything is copied.
 		 */
 		while ((ch = (u_char)*fmt++) != '%' || stop) {
 			if (ch == '\0')
 				return (retval);
 			PCHAR(ch);
 		}
 		percent = fmt - 1;
 		qflag = 0; lflag = 0; ladjust = 0; sharpflag = 0; neg = 0;
 		sign = 0; dot = 0; dwidth = 0; upper = 0;
 		cflag = 0; hflag = 0; jflag = 0; tflag = 0; zflag = 0;
 reswitch:	switch (ch = (u_char)*fmt++) {
 		case '.':
 			dot = 1;
 			goto reswitch;
 		case '#':
 			sharpflag = 1;
 			goto reswitch;
 		case '+':
 			sign = 1;
 			goto reswitch;
 		case '-':
 			ladjust = 1;
 			goto reswitch;
 		case '%':
 			PCHAR(ch);
 			break;
 		case '*':
 			if (!dot) {
 				width = va_arg(ap, int);
 				if (width < 0) {
 					ladjust = !ladjust;
 					width = -width;
 				}
 			} else {
 				dwidth = va_arg(ap, int);
 			}
 			goto reswitch;
 		case '0':
 			if (!dot) {
 				padc = '0';
 				goto reswitch;
 			}
 			/* FALLTHROUGH: after a '.', '0' is a precision digit. */
 		case '1': case '2': case '3': case '4':
 		case '5': case '6': case '7': case '8': case '9':
 				for (n = 0;; ++fmt) {
 					n = n * 10 + ch - '0';
 					ch = *fmt;
 					if (ch < '0' || ch > '9')
 						break;
 				}
 			if (dot)
 				dwidth = n;
 			else
 				width = n;
 			goto reswitch;
 		case 'b':
 			num = (u_int)va_arg(ap, int);
 			p = va_arg(ap, char *);
 			for (q = ksprintn(nbuf, num, *p++, NULL, 0); *q;)
 				PCHAR(*q--);
 
 			if (num == 0)
 				break;
 
 			for (tmp = 0; *p;) {
 				n = *p++;
 				/*
 				 * Unsigned constant: shifting a signed 1 into
 				 * the sign bit (bit number 32) is undefined.
 				 */
 				if (num & (1U << (n - 1))) {
 					PCHAR(tmp ? ',' : '<');
 					for (; (n = *p) > ' '; ++p)
 						PCHAR(n);
 					tmp = 1;
 				} else
 					for (; *p > ' '; ++p)
 						continue;
 			}
 			if (tmp)
 				PCHAR('>');
 			break;
 		case 'c':
 			width -= 1;
 
 			if (!ladjust && width > 0)
 				while (width--)
 					PCHAR(padc);
 			PCHAR(va_arg(ap, int));
 			if (ladjust && width > 0)
 				while (width--)
 					PCHAR(padc);
 			break;
 		case 'D':
 			up = va_arg(ap, u_char *);
 			p = va_arg(ap, char *);
 			if (!width)
 				width = 16;
 			while(width--) {
 				PCHAR(hex2ascii(*up >> 4));
 				PCHAR(hex2ascii(*up & 0x0f));
 				up++;
 				/* Separator between bytes, not after the last. */
 				if (width)
 					for (q=p;*q;q++)
 						PCHAR(*q);
 			}
 			break;
 		case 'd':
 		case 'i':
 			base = 10;
 			sign = 1;
 			goto handle_sign;
 		case 'h':
 			if (hflag) {
 				hflag = 0;
 				cflag = 1;
 			} else
 				hflag = 1;
 			goto reswitch;
 		case 'j':
 			jflag = 1;
 			goto reswitch;
 		case 'l':
 			if (lflag) {
 				lflag = 0;
 				qflag = 1;
 			} else
 				lflag = 1;
 			goto reswitch;
 		case 'n':
 			if (jflag)
 				*(va_arg(ap, intmax_t *)) = retval;
 			else if (qflag)
 				*(va_arg(ap, quad_t *)) = retval;
 			else if (lflag)
 				*(va_arg(ap, long *)) = retval;
 			else if (zflag)
 				*(va_arg(ap, size_t *)) = retval;
 			else if (hflag)
 				*(va_arg(ap, short *)) = retval;
 			else if (cflag)
 				*(va_arg(ap, char *)) = retval;
 			else
 				*(va_arg(ap, int *)) = retval;
 			break;
 		case 'o':
 			base = 8;
 			goto handle_nosign;
 		case 'p':
 			base = 16;
 			sharpflag = (width == 0);
 			sign = 0;
 			num = (uintptr_t)va_arg(ap, void *);
 			goto number;
 		case 'q':
 			qflag = 1;
 			goto reswitch;
 		case 'r':
 			base = radix;
 			if (sign)
 				goto handle_sign;
 			goto handle_nosign;
 		case 's':
 			p = va_arg(ap, char *);
 			if (p == NULL)
 				p = "(null)";
 			if (!dot)
 				n = strlen (p);
 			else
 				for (n = 0; n < dwidth && p[n]; n++)
 					continue;
 
 			width -= n;
 
 			if (!ladjust && width > 0)
 				while (width--)
 					PCHAR(padc);
 			while (n--)
 				PCHAR(*p++);
 			if (ladjust && width > 0)
 				while (width--)
 					PCHAR(padc);
 			break;
 		case 't':
 			tflag = 1;
 			goto reswitch;
 		case 'u':
 			base = 10;
 			goto handle_nosign;
 		case 'X':
 			upper = 1;
 			/* FALLTHROUGH */
 		case 'x':
 			base = 16;
 			goto handle_nosign;
 		case 'y':
 			base = 16;
 			sign = 1;
 			goto handle_sign;
 		case 'z':
 			zflag = 1;
 			goto reswitch;
 handle_nosign:
 			sign = 0;
 			if (jflag)
 				num = va_arg(ap, uintmax_t);
 			else if (qflag)
 				num = va_arg(ap, u_quad_t);
 			else if (tflag)
 				num = va_arg(ap, ptrdiff_t);
 			else if (lflag)
 				num = va_arg(ap, u_long);
 			else if (zflag)
 				num = va_arg(ap, size_t);
 			else if (hflag)
 				num = (u_short)va_arg(ap, int);
 			else if (cflag)
 				num = (u_char)va_arg(ap, int);
 			else
 				num = va_arg(ap, u_int);
 			goto number;
 handle_sign:
 			if (jflag)
 				num = va_arg(ap, intmax_t);
 			else if (qflag)
 				num = va_arg(ap, quad_t);
 			else if (tflag)
 				num = va_arg(ap, ptrdiff_t);
 			else if (lflag)
 				num = va_arg(ap, long);
 			else if (zflag)
 				num = va_arg(ap, ssize_t);
 			else if (hflag)
 				num = (short)va_arg(ap, int);
 			else if (cflag)
 				num = (char)va_arg(ap, int);
 			else
 				num = va_arg(ap, int);
 number:
 			if (sign && (intmax_t)num < 0) {
 				neg = 1;
 				/*
 				 * Negate in the unsigned domain: negating
 				 * the most negative intmax_t as a signed
 				 * value would overflow.
 				 */
 				num = -num;
 			}
 			p = ksprintn(nbuf, num, base, &n, upper);
 			/* tmp counts the prefix characters ("0", "0x", "-"). */
 			tmp = 0;
 			if (sharpflag && num != 0) {
 				if (base == 8)
 					tmp++;
 				else if (base == 16)
 					tmp += 2;
 			}
 			if (neg)
 				tmp++;
 
 			/* Zero padding is expressed as extra leading digits. */
 			if (!ladjust && padc == '0')
 				dwidth = width - tmp;
 			width -= tmp + imax(dwidth, n);
 			dwidth -= n;
 			if (!ladjust)
 				while (width-- > 0)
 					PCHAR(' ');
 			if (neg)
 				PCHAR('-');
 			if (sharpflag && num != 0) {
 				if (base == 8) {
 					PCHAR('0');
 				} else if (base == 16) {
 					PCHAR('0');
 					PCHAR('x');
 				}
 			}
 			while (dwidth-- > 0)
 				PCHAR('0');
 
 			/* ksprintn() stored the digits reversed; walk back. */
 			while (*p)
 				PCHAR(*p--);
 
 			if (ladjust)
 				while (width-- > 0)
 					PCHAR(' ');
 
 			break;
 		default:
 			while (percent < fmt)
 				PCHAR(*percent++);
 			/*
 			 * Since we ignore a formatting argument it is no
 			 * longer safe to obey the remaining formatting
 			 * arguments as the arguments will no longer match
 			 * the format specs.
 			 */
 			stop = 1;
 			break;
 		}
 	}
 #undef PCHAR
 }
 
 /*
  * Append one character to the kernel message buffer, tagged with a syslog
  * priority.  Nothing is done before the buffer is mapped, and NUL and
  * carriage return are never stored.  When `pri' is not -1 and differs from
  * the priority of the line in progress, a "<pri>" prefix is written first,
  * preceded by a newline if the previous line was left unterminated.
  */
 static void
 msglogchar(int c, int pri)
 {
 	static int lastpri = -1;
 	static int dangling;
 	char digits[MAXNBUF];
 	char *dp;
 
 	if (!msgbufmapped || c == '\0' || c == '\r')
 		return;
 
 	if (pri != -1 && pri != lastpri) {
 		if (dangling) {
 			msgbuf_addchar(msgbufp, '\n');
 			dangling = 0;
 		}
 		msgbuf_addchar(msgbufp, '<');
 		/* ksprintn() yields the digits reversed; emit them backwards. */
 		dp = ksprintn(digits, (uintmax_t)pri, 10, NULL, 0);
 		while (*dp != '\0') {
 			msgbuf_addchar(msgbufp, *dp);
 			dp--;
 		}
 		msgbuf_addchar(msgbufp, '>');
 		lastpri = pri;
 	}
 
 	msgbuf_addchar(msgbufp, c);
 	if (c == '\n')
 		lastpri = -1;
 	dangling = (c != '\n');
 }
 
 /*
  * Hand a whole string to msgbuf_addstr() for the kernel message buffer,
  * passing `pri' and `filter_cr' through unchanged.  Does nothing until
  * the buffer has been mapped.
  */
 static void
 msglogstr(char *str, int pri, int filter_cr)
 {
 	if (msgbufmapped)
 		msgbuf_addstr(msgbufp, pri, str, filter_cr);
 }
 
 /*
  * Set up the kernel message buffer in the `size'-byte region at `ptr'.
  * The struct msgbuf header is placed at the end of the region and the
  * bytes in front of it become the character buffer.  If a different
  * buffer was already in use, it is handed to msgbuf_copy() together with
  * the new one.
  */
 void
 msgbufinit(void *ptr, int size)
 {
 	static struct msgbuf *oldp = NULL;
 	char *base;
 
 	base = (char *)ptr;
 	size -= sizeof(*msgbufp);
 	msgbufp = (struct msgbuf *)(base + size);
 	msgbuf_reinit(msgbufp, base, size);
 	if (msgbufmapped && oldp != msgbufp)
 		msgbuf_copy(oldp, msgbufp);
 	oldp = msgbufp;
 	msgbufmapped = 1;
 }
 
 /*
  * Read/write knob, 1 by default.  While it is non-zero
  * sysctl_kern_msgbuf() skips its PRIV_MSGBUF privilege check.
  */
 static int unprivileged_read_msgbuf = 1;
 SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_read_msgbuf,
     CTLFLAG_RW, &unprivileged_read_msgbuf, 0,
     "Unprivileged processes may read the kernel message buffer");
 
 /* Sysctls for accessing/clearing the msgbuf */
 static int
 sysctl_kern_msgbuf(SYSCTL_HANDLER_ARGS)
 {
 	char buf[128];
 	u_int seq;
 	int error, len;
 
 	if (!unprivileged_read_msgbuf) {
 		error = priv_check(req->td, PRIV_MSGBUF);
 		if (error)
 			return (error);
 	}
 
 	/* Read the whole buffer, one chunk at a time. */
 	mtx_lock(&msgbuf_lock);
 	msgbuf_peekbytes(msgbufp, NULL, 0, &seq);
 	for (;;) {
 		len = msgbuf_peekbytes(msgbufp, buf, sizeof(buf), &seq);
 		mtx_unlock(&msgbuf_lock);
 		if (len == 0)
 			return (SYSCTL_OUT(req, "", 1)); /* add nulterm */
 
 		error = sysctl_handle_opaque(oidp, buf, len, req);
 		if (error)
 			return (error);
 
 		mtx_lock(&msgbuf_lock);
 	}
 }
 
 /* Read-only string node "msgbuf" under _kern, served by sysctl_kern_msgbuf(). */
 SYSCTL_PROC(_kern, OID_AUTO, msgbuf,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
     NULL, 0, sysctl_kern_msgbuf, "A", "Contents of kernel message buffer");
 
 /* Integer set to 0 by sysctl_kern_msgbuf_clear() after it clears the buffer. */
 static int msgbuf_clearflag;
 
 /*
  * Sysctl handler for the "clear message buffer" node.  The integer is
  * processed by sysctl_handle_int(); if that succeeds and the request
  * supplies a new value, the message buffer is cleared under msgbuf_lock
  * and msgbuf_clearflag is reset to 0.
  */
 static int
 sysctl_kern_msgbuf_clear(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 
 	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 
 	mtx_lock(&msgbuf_lock);
 	msgbuf_clear(msgbufp);
 	mtx_unlock(&msgbuf_lock);
 	msgbuf_clearflag = 0;
 	return (error);
 }
 
 /*
  * Read/write integer node "msgbuf_clear" under _kern, backed by
  * msgbuf_clearflag and handled by sysctl_kern_msgbuf_clear().
  */
 SYSCTL_PROC(_kern, OID_AUTO, msgbuf_clear,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE | CTLFLAG_MPSAFE,
     &msgbuf_clearflag, 0, sysctl_kern_msgbuf_clear, "I",
     "Clear kernel message buffer");
 
 #ifdef DDB
 
 /*
  * DDB "show msgbuf" command: print the message buffer header fields and
  * then msg_size characters of the buffer starting at the read sequence
  * position, stopping early when the pager is quit.
  */
 DB_SHOW_COMMAND(msgbuf, db_show_msgbuf)
 {
 	int off, pos;
 
 	if (!msgbufmapped) {
 		db_printf("msgbuf not mapped yet\n");
 		return;
 	}
 
 	db_printf("msgbufp = %p\n", msgbufp);
 	db_printf("magic = %x, size = %d, r= %u, w = %u, ptr = %p, cksum= %u\n",
 	    msgbufp->msg_magic, msgbufp->msg_size, msgbufp->msg_rseq,
 	    msgbufp->msg_wseq, msgbufp->msg_ptr, msgbufp->msg_cksum);
 
 	off = 0;
 	while (off < msgbufp->msg_size && !db_pager_quit) {
 		pos = MSGBUF_SEQ_TO_POS(msgbufp, off + msgbufp->msg_rseq);
 		db_printf("%c", msgbufp->msg_ptr[pos]);
 		off++;
 	}
 	db_printf("\n");
 }
 
 #endif /* DDB */
 
 /*
  * Print a hex/ASCII dump of `length' bytes starting at `ptr' with printf().
  *
  * Each row optionally starts with `hdr' (when non-NULL), followed by the
  * row offset, the bytes in hex and the same bytes as printable characters
  * between '|' marks; non-printable bytes are shown as '.'.  `flags'
  * selects the layout: the HD_COLUMN_MASK bits give the number of bytes per
  * row (16 when zero), the HD_DELIM_MASK bits shifted right by 8 give the
  * character placed before each hex byte (a space when zero), and
  * HD_OMIT_COUNT, HD_OMIT_HEX and HD_OMIT_CHARS suppress the offset, hex
  * and character columns respectively.
  */
 void
 hexdump(const void *ptr, int length, const char *hdr, int flags)
 {
 	const unsigned char *bytes;
 	int row, col, idx;
 	int cols;
 	char delim;
 
 	delim = (flags & HD_DELIM_MASK) != 0 ?
 	    (flags & HD_DELIM_MASK) >> 8 : ' ';
 	cols = (flags & HD_COLUMN_MASK) != 0 ?
 	    (flags & HD_COLUMN_MASK) : 16;
 
 	bytes = ptr;
 	for (row = 0; row < length; row += cols) {
 		if (hdr != NULL)
 			printf("%s", hdr);
 
 		if (!(flags & HD_OMIT_COUNT))
 			printf("%04x  ", row);
 
 		if (!(flags & HD_OMIT_HEX)) {
 			for (col = 0, idx = row; col < cols; col++, idx++) {
 				/* Pad a short final row to full width. */
 				if (idx >= length)
 					printf("   ");
 				else
 					printf("%c%02x", delim, bytes[idx]);
 			}
 		}
 
 		if (!(flags & HD_OMIT_CHARS)) {
 			printf("  |");
 			for (col = 0, idx = row; col < cols; col++, idx++) {
 				if (idx >= length)
 					printf(" ");
 				else if (bytes[idx] < ' ' || bytes[idx] > '~')
 					printf(".");
 				else
 					printf("%c", bytes[idx]);
 			}
 			printf("|");
 		}
 		printf("\n");
 	}
 }
 #endif /* _KERNEL */
 
 /*
  * Format a hex/ASCII dump of `length' bytes starting at `ptr' into the
  * sbuf `sb' with sbuf_printf().
  *
  * Each row optionally starts with `hdr' (when non-NULL), followed by the
  * row offset, the bytes in hex and the same bytes as printable characters
  * between '|' marks; non-printable bytes are shown as '.'.  `flags'
  * selects the layout: the HD_COLUMN_MASK bits give the number of bytes per
  * row (16 when zero), the HD_DELIM_MASK bits shifted right by 8 give the
  * character placed before each hex byte (a space when zero), and
  * HD_OMIT_COUNT, HD_OMIT_HEX and HD_OMIT_CHARS suppress the offset, hex
  * and character columns respectively.
  */
 void
 sbuf_hexdump(struct sbuf *sb, const void *ptr, int length, const char *hdr,
 	     int flags)
 {
 	const unsigned char *bytes;
 	int row, col, idx;
 	int cols;
 	char delim;
 
 	delim = (flags & HD_DELIM_MASK) != 0 ?
 	    (flags & HD_DELIM_MASK) >> 8 : ' ';
 	cols = (flags & HD_COLUMN_MASK) != 0 ?
 	    (flags & HD_COLUMN_MASK) : 16;
 
 	bytes = ptr;
 	for (row = 0; row < length; row += cols) {
 		if (hdr != NULL)
 			sbuf_printf(sb, "%s", hdr);
 
 		if (!(flags & HD_OMIT_COUNT))
 			sbuf_printf(sb, "%04x  ", row);
 
 		if (!(flags & HD_OMIT_HEX)) {
 			for (col = 0, idx = row; col < cols; col++, idx++) {
 				/* Pad a short final row to full width. */
 				if (idx >= length)
 					sbuf_printf(sb, "   ");
 				else
 					sbuf_printf(sb, "%c%02x", delim,
 					    bytes[idx]);
 			}
 		}
 
 		if (!(flags & HD_OMIT_CHARS)) {
 			sbuf_printf(sb, "  |");
 			for (col = 0, idx = row; col < cols; col++, idx++) {
 				if (idx >= length)
 					sbuf_printf(sb, " ");
 				else if (bytes[idx] < ' ' || bytes[idx] > '~')
 					sbuf_printf(sb, ".");
 				else
 					sbuf_printf(sb, "%c", bytes[idx]);
 			}
 			sbuf_printf(sb, "|");
 		}
 		sbuf_printf(sb, "\n");
 	}
 }
 
 #ifdef _KERNEL
 /*
  * Log `msg' for the current thread at LOG_INFO, at most as many times as
  * *counter allows.  Each call that logs first decrements *counter with an
  * atomic compare-and-set, retrying if the value changed underneath it;
  * once the counter reads zero nothing is logged.  The message that uses up
  * the last count gets " - not logging anymore" appended.
  */
 void
 counted_warning(unsigned *counter, const char *msg)
 {
 	struct thread *td;
 	unsigned left;
 
 	do {
 		left = *counter;
 		if (left == 0)
 			return;
 	} while (!atomic_cmpset_int(counter, left, left - 1));
 
 	td = curthread;
 	log(LOG_INFO, "pid %d (%s) %s%s\n",
 	    td->td_proc->p_pid, td->td_name, msg,
 	    left > 1 ? "" : " - not logging anymore");
 }
 #endif
Index: user/alc/PQ_LAUNDRY/sys/netinet/ip_output.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/netinet/ip_output.c	(revision 307895)
+++ user/alc/PQ_LAUNDRY/sys/netinet/ip_output.c	(revision 307896)
@@ -1,1414 +1,1417 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_ipsec.h"
 #include "opt_mbuf_stress_test.h"
 #include "opt_mpath.h"
 #include "opt_route.h"
 #include "opt_sctp.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/rmlock.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/ucred.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_llatbl.h>
 #include <net/netisr.h>
 #include <net/pfil.h>
 #include <net/route.h>
 #include <net/flowtable.h>
 #ifdef RADIX_MPATH
 #include <net/radix_mpath.h>
 #endif
 #include <net/rss_config.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_rss.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #ifdef SCTP
 #include <netinet/sctp.h>
 #include <netinet/sctp_crc32.h>
 #endif
 
 #ifdef IPSEC
 #include <netinet/ip_ipsec.h>
 #include <netipsec/ipsec.h>
 #endif /* IPSEC*/
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 #ifdef MBUF_STRESS_TEST
 /*
  * Stress-test knob: when non-zero, ip_output() passes packets longer than
  * this value through m_fragment() before handing them to the interface.
  */
 static int mbuf_frag_size = 0;
 SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
 	&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
 #endif
 
 /* Called from ip_output() to loop back a copy of a multicast datagram. */
 static void	ip_mloopback(struct ifnet *, const struct mbuf *, int);
 
 
 /* Defined elsewhere; in_mcast_loop is consulted when no multicast options are given. */
 extern int in_mcast_loop;
 extern	struct protosw inetsw[];
 
 static inline int
 ip_output_pfil(struct mbuf **mp, struct ifnet *ifp, struct inpcb *inp,
     struct sockaddr_in *dst, int *fibnum, int *error)
 {
 	struct m_tag *fwd_tag = NULL;
 	struct mbuf *m;
 	struct in_addr odst;
 	struct ip *ip;
 
 	m = *mp;
 	ip = mtod(m, struct ip *);
 
 	/* Run through list of hooks for output packets. */
 	odst.s_addr = ip->ip_dst.s_addr;
 	*error = pfil_run_hooks(&V_inet_pfil_hook, mp, ifp, PFIL_OUT, inp);
 	m = *mp;
 	if ((*error) != 0 || m == NULL)
 		return 1; /* Finished */
 
 	ip = mtod(m, struct ip *);
 
 	/* See if destination IP address was changed by packet filter. */
 	if (odst.s_addr != ip->ip_dst.s_addr) {
 		m->m_flags |= M_SKIP_FIREWALL;
 		/* If destination is now ourself drop to ip_input(). */
 		if (in_localip(ip->ip_dst)) {
 			m->m_flags |= M_FASTFWD_OURS;
 			if (m->m_pkthdr.rcvif == NULL)
 				m->m_pkthdr.rcvif = V_loif;
 			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 				m->m_pkthdr.csum_flags |=
 					CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 				m->m_pkthdr.csum_data = 0xffff;
 			}
 			m->m_pkthdr.csum_flags |=
 				CSUM_IP_CHECKED | CSUM_IP_VALID;
 #ifdef SCTP
 			if (m->m_pkthdr.csum_flags & CSUM_SCTP)
 				m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 #endif
 			*error = netisr_queue(NETISR_IP, m);
 			return 1; /* Finished */
 		}
 
 		bzero(dst, sizeof(*dst));
 		dst->sin_family = AF_INET;
 		dst->sin_len = sizeof(*dst);
 		dst->sin_addr = ip->ip_dst;
 
 		return -1; /* Reloop */
 	}
 	/* See if fib was changed by packet filter. */
 	if ((*fibnum) != M_GETFIB(m)) {
 		m->m_flags |= M_SKIP_FIREWALL;
 		*fibnum = M_GETFIB(m);
 		return -1; /* Reloop for FIB change */
 	}
 
 	/* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
 	if (m->m_flags & M_FASTFWD_OURS) {
 		if (m->m_pkthdr.rcvif == NULL)
 			m->m_pkthdr.rcvif = V_loif;
 		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 			m->m_pkthdr.csum_flags |=
 				CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 			m->m_pkthdr.csum_data = 0xffff;
 		}
 #ifdef SCTP
 		if (m->m_pkthdr.csum_flags & CSUM_SCTP)
 			m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 #endif
 		m->m_pkthdr.csum_flags |=
 			CSUM_IP_CHECKED | CSUM_IP_VALID;
 
 		*error = netisr_queue(NETISR_IP, m);
 		return 1; /* Finished */
 	}
 	/* Or forward to some other address? */
 	if ((m->m_flags & M_IP_NEXTHOP) &&
 	    ((fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL)) {
 		bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
 		m->m_flags |= M_SKIP_FIREWALL;
 		m->m_flags &= ~M_IP_NEXTHOP;
 		m_tag_delete(m, fwd_tag);
 
 		return -1; /* Reloop for CHANGE of dst */
 	}
 
 	return 0;
 }
 
 /*
  * IP output.  The packet in mbuf chain m contains a skeletal IP
  * header (with len, off, ttl, proto, tos, src, dst).
  * The mbuf chain containing the packet will be freed.
  * The mbuf opt, if present, will not be freed.
  * If route ro is present and has ro_rt initialized, route lookup would be
  * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL,
  * then result of route lookup is stored in ro->ro_rt.
  *
  * In the IP forwarding case, the packet will arrive with options already
  * inserted, so must have a NULL opt pointer.
  */
 int
 ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
     struct ip_moptions *imo, struct inpcb *inp)
 {
 	struct rm_priotracker in_ifa_tracker;
 	struct ip *ip;
 	struct ifnet *ifp = NULL;	/* keep compiler happy */
 	struct mbuf *m0;
 	int hlen = sizeof (struct ip);
 	int mtu;
 	int error = 0;
 	struct sockaddr_in *dst;
 	const struct sockaddr_in *gw;
 	struct in_ifaddr *ia;
 	int isbroadcast;
 	uint16_t ip_len, ip_off;
 	struct route iproute;
 	struct rtentry *rte;	/* cache for ro->ro_rt */
 	uint32_t fibnum;
 	int have_ia_ref;
 #ifdef IPSEC
 	int no_route_but_check_spd = 0;
 #endif
 	M_ASSERTPKTHDR(m);
 
 	if (inp != NULL) {
 		INP_LOCK_ASSERT(inp);
 		M_SETFIB(m, inp->inp_inc.inc_fibnum);
 		if ((flags & IP_NODEFAULTFLOWID) == 0) {
 			m->m_pkthdr.flowid = inp->inp_flowid;
 			M_HASHTYPE_SET(m, inp->inp_flowtype);
 		}
 	}
 
 	if (ro == NULL) {
 		ro = &iproute;
 		bzero(ro, sizeof (*ro));
 	} else
 		ro->ro_flags |= RT_LLE_CACHE;
 
 #ifdef FLOWTABLE
 	if (ro->ro_rt == NULL)
 		(void )flowtable_lookup(AF_INET, m, ro);
 #endif
 
 	if (opt) {
 		int len = 0;
 		m = ip_insertoptions(m, opt, &len);
 		if (len != 0)
 			hlen = len; /* ip->ip_hl is updated above */
 	}
 	ip = mtod(m, struct ip *);
 	ip_len = ntohs(ip->ip_len);
 	ip_off = ntohs(ip->ip_off);
 
 	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
 		ip->ip_v = IPVERSION;
 		ip->ip_hl = hlen >> 2;
 		ip_fillid(ip);
 		IPSTAT_INC(ips_localout);
 	} else {
 		/* Header already set, fetch hlen from there */
 		hlen = ip->ip_hl << 2;
 	}
 
 	/*
 	 * dst/gw handling:
 	 *
 	 * dst can be rewritten but always points to &ro->ro_dst.
 	 * gw is readonly but can point either to dst OR rt_gateway,
 	 * therefore we need restore gw if we're redoing lookup.
 	 */
 	gw = dst = (struct sockaddr_in *)&ro->ro_dst;
 	fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m);
 	rte = ro->ro_rt;
 	if (rte == NULL) {
 		bzero(dst, sizeof(*dst));
 		dst->sin_family = AF_INET;
 		dst->sin_len = sizeof(*dst);
 		dst->sin_addr = ip->ip_dst;
 	}
 again:
 	/*
 	 * Validate route against routing table additions;
 	 * a better/more specific route might have been added.
 	 */
 	if (inp)
 		RT_VALIDATE(ro, &inp->inp_rt_cookie, fibnum);
 	/*
 	 * If there is a cached route,
 	 * check that it is to the same destination
 	 * and is still up.  If not, free it and try again.
 	 * The address family should also be checked in case of sharing the
 	 * cache with IPv6.
 	 * Also check whether routing cache needs invalidation.
 	 */
 	rte = ro->ro_rt;
 	if (rte && ((rte->rt_flags & RTF_UP) == 0 ||
 		    rte->rt_ifp == NULL ||
 		    !RT_LINK_IS_UP(rte->rt_ifp) ||
 			  dst->sin_family != AF_INET ||
 			  dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
 		RTFREE(rte);
 		rte = ro->ro_rt = (struct rtentry *)NULL;
 		if (ro->ro_lle)
 			LLE_FREE(ro->ro_lle);	/* zeros ro_lle */
 		ro->ro_lle = (struct llentry *)NULL;
 	}
 	ia = NULL;
 	have_ia_ref = 0;
 	/*
 	 * If routing to interface only, short circuit routing lookup.
 	 * The use of an all-ones broadcast address implies this; an
 	 * interface is specified by the broadcast address of an interface,
 	 * or the destination address of a ptp interface.
 	 */
 	if (flags & IP_SENDONES) {
 		if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst),
 						      M_GETFIB(m)))) == NULL &&
 		    (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst),
 						    M_GETFIB(m)))) == NULL) {
 			IPSTAT_INC(ips_noroute);
 			error = ENETUNREACH;
 			goto bad;
 		}
 		have_ia_ref = 1;
 		ip->ip_dst.s_addr = INADDR_BROADCAST;
 		dst->sin_addr = ip->ip_dst;
 		ifp = ia->ia_ifp;
 		ip->ip_ttl = 1;
 		isbroadcast = 1;
 	} else if (flags & IP_ROUTETOIF) {
 		if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst),
 						    M_GETFIB(m)))) == NULL &&
 		    (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0,
 						M_GETFIB(m)))) == NULL) {
 			IPSTAT_INC(ips_noroute);
 			error = ENETUNREACH;
 			goto bad;
 		}
 		have_ia_ref = 1;
 		ifp = ia->ia_ifp;
 		ip->ip_ttl = 1;
-		isbroadcast = in_ifaddr_broadcast(dst->sin_addr, ia);
+		isbroadcast = ifp->if_flags & IFF_BROADCAST ?
+		    in_ifaddr_broadcast(dst->sin_addr, ia) : 0;
 	} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
 	    imo != NULL && imo->imo_multicast_ifp != NULL) {
 		/*
 		 * Bypass the normal routing lookup for multicast
 		 * packets if the interface is specified.
 		 */
 		ifp = imo->imo_multicast_ifp;
 		IFP_TO_IA(ifp, ia, &in_ifa_tracker);
 		if (ia)
 			have_ia_ref = 1;
 		isbroadcast = 0;	/* fool gcc */
 	} else {
 		/*
 		 * We want to do any cloning requested by the link layer,
 		 * as this is probably required in all cases for correct
 		 * operation (as it is for ARP).
 		 */
 		if (rte == NULL) {
 #ifdef RADIX_MPATH
 			rtalloc_mpath_fib(ro,
 			    ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
 			    fibnum);
 #else
 			in_rtalloc_ign(ro, 0, fibnum);
 #endif
 			rte = ro->ro_rt;
 		}
 		if (rte == NULL ||
 		    (rte->rt_flags & RTF_UP) == 0 ||
 		    rte->rt_ifp == NULL ||
 		    !RT_LINK_IS_UP(rte->rt_ifp)) {
 #ifdef IPSEC
 			/*
 			 * There is no route for this packet, but it is
 			 * possible that a matching SPD entry exists.
 			 */
 			no_route_but_check_spd = 1;
 			mtu = 0; /* Silence GCC warning. */
 			goto sendit;
 #endif
 			IPSTAT_INC(ips_noroute);
 			error = EHOSTUNREACH;
 			goto bad;
 		}
 		ia = ifatoia(rte->rt_ifa);
 		ifp = rte->rt_ifp;
 		counter_u64_add(rte->rt_pksent, 1);
 		rt_update_ro_flags(ro);
 		if (rte->rt_flags & RTF_GATEWAY)
 			gw = (struct sockaddr_in *)rte->rt_gateway;
 		if (rte->rt_flags & RTF_HOST)
 			isbroadcast = (rte->rt_flags & RTF_BROADCAST);
-		else
+		else if (ifp->if_flags & IFF_BROADCAST)
 			isbroadcast = in_ifaddr_broadcast(gw->sin_addr, ia);
+		else
+			isbroadcast = 0;
 	}
 
 	/*
 	 * Calculate MTU.  If we have a route that is up, use that,
 	 * otherwise use the interface's MTU.
 	 */
 	if (rte != NULL && (rte->rt_flags & (RTF_UP|RTF_HOST)))
 		mtu = rte->rt_mtu;
 	else
 		mtu = ifp->if_mtu;
 	/* Catch a possible divide by zero later. */
 	KASSERT(mtu > 0, ("%s: mtu %d <= 0, rte=%p (rt_flags=0x%08x) ifp=%p",
 	    __func__, mtu, rte, (rte != NULL) ? rte->rt_flags : 0, ifp));
 
 	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
 		m->m_flags |= M_MCAST;
 		/*
 		 * IP destination address is multicast.  Make sure "gw"
 		 * still points to the address in "ro".  (It may have been
 		 * changed to point to a gateway address, above.)
 		 */
 		gw = dst;
 		/*
 		 * See if the caller provided any multicast options
 		 */
 		if (imo != NULL) {
 			ip->ip_ttl = imo->imo_multicast_ttl;
 			if (imo->imo_multicast_vif != -1)
 				ip->ip_src.s_addr =
 				    ip_mcast_src ?
 				    ip_mcast_src(imo->imo_multicast_vif) :
 				    INADDR_ANY;
 		} else
 			ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
 		/*
 		 * Confirm that the outgoing interface supports multicast.
 		 */
 		if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
 			if ((ifp->if_flags & IFF_MULTICAST) == 0) {
 				IPSTAT_INC(ips_noroute);
 				error = ENETUNREACH;
 				goto bad;
 			}
 		}
 		/*
 		 * If source address not specified yet, use address
 		 * of outgoing interface.
 		 */
 		if (ip->ip_src.s_addr == INADDR_ANY) {
 			/* Interface may have no addresses. */
 			if (ia != NULL)
 				ip->ip_src = IA_SIN(ia)->sin_addr;
 		}
 
 		if ((imo == NULL && in_mcast_loop) ||
 		    (imo && imo->imo_multicast_loop)) {
 			/*
 			 * Loop back multicast datagram if not expressly
 			 * forbidden to do so, even if we are not a member
 			 * of the group; ip_input() will filter it later,
 			 * thus deferring a hash lookup and mutex acquisition
 			 * at the expense of a cheap copy using m_copym().
 			 */
 			ip_mloopback(ifp, m, hlen);
 		} else {
 			/*
 			 * If we are acting as a multicast router, perform
 			 * multicast forwarding as if the packet had just
 			 * arrived on the interface to which we are about
 			 * to send.  The multicast forwarding function
 			 * recursively calls this function, using the
 			 * IP_FORWARDING flag to prevent infinite recursion.
 			 *
 			 * Multicasts that are looped back by ip_mloopback(),
 			 * above, will be forwarded by the ip_input() routine,
 			 * if necessary.
 			 */
 			if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) {
 				/*
 				 * If rsvp daemon is not running, do not
 				 * set ip_moptions. This ensures that the packet
 				 * is multicast and not just sent down one link
 				 * as prescribed by rsvpd.
 				 */
 				if (!V_rsvp_on)
 					imo = NULL;
 				if (ip_mforward &&
 				    ip_mforward(ip, ifp, m, imo) != 0) {
 					m_freem(m);
 					goto done;
 				}
 			}
 		}
 
 		/*
 		 * Multicasts with a time-to-live of zero may be looped-
 		 * back, above, but must not be transmitted on a network.
 		 * Also, multicasts addressed to the loopback interface
 		 * are not sent -- the above call to ip_mloopback() will
 		 * loop back a copy. ip_input() will drop the copy if
 		 * this host does not belong to the destination group on
 		 * the loopback interface.
 		 */
 		if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
 			m_freem(m);
 			goto done;
 		}
 
 		goto sendit;
 	}
 
 	/*
 	 * If the source address is not specified yet, use the address
 	 * of the outgoing interface.
 	 */
 	if (ip->ip_src.s_addr == INADDR_ANY) {
 		/* Interface may have no addresses. */
 		if (ia != NULL) {
 			ip->ip_src = IA_SIN(ia)->sin_addr;
 		}
 	}
 
 	/*
 	 * Look for broadcast address and
 	 * verify user is allowed to send
 	 * such a packet.
 	 */
 	if (isbroadcast) {
 		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
 			error = EADDRNOTAVAIL;
 			goto bad;
 		}
 		if ((flags & IP_ALLOWBROADCAST) == 0) {
 			error = EACCES;
 			goto bad;
 		}
 		/* don't allow broadcast messages to be fragmented */
 		if (ip_len > mtu) {
 			error = EMSGSIZE;
 			goto bad;
 		}
 		m->m_flags |= M_BCAST;
 	} else {
 		m->m_flags &= ~M_BCAST;
 	}
 
 sendit:
 #ifdef IPSEC
 	switch(ip_ipsec_output(&m, inp, &error)) {
 	case 1:
 		goto bad;
 	case -1:
 		goto done;
 	case 0:
 	default:
 		break;	/* Continue with packet processing. */
 	}
 	/*
 	 * Check if there was a route for this packet; return error if not.
 	 */
 	if (no_route_but_check_spd) {
 		IPSTAT_INC(ips_noroute);
 		error = EHOSTUNREACH;
 		goto bad;
 	}
 	/* Update variables that are affected by ipsec4_output(). */
 	ip = mtod(m, struct ip *);
 	hlen = ip->ip_hl << 2;
 #endif /* IPSEC */
 
 	/* Jump over all PFIL processing if hooks are not active. */
 	if (PFIL_HOOKED(&V_inet_pfil_hook)) {
 		switch (ip_output_pfil(&m, ifp, inp, dst, &fibnum, &error)) {
 		case 1: /* Finished */
 			goto done;
 
 		case 0: /* Continue normally */
 			ip = mtod(m, struct ip *);
 			break;
 
 		case -1: /* Need to try again */
 			/* Reset everything for a new round */
 			RO_RTFREE(ro);
 			if (have_ia_ref)
 				ifa_free(&ia->ia_ifa);
 			ro->ro_prepend = NULL;
 			rte = NULL;
 			gw = dst;
 			ip = mtod(m, struct ip *);
 			goto again;
 
 		}
 	}
 
 	/* 127/8 must not appear on wire - RFC1122. */
 	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
 	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
 		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
 			IPSTAT_INC(ips_badaddr);
 			error = EADDRNOTAVAIL;
 			goto bad;
 		}
 	}
 
 	m->m_pkthdr.csum_flags |= CSUM_IP;
 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
 		in_delayed_cksum(m);
 		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 	}
 #ifdef SCTP
 	if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
 		sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
 		m->m_pkthdr.csum_flags &= ~CSUM_SCTP;
 	}
 #endif
 
 	/*
 	 * If small enough for interface, or the interface will take
 	 * care of the fragmentation for us, we can just send directly.
 	 */
 	if (ip_len <= mtu ||
 	    (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) {
 		ip->ip_sum = 0;
 		if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
 			ip->ip_sum = in_cksum(m, hlen);
 			m->m_pkthdr.csum_flags &= ~CSUM_IP;
 		}
 
 		/*
 		 * Record statistics for this interface address.
 		 * With CSUM_TSO the byte/packet count will be slightly
 		 * incorrect because we count the IP+TCP headers only
 		 * once instead of for every generated packet.
 		 */
 		if (!(flags & IP_FORWARDING) && ia) {
 			if (m->m_pkthdr.csum_flags & CSUM_TSO)
 				counter_u64_add(ia->ia_ifa.ifa_opackets,
 				    m->m_pkthdr.len / m->m_pkthdr.tso_segsz);
 			else
 				counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
 
 			counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len);
 		}
 #ifdef MBUF_STRESS_TEST
 		if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
 			m = m_fragment(m, M_NOWAIT, mbuf_frag_size);
 #endif
 		/*
 		 * Reset layer specific mbuf flags
 		 * to avoid confusing lower layers.
 		 */
 		m_clrprotoflags(m);
 		IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
 		error = (*ifp->if_output)(ifp, m,
 		    (const struct sockaddr *)gw, ro);
 		goto done;
 	}
 
 	/* Balk when DF bit is set or the interface didn't support TSO. */
 	if ((ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) {
 		error = EMSGSIZE;
 		IPSTAT_INC(ips_cantfrag);
 		goto bad;
 	}
 
 	/*
 	 * Too large for interface; fragment if possible. If successful,
 	 * on return, m will point to a list of packets to be sent.
 	 */
 	error = ip_fragment(ip, &m, mtu, ifp->if_hwassist);
 	if (error)
 		goto bad;
 	for (; m; m = m0) {
 		m0 = m->m_nextpkt;
 		m->m_nextpkt = 0;
 		if (error == 0) {
 			/* Record statistics for this interface address. */
 			if (ia != NULL) {
 				counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
 				counter_u64_add(ia->ia_ifa.ifa_obytes,
 				    m->m_pkthdr.len);
 			}
 			/*
 			 * Reset layer specific mbuf flags
 			 * to avoid confusing lower layers.
 			 */
 			m_clrprotoflags(m);
 
 			IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
 			error = (*ifp->if_output)(ifp, m,
 			    (const struct sockaddr *)gw, ro);
 		} else
 			m_freem(m);
 	}
 
 	if (error == 0)
 		IPSTAT_INC(ips_fragmented);
 
 done:
 	if (ro == &iproute)
 		RO_RTFREE(ro);
 	else if (rte == NULL)
 		/*
 		 * If the caller supplied a route but somehow the reference
 		 * to it has been released need to prevent the caller
 		 * calling RTFREE on it again.
 		 */
 		ro->ro_rt = NULL;
 	if (have_ia_ref)
 		ifa_free(&ia->ia_ifa);
 	return (error);
 bad:
 	m_freem(m);
 	goto done;
 }
 
 /*
  * Create a chain of fragments which fit the given mtu. m_frag points to the
  * mbuf to be fragmented; on return it points to the chain with the fragments.
  * Return 0 if no error. If error, m_frag may contain a partially built
  * chain of fragments that should be freed by the caller.
  *
  * ip is the header of the datagram being split; it is rewritten in place
  * (length, offset/MF, checksum) so that it becomes the header of the first
  * fragment.
  *
  * mtu is the maximum size of one fragment including its IP header.
  *
  * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
  *
  * Errors returned:
  *	EMSGSIZE  the header has IP_DF set, or fewer than 8 payload bytes
  *		  fit into one fragment.
  *	ENOBUFS   allocating a fragment header mbuf, duplicating the packet
  *		  header, or copying the payload failed.
  * The two EMSGSIZE exits happen before anything is modified and leave
  * *m_frag untouched.
  */
 int
 ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
     u_long if_hwassist_flags)
 {
 	int error = 0;
 	int hlen = ip->ip_hl << 2;
 	/*
 	 * Payload is rounded down to a multiple of 8 because the fragment
 	 * offset stored in the header below is expressed in 8-byte units
 	 * (see the ">> 3" when mhip->ip_off is built).
 	 */
 	int len = (mtu - hlen) & ~7;	/* size of payload in each fragment */
 	int off;
 	struct mbuf *m0 = *m_frag;	/* the original packet		*/
 	int firstlen;
 	struct mbuf **mnext;
 	int nfrags;
 	uint16_t ip_len, ip_off;
 
 	/* Work on host-order copies; the header itself stays in net order. */
 	ip_len = ntohs(ip->ip_len);
 	ip_off = ntohs(ip->ip_off);
 
 	if (ip_off & IP_DF) {	/* Fragmentation not allowed */
 		IPSTAT_INC(ips_cantfrag);
 		return EMSGSIZE;
 	}
 
 	/*
 	 * Must be able to put at least 8 bytes per fragment.
 	 */
 	if (len < 8)
 		return EMSGSIZE;
 
 	/*
 	 * If the interface will not calculate checksums on
 	 * fragmented packets, then do it here.
 	 *
 	 * Note that the pending transport checksums are resolved here
 	 * without consulting if_hwassist_flags; only the per-fragment IP
 	 * header checksum further down looks at the offload capabilities.
 	 */
 	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 		in_delayed_cksum(m0);
 		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 	}
 #ifdef SCTP
 	if (m0->m_pkthdr.csum_flags & CSUM_SCTP) {
 		sctp_delayed_cksum(m0, hlen);
 		m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
 	}
 #endif
 	if (len > PAGE_SIZE) {
 		/*
 		 * Fragment large datagrams such that each segment
 		 * contains a multiple of PAGE_SIZE amount of data,
 		 * plus headers. This enables a receiver to perform
 		 * page-flipping zero-copy optimizations.
 		 *
 		 * XXX When does this help given that sender and receiver
 		 * could have different page sizes, and also mtu could
 		 * be less than the receiver's page size ?
 		 */
 		int newlen;
 
 		/* The first fragment ends at the mtu or at the packet end. */
 		off = MIN(mtu, m0->m_pkthdr.len);
 
 		/*
 		 * firstlen (off - hlen) must be aligned on an
 		 * 8-byte boundary
 		 */
 		if (off < hlen)
 			goto smart_frag_failure;
 		off = ((off - hlen) & ~7) + hlen;
 		/* Payload of the later fragments: mtu rounded down to a page. */
 		newlen = (~PAGE_MASK) & mtu;
 		if ((newlen + sizeof (struct ip)) > mtu) {
 			/* we failed, go back the default */
 smart_frag_failure:
 			/*
 			 * This label is also entered by the goto above, i.e.
 			 * from outside the enclosing if; both paths fall back
 			 * to the plain 8-byte-aligned sizes.
 			 */
 			newlen = len;
 			off = hlen + len;
 		}
 		len = newlen;
 
 	} else {
 		off = hlen + len;
 	}
 
 	/*
 	 * From here on "off" is the byte position in the original packet
 	 * (IP header included) where the next fragment's payload starts.
 	 */
 	firstlen = off - hlen;
 	mnext = &m0->m_nextpkt;		/* pointer to next packet */
 
 	/*
 	 * Loop through length of segment after first fragment,
 	 * make new header and copy data of each part and link onto chain.
 	 * Here, m0 is the original packet, m is the fragment being created.
 	 * The fragments are linked off the m_nextpkt of the original
 	 * packet, which after processing serves as the first fragment.
 	 *
 	 * nfrags starts at 1 so that the original packet is counted too.
 	 */
 	for (nfrags = 1; off < ip_len; off += len, nfrags++) {
 		struct ip *mhip;	/* ip header on the fragment */
 		struct mbuf *m;
 		int mhlen = sizeof (struct ip);
 
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL) {
 			error = ENOBUFS;
 			IPSTAT_INC(ips_odropped);
 			goto done;
 		}
 		/*
 		 * Make sure the complete packet header gets copied
 		 * from the originating mbuf to the newly created
 		 * mbuf. This also ensures that existing firewall
 		 * classification(s), VLAN tags and so on get copied
 		 * to the resulting fragmented packet(s):
 		 */
 		if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) {
 			m_free(m);
 			error = ENOBUFS;
 			IPSTAT_INC(ips_odropped);
 			goto done;
 		}
 		/*
 		 * In the first mbuf, leave room for the link header, then
 		 * copy the original IP header including options. The payload
 		 * goes into an additional mbuf chain returned by m_copym().
 		 */
 		m->m_data += max_linkhdr;
 		mhip = mtod(m, struct ip *);
 		*mhip = *ip;
 		/*
 		 * When the original header carries options, ip_optcopy()
 		 * decides how many option bytes go into this fragment, so
 		 * the fragment's header length may differ from hlen.
 		 */
 		if (hlen > sizeof (struct ip)) {
 			mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
 			mhip->ip_v = IPVERSION;
 			mhip->ip_hl = mhlen >> 2;
 		}
 		m->m_len = mhlen;
 		/* XXX do we need to add ip_off below ? */
 		/* Built in host order here; byte-swapped once MF is settled. */
 		mhip->ip_off = ((off - hlen) >> 3) + ip_off;
 		/*
 		 * The last fragment takes whatever is left and gets no
 		 * IP_MF. Shrinking len here is safe for the loop: after
 		 * "off += len" the condition "off < ip_len" fails.
 		 */
 		if (off + len >= ip_len)
 			len = ip_len - off;
 		else
 			mhip->ip_off |= IP_MF;
 		mhip->ip_len = htons((u_short)(len + mhlen));
 		m->m_next = m_copym(m0, off, len, M_NOWAIT);
 		if (m->m_next == NULL) {	/* copy failed */
 			m_free(m);
 			error = ENOBUFS;	/* ??? */
 			IPSTAT_INC(ips_odropped);
 			goto done;
 		}
 		m->m_pkthdr.len = mhlen + len;
 #ifdef MAC
 		mac_netinet_fragment(m0, m);
 #endif
 		mhip->ip_off = htons(mhip->ip_off);
 		mhip->ip_sum = 0;
 		/* Checksum the header in software unless the NIC offers CSUM_IP. */
 		if (m->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
 			mhip->ip_sum = in_cksum(m, mhlen);
 			m->m_pkthdr.csum_flags &= ~CSUM_IP;
 		}
 		/* Append to the packet list and advance the tail pointer. */
 		*mnext = m;
 		mnext = &m->m_nextpkt;
 	}
 	IPSTAT_ADD(ips_ofragments, nfrags);
 
 	/*
 	 * Update first fragment by trimming what's been copied out
 	 * and updating header.
 	 *
 	 * NOTE(review): the count passed to m_adj() is not positive here
 	 * (hlen + firstlen is the end of the first fragment); presumably
 	 * m_adj() trims from the tail for such a count -- confirm against
 	 * mbuf(9).
 	 */
 	m_adj(m0, hlen + firstlen - ip_len);
 	m0->m_pkthdr.len = hlen + firstlen;
 	ip->ip_len = htons((u_short)m0->m_pkthdr.len);
 	/* The first fragment is always marked as having more to follow. */
 	ip->ip_off = htons(ip_off | IP_MF);
 	ip->ip_sum = 0;
 	if (m0->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
 		ip->ip_sum = in_cksum(m0, hlen);
 		m0->m_pkthdr.csum_flags &= ~CSUM_IP;
 	}
 
 done:
 	/*
 	 * Also reached on the ENOBUFS paths: fragments built so far stay
 	 * linked off m0->m_nextpkt and are handed back for the caller to free.
 	 */
 	*m_frag = m0;
 	return error;
 }
 
 /*
  * Finish a transport checksum that was deferred on output.
  *
  * The checksum is computed with in_cksum_skip() over ip_len bytes of the
  * packet, skipping the IP header, and stored csum_data bytes past the end
  * of the IP header.  For packets flagged CSUM_UDP a computed value of 0 is
  * stored as 0xffff instead.
  */
 void
 in_delayed_cksum(struct mbuf *m)
 {
 	struct ip *iph = mtod(m, struct ip *);
 	uint16_t hdrlen = iph->ip_hl << 2;
 	uint16_t totlen = ntohs(iph->ip_len);
 	uint16_t sum, pos;
 
 	sum = in_cksum_skip(m, totlen, hdrlen);
 	if (sum == 0 && (m->m_pkthdr.csum_flags & CSUM_UDP) != 0)
 		sum = 0xffff;
 
 	/* Position of the checksum field, counted from the packet start. */
 	pos = hdrlen + m->m_pkthdr.csum_data;
 
 	/* Walk the chain to the mbuf that holds that position. */
 	for (; m != NULL && pos >= m->m_len; m = m->m_next)
 		pos -= m->m_len;
 
 	KASSERT(m != NULL, ("in_delayed_cksum: checksum outside mbuf chain."));
 	KASSERT(pos + sizeof(u_short) <= m->m_len, ("in_delayed_cksum: checksum split between mbufs."));
 	*(u_short *)(m->m_data + pos) = sum;
 }
 
 /*
  * IP socket option processing.
  *
  * Handles get/set requests described by sopt for the socket so.
  *
  * Requests whose level is not IPPROTO_IP fail with EINVAL, except for
  * SOL_SOCKET sets of SO_REUSEADDR, SO_REUSEPORT and SO_SETFIB: for those
  * the value already recorded on the socket is mirrored into the inpcb and
  * 0 is returned.
  *
  * For IPPROTO_IP the request is dispatched on sopt_dir and sopt_name.
  * Multicast options are forwarded to inp_setmoptions()/inp_getmoptions(),
  * IP_IPSEC_POLICY (IPSEC kernels only) to the ipsec policy routines, and
  * unknown names yield ENOPROTOOPT.
  *
  * Returns 0 on success or an errno value.
  */
 int
 ip_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	struct	inpcb *inp = sotoinpcb(so);
 	int	error, optval;
 #ifdef	RSS
 	uint32_t rss_bucket;
 	int retval;
 #endif
 
 	error = optval = 0;
 	if (sopt->sopt_level != IPPROTO_IP) {
 		/* Default answer for every level/name not picked up below. */
 		error = EINVAL;
 
 		if (sopt->sopt_level == SOL_SOCKET &&
 		    sopt->sopt_dir == SOPT_SET) {
 			/*
 			 * The values are read back from so->so_options and
 			 * so->so_fibnum, not from sopt, and copied into the
 			 * inpcb under its write lock.
 			 */
 			switch (sopt->sopt_name) {
 			case SO_REUSEADDR:
 				INP_WLOCK(inp);
 				if ((so->so_options & SO_REUSEADDR) != 0)
 					inp->inp_flags2 |= INP_REUSEADDR;
 				else
 					inp->inp_flags2 &= ~INP_REUSEADDR;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_REUSEPORT:
 				INP_WLOCK(inp);
 				if ((so->so_options & SO_REUSEPORT) != 0)
 					inp->inp_flags2 |= INP_REUSEPORT;
 				else
 					inp->inp_flags2 &= ~INP_REUSEPORT;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_SETFIB:
 				INP_WLOCK(inp);
 				inp->inp_inc.inc_fibnum = so->so_fibnum;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			default:
 				break;
 			}
 		}
 		return (error);
 	}
 
 	switch (sopt->sopt_dir) {
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 		case IP_OPTIONS:
 #ifdef notyet
 		case IP_RETOPTS:
 #endif
 		{
 			struct mbuf *m;
 			/* The option bytes have to fit into one plain mbuf. */
 			if (sopt->sopt_valsize > MLEN) {
 				error = EMSGSIZE;
 				break;
 			}
 			m = m_get(sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
 			if (m == NULL) {
 				error = ENOBUFS;
 				break;
 			}
 			m->m_len = sopt->sopt_valsize;
 			error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
 					    m->m_len);
 			if (error) {
 				m_free(m);
 				break;
 			}
 			/*
 			 * NOTE(review): m is not freed here afterwards, so
 			 * ip_pcbopts() presumably takes ownership of it in
 			 * every case -- confirm.
 			 */
 			INP_WLOCK(inp);
 			error = ip_pcbopts(inp, sopt->sopt_name, m);
 			INP_WUNLOCK(inp);
 			return (error);
 		}
 
 		case IP_BINDANY:
 			/* Privilege is only checked when a thread is attached. */
 			if (sopt->sopt_td != NULL) {
 				error = priv_check(sopt->sopt_td,
 				    PRIV_NETINET_BINDANY);
 				if (error)
 					break;
 			}
 			/* FALLTHROUGH */
 		case IP_BINDMULTI:
 #ifdef	RSS
 		case IP_RSS_LISTEN_BUCKET:
 #endif
 		case IP_TOS:
 		case IP_TTL:
 		case IP_MINTTL:
 		case IP_RECVOPTS:
 		case IP_RECVRETOPTS:
 		case IP_RECVDSTADDR:
 		case IP_RECVTTL:
 		case IP_RECVIF:
 		case IP_ONESBCAST:
 		case IP_DONTFRAG:
 		case IP_RECVTOS:
 		case IP_RECVFLOWID:
 #ifdef	RSS
 		case IP_RECVRSSBUCKETID:
 #endif
 			/* All of these take exactly one int as their value. */
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 					    sizeof optval);
 			if (error)
 				break;
 
 			/*
 			 * IP_TOS, IP_TTL and IP_MINTTL are stored without
 			 * taking the inpcb lock; the flag options go through
 			 * OPTSET/OPTSET2, which do take it.
 			 */
 			switch (sopt->sopt_name) {
 			case IP_TOS:
 				inp->inp_ip_tos = optval;
 				break;
 
 			case IP_TTL:
 				inp->inp_ip_ttl = optval;
 				break;
 
 			case IP_MINTTL:
 				if (optval >= 0 && optval <= MAXTTL)
 					inp->inp_ip_minttl = optval;
 				else
 					error = EINVAL;
 				break;
 
 /* Set or clear "bit" in inp_flags according to optval, under the lock. */
 #define	OPTSET(bit) do {						\
 	INP_WLOCK(inp);							\
 	if (optval)							\
 		inp->inp_flags |= bit;					\
 	else								\
 		inp->inp_flags &= ~bit;					\
 	INP_WUNLOCK(inp);						\
 } while (0)
 
 /* Same for inp_flags2, with the truth value passed in explicitly. */
 #define	OPTSET2(bit, val) do {						\
 	INP_WLOCK(inp);							\
 	if (val)							\
 		inp->inp_flags2 |= bit;					\
 	else								\
 		inp->inp_flags2 &= ~bit;				\
 	INP_WUNLOCK(inp);						\
 } while (0)
 
 			case IP_RECVOPTS:
 				OPTSET(INP_RECVOPTS);
 				break;
 
 			case IP_RECVRETOPTS:
 				OPTSET(INP_RECVRETOPTS);
 				break;
 
 			case IP_RECVDSTADDR:
 				OPTSET(INP_RECVDSTADDR);
 				break;
 
 			case IP_RECVTTL:
 				OPTSET(INP_RECVTTL);
 				break;
 
 			case IP_RECVIF:
 				OPTSET(INP_RECVIF);
 				break;
 
 			case IP_ONESBCAST:
 				OPTSET(INP_ONESBCAST);
 				break;
 			case IP_DONTFRAG:
 				OPTSET(INP_DONTFRAG);
 				break;
 			case IP_BINDANY:
 				OPTSET(INP_BINDANY);
 				break;
 			case IP_RECVTOS:
 				OPTSET(INP_RECVTOS);
 				break;
 			case IP_BINDMULTI:
 				OPTSET2(INP_BINDMULTI, optval);
 				break;
 			case IP_RECVFLOWID:
 				OPTSET2(INP_RECVFLOWID, optval);
 				break;
 #ifdef	RSS
 			case IP_RSS_LISTEN_BUCKET:
 				if ((optval >= 0) &&
 				    (optval < rss_getnumbuckets())) {
 					inp->inp_rss_listen_bucket = optval;
 					OPTSET2(INP_RSS_BUCKET_SET, 1);
 				} else {
 					error = EINVAL;
 				}
 				break;
 			case IP_RECVRSSBUCKETID:
 				OPTSET2(INP_RECVRSSBUCKETID, optval);
 				break;
 #endif
 			}
 			break;
 #undef OPTSET
 #undef OPTSET2
 
 		/*
 		 * Multicast socket options are processed by the in_mcast
 		 * module.
 		 */
 		case IP_MULTICAST_IF:
 		case IP_MULTICAST_VIF:
 		case IP_MULTICAST_TTL:
 		case IP_MULTICAST_LOOP:
 		case IP_ADD_MEMBERSHIP:
 		case IP_DROP_MEMBERSHIP:
 		case IP_ADD_SOURCE_MEMBERSHIP:
 		case IP_DROP_SOURCE_MEMBERSHIP:
 		case IP_BLOCK_SOURCE:
 		case IP_UNBLOCK_SOURCE:
 		case IP_MSFILTER:
 		case MCAST_JOIN_GROUP:
 		case MCAST_LEAVE_GROUP:
 		case MCAST_JOIN_SOURCE_GROUP:
 		case MCAST_LEAVE_SOURCE_GROUP:
 		case MCAST_BLOCK_SOURCE:
 		case MCAST_UNBLOCK_SOURCE:
 			error = inp_setmoptions(inp, sopt);
 			break;
 
 		case IP_PORTRANGE:
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 					    sizeof optval);
 			if (error)
 				break;
 
 			/* INP_LOWPORT and INP_HIGHPORT are never left set together. */
 			INP_WLOCK(inp);
 			switch (optval) {
 			case IP_PORTRANGE_DEFAULT:
 				inp->inp_flags &= ~(INP_LOWPORT);
 				inp->inp_flags &= ~(INP_HIGHPORT);
 				break;
 
 			case IP_PORTRANGE_HIGH:
 				inp->inp_flags &= ~(INP_LOWPORT);
 				inp->inp_flags |= INP_HIGHPORT;
 				break;
 
 			case IP_PORTRANGE_LOW:
 				inp->inp_flags &= ~(INP_HIGHPORT);
 				inp->inp_flags |= INP_LOWPORT;
 				break;
 
 			default:
 				error = EINVAL;
 				break;
 			}
 			INP_WUNLOCK(inp);
 			break;
 
 #ifdef IPSEC
 		case IP_IPSEC_POLICY:
 		{
 			caddr_t req;
 			struct mbuf *m;
 
 			/*
 			 * NOTE(review): m is not freed here when
 			 * soopt_mcopyin() fails; presumably that routine
 			 * releases the chain itself on error -- confirm.
 			 */
 			if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
 				break;
 			if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
 				break;
 			req = mtod(m, caddr_t);
 			error = ipsec_set_policy(inp, sopt->sopt_name, req,
 			    m->m_len, (sopt->sopt_td != NULL) ?
 			    sopt->sopt_td->td_ucred : NULL);
 			m_freem(m);
 			break;
 		}
 #endif /* IPSEC */
 
 		default:
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 
 	case SOPT_GET:
 		switch (sopt->sopt_name) {
 		case IP_OPTIONS:
 		case IP_RETOPTS:
 			/* No stored options: report a zero-length value. */
 			if (inp->inp_options)
 				error = sooptcopyout(sopt,
 						     mtod(inp->inp_options,
 							  char *),
 						     inp->inp_options->m_len);
 			else
 				sopt->sopt_valsize = 0;
 			break;
 
 		case IP_TOS:
 		case IP_TTL:
 		case IP_MINTTL:
 		case IP_RECVOPTS:
 		case IP_RECVRETOPTS:
 		case IP_RECVDSTADDR:
 		case IP_RECVTTL:
 		case IP_RECVIF:
 		case IP_PORTRANGE:
 		case IP_ONESBCAST:
 		case IP_DONTFRAG:
 		case IP_BINDANY:
 		case IP_RECVTOS:
 		case IP_BINDMULTI:
 		case IP_FLOWID:
 		case IP_FLOWTYPE:
 		case IP_RECVFLOWID:
 #ifdef	RSS
 		case IP_RSSBUCKETID:
 		case IP_RECVRSSBUCKETID:
 #endif
 			/* Every option in this group is returned as one int. */
 			switch (sopt->sopt_name) {
 
 			case IP_TOS:
 				optval = inp->inp_ip_tos;
 				break;
 
 			case IP_TTL:
 				optval = inp->inp_ip_ttl;
 				break;
 
 			case IP_MINTTL:
 				optval = inp->inp_ip_minttl;
 				break;
 
 /* Flag tests normalised to 0/1 for inp_flags and inp_flags2. */
 #define	OPTBIT(bit)	(inp->inp_flags & bit ? 1 : 0)
 #define	OPTBIT2(bit)	(inp->inp_flags2 & bit ? 1 : 0)
 
 			case IP_RECVOPTS:
 				optval = OPTBIT(INP_RECVOPTS);
 				break;
 
 			case IP_RECVRETOPTS:
 				optval = OPTBIT(INP_RECVRETOPTS);
 				break;
 
 			case IP_RECVDSTADDR:
 				optval = OPTBIT(INP_RECVDSTADDR);
 				break;
 
 			case IP_RECVTTL:
 				optval = OPTBIT(INP_RECVTTL);
 				break;
 
 			case IP_RECVIF:
 				optval = OPTBIT(INP_RECVIF);
 				break;
 
 			case IP_PORTRANGE:
 				if (inp->inp_flags & INP_HIGHPORT)
 					optval = IP_PORTRANGE_HIGH;
 				else if (inp->inp_flags & INP_LOWPORT)
 					optval = IP_PORTRANGE_LOW;
 				else
 					optval = 0;
 				break;
 
 			case IP_ONESBCAST:
 				optval = OPTBIT(INP_ONESBCAST);
 				break;
 			case IP_DONTFRAG:
 				optval = OPTBIT(INP_DONTFRAG);
 				break;
 			case IP_BINDANY:
 				optval = OPTBIT(INP_BINDANY);
 				break;
 			case IP_RECVTOS:
 				optval = OPTBIT(INP_RECVTOS);
 				break;
 			case IP_FLOWID:
 				optval = inp->inp_flowid;
 				break;
 			case IP_FLOWTYPE:
 				optval = inp->inp_flowtype;
 				break;
 			case IP_RECVFLOWID:
 				optval = OPTBIT2(INP_RECVFLOWID);
 				break;
 #ifdef	RSS
 			case IP_RSSBUCKETID:
 				retval = rss_hash2bucket(inp->inp_flowid,
 				    inp->inp_flowtype,
 				    &rss_bucket);
 				if (retval == 0)
 					optval = rss_bucket;
 				else
 					error = EINVAL;
 				break;
 			case IP_RECVRSSBUCKETID:
 				optval = OPTBIT2(INP_RECVRSSBUCKETID);
 				break;
 #endif
 			case IP_BINDMULTI:
 				optval = OPTBIT2(INP_BINDMULTI);
 				break;
 			}
 			/*
 			 * Only copy the value out when the lookup above
 			 * succeeded; an unconditional copyout would replace
 			 * a pending error (EINVAL from IP_RSSBUCKETID) with
 			 * its own result and hand back a stale optval.
 			 */
 			if (error == 0)
 				error = sooptcopyout(sopt, &optval,
 				    sizeof optval);
 			break;
 
 		/*
 		 * Multicast socket options are processed by the in_mcast
 		 * module.
 		 */
 		case IP_MULTICAST_IF:
 		case IP_MULTICAST_VIF:
 		case IP_MULTICAST_TTL:
 		case IP_MULTICAST_LOOP:
 		case IP_MSFILTER:
 			error = inp_getmoptions(inp, sopt);
 			break;
 
 #ifdef IPSEC
 		case IP_IPSEC_POLICY:
 		{
 			struct mbuf *m = NULL;
 			caddr_t req = NULL;
 			size_t len = 0;
 
 			/*
 			 * m starts out NULL, so this branch is never taken
 			 * and ipsec_get_policy() always sees req == NULL and
 			 * len == 0; it is kept as found.
 			 */
 			if (m != NULL) {
 				req = mtod(m, caddr_t);
 				len = m->m_len;
 			}
 			error = ipsec_get_policy(sotoinpcb(so), req, len, &m);
 			if (error == 0)
 				error = soopt_mcopyout(sopt, m); /* XXX */
 			/*
 			 * NOTE(review): m is only freed here on success;
 			 * presumably the failing callee disposes of it on
 			 * error -- confirm.
 			 */
 			if (error == 0)
 				m_freem(m);
 			break;
 		}
 #endif /* IPSEC */
 
 		default:
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 	}
 	return (error);
 }
 
 /*
  * Loop a copy of an outgoing IP multicast packet back to the input side
  * of the given interface; called from ip_output().
  *
  * The copy is handed to if_simloop(), i.e. the loopback "driver" output
  * path, with an interface pointer that might NOT be a loopback interface
  * -- evil, but easier than replicating that code here.
  *
  * If the copy cannot be allocated, or its header cannot be pulled up,
  * nothing is looped back.
  */
 static void
 ip_mloopback(struct ifnet *ifp, const struct mbuf *m, int hlen)
 {
 	struct mbuf *pkt;
 	struct ip *iph;
 
 	/*
 	 * Work on a deep copy: the checksum fields are about to be
 	 * written, and the caller's packet must stay untouched.
 	 */
 	pkt = m_dup(m, M_NOWAIT);
 	if (pkt == NULL)
 		return;
 
 	/* The IP header must be writable and sit in the first mbuf. */
 	if (!M_WRITABLE(pkt) || pkt->m_len < hlen) {
 		pkt = m_pullup(pkt, hlen);
 		if (pkt == NULL)
 			return;
 	}
 
 	/* If needed, compute the checksum and mark it as valid. */
 	if ((pkt->m_pkthdr.csum_flags & CSUM_DELAY_DATA) != 0) {
 		in_delayed_cksum(pkt);
 		pkt->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 		pkt->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 		pkt->m_pkthdr.csum_data = 0xffff;
 	}
 
 	/*
 	 * We don't bother to fragment if the IP length is greater
 	 * than the interface's MTU.  Can this possibly matter?
 	 */
 	iph = mtod(pkt, struct ip *);
 	iph->ip_sum = 0;
 	iph->ip_sum = in_cksum(pkt, hlen);
 	if_simloop(ifp, pkt, AF_INET, 0);
 }
Index: user/alc/PQ_LAUNDRY/sys/ufs/ffs/ffs_tables.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/ufs/ffs/ffs_tables.c	(revision 307895)
+++ user/alc/PQ_LAUNDRY/sys/ufs/ffs/ffs_tables.c	(revision 307896)
@@ -1,137 +1,138 @@
 /*-
  * Copyright (c) 1982, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ffs_tables.c	8.1 (Berkeley) 6/11/93
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
+#include <sys/types.h>
 #include <ufs/ufs/dinode.h>
 #include <ufs/ffs/fs.h>
 
 /*
  * Bit patterns for identifying fragments in the block map,
  * used as ((map & around[i]) == inside[i]).
  *
  * around[i] is a mask of the low i + 2 bits; inside[i] is that mask with
  * its lowest and highest bit cleared, i.e. i one-bits framed by a zero
  * bit on either side.
  */
 int around[9] = {
 	[0] = 0x003,
 	[1] = 0x007,
 	[2] = 0x00f,
 	[3] = 0x01f,
 	[4] = 0x03f,
 	[5] = 0x07f,
 	[6] = 0x0ff,
 	[7] = 0x1ff,
 	[8] = 0x3ff,
 };
 int inside[9] = {
 	[0] = 0x000,
 	[1] = 0x002,
 	[2] = 0x006,
 	[3] = 0x00e,
 	[4] = 0x01e,
 	[5] = 0x03e,
 	[6] = 0x07e,
 	[7] = 0x0fe,
 	[8] = 0x1fe,
 };
 
 /*
  * Given a block map bit pattern, the frag tables tell whether a
  * particular size fragment is available.
  *
  * used as:
  * if ((1 << (size - 1)) & fragtbl[fs->fs_frag][map]) {
  *	at least one fragment of the indicated size is available
  * }
  *
  * These tables are used by the scanc instruction on the VAX to
  * quickly find an appropriate fragment.
  */
 /*
  * Fragment table shared by the fragtbl[] slots 1, 2 and 4; one entry per
  * possible 8-bit block map pattern (256 entries).
  */
 static u_char fragtbl124[256] = {
 	0x00, 0x16, 0x16, 0x2a, 0x16, 0x16, 0x26, 0x4e,
 	0x16, 0x16, 0x16, 0x3e, 0x2a, 0x3e, 0x4e, 0x8a,
 	0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e,
 	0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e,
 	0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e,
 	0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e,
 	0x2a, 0x3e, 0x3e, 0x2a, 0x3e, 0x3e, 0x2e, 0x6e,
 	0x3e, 0x3e, 0x3e, 0x3e, 0x2a, 0x3e, 0x6e, 0xaa,
 	0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e,
 	0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e,
 	0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e,
 	0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e,
 	0x26, 0x36, 0x36, 0x2e, 0x36, 0x36, 0x26, 0x6e,
 	0x36, 0x36, 0x36, 0x3e, 0x2e, 0x3e, 0x6e, 0xae,
 	0x4e, 0x5e, 0x5e, 0x6e, 0x5e, 0x5e, 0x6e, 0x4e,
 	0x5e, 0x5e, 0x5e, 0x7e, 0x6e, 0x7e, 0x4e, 0xce,
 	0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e,
 	0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e,
 	0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e,
 	0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e,
 	0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e,
 	0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e,
 	0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e,
 	0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, 0xbe,
 	0x2a, 0x3e, 0x3e, 0x2a, 0x3e, 0x3e, 0x2e, 0x6e,
 	0x3e, 0x3e, 0x3e, 0x3e, 0x2a, 0x3e, 0x6e, 0xaa,
 	0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e,
 	0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, 0xbe,
 	0x4e, 0x5e, 0x5e, 0x6e, 0x5e, 0x5e, 0x6e, 0x4e,
 	0x5e, 0x5e, 0x5e, 0x7e, 0x6e, 0x7e, 0x4e, 0xce,
 	0x8a, 0x9e, 0x9e, 0xaa, 0x9e, 0x9e, 0xae, 0xce,
 	0x9e, 0x9e, 0x9e, 0xbe, 0xaa, 0xbe, 0xce, 0x8a,
 };
 
 /*
  * Fragment table used for the fragtbl[] slot 8; one entry per possible
  * 8-bit block map pattern (256 entries).
  */
 static u_char fragtbl8[256] = {
 	0x00, 0x01, 0x01, 0x02, 0x01, 0x01, 0x02, 0x04,
 	0x01, 0x01, 0x01, 0x03, 0x02, 0x03, 0x04, 0x08,
 	0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05,
 	0x02, 0x03, 0x03, 0x02, 0x04, 0x05, 0x08, 0x10,
 	0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05,
 	0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09,
 	0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06,
 	0x04, 0x05, 0x05, 0x06, 0x08, 0x09, 0x10, 0x20,
 	0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05,
 	0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09,
 	0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05,
 	0x03, 0x03, 0x03, 0x03, 0x05, 0x05, 0x09, 0x11,
 	0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06,
 	0x03, 0x03, 0x03, 0x03, 0x02, 0x03, 0x06, 0x0a,
 	0x04, 0x05, 0x05, 0x06, 0x05, 0x05, 0x06, 0x04,
 	0x08, 0x09, 0x09, 0x0a, 0x10, 0x11, 0x20, 0x40,
 	0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05,
 	0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09,
 	0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05,
 	0x03, 0x03, 0x03, 0x03, 0x05, 0x05, 0x09, 0x11,
 	0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05,
 	0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09,
 	0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x07,
 	0x05, 0x05, 0x05, 0x07, 0x09, 0x09, 0x11, 0x21,
 	0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06,
 	0x03, 0x03, 0x03, 0x03, 0x02, 0x03, 0x06, 0x0a,
 	0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x07,
 	0x02, 0x03, 0x03, 0x02, 0x06, 0x07, 0x0a, 0x12,
 	0x04, 0x05, 0x05, 0x06, 0x05, 0x05, 0x06, 0x04,
 	0x05, 0x05, 0x05, 0x07, 0x06, 0x07, 0x04, 0x0c,
 	0x08, 0x09, 0x09, 0x0a, 0x09, 0x09, 0x0a, 0x0c,
 	0x10, 0x11, 0x11, 0x12, 0x20, 0x21, 0x40, 0x80,
 };
 
 /*
  * The actual fragtbl array.
  *
  * Indexed by the number of fragments per block (fs->fs_frag in the usage
  * note above).  Only the slots 1, 2, 4 and 8 carry a table; the slots for
  * 0, 3, 5, 6 and 7 are null pointers.
  */
 u_char *fragtbl[MAXFRAG + 1] = {
 	0, fragtbl124, fragtbl124, 0, fragtbl124, 0, 0, 0, fragtbl8,
 };
Index: user/alc/PQ_LAUNDRY/sys/x86/include/x86_var.h
===================================================================
--- user/alc/PQ_LAUNDRY/sys/x86/include/x86_var.h	(revision 307895)
+++ user/alc/PQ_LAUNDRY/sys/x86/include/x86_var.h	(revision 307896)
@@ -1,121 +1,120 @@
 /*-
  * Copyright (c) 1995 Bruce D. Evans.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the author nor the names of contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _X86_X86_VAR_H_
 #define	_X86_X86_VAR_H_
 
 /*
  * Miscellaneous machine-dependent declarations.
  */
 
 extern	long	Maxmem;
 extern	u_int	basemem;
 extern	int	busdma_swi_pending;
 extern	u_int	cpu_exthigh;
 extern	u_int	cpu_feature;
 extern	u_int	cpu_feature2;
 extern	u_int	amd_feature;
 extern	u_int	amd_feature2;
 extern	u_int	amd_pminfo;
 extern	u_int	via_feature_rng;
 extern	u_int	via_feature_xcrypt;
 extern	u_int	cpu_clflush_line_size;
 extern	u_int	cpu_stdext_feature;
 extern	u_int	cpu_stdext_feature2;
 extern	u_int	cpu_fxsr;
 extern	u_int	cpu_high;
 extern	u_int	cpu_id;
 extern	u_int	cpu_max_ext_state_size;
 extern	u_int	cpu_mxcsr_mask;
 extern	u_int	cpu_procinfo;
 extern	u_int	cpu_procinfo2;
 extern	char	cpu_vendor[];
 extern	u_int	cpu_vendor_id;
 extern	u_int	cpu_mon_mwait_flags;
 extern	u_int	cpu_mon_min_size;
 extern	u_int	cpu_mon_max_size;
 extern	u_int	cpu_maxphyaddr;
 extern	char	ctx_switch_xsave[];
 extern	u_int	hv_high;
 extern	char	hv_vendor[];
 extern	char	kstack[];
 extern	char	sigcode[];
 extern	int	szsigcode;
 extern	int	vm_page_dump_size;
 extern	int	workaround_erratum383;
 extern	int	_udatasel;
 extern	int	_ucodesel;
 extern	int	_ucode32sel;
 extern	int	_ufssel;
 extern	int	_ugssel;
 extern	int	use_xsave;
 extern	uint64_t xsave_mask;
 
 struct	pcb;
 struct	thread;
 struct	reg;
 struct	fpreg;
 struct  dbreg;
 struct	dumperinfo;
 struct	trapframe;
 
 /*
  * The interface type of the interrupt handler entry point cannot be
  * expressed in C.  Use simplest non-variadic function type as an
  * approximation.
  */
 typedef void alias_for_inthand_t(void);
 
 void	*alloc_fpusave(int flags);
 void	busdma_swi(void);
 bool	cpu_mwait_usable(void);
 void	cpu_probe_amdc1e(void);
 void	cpu_setregs(void);
 void	dump_add_page(vm_paddr_t);
 void	dump_drop_page(vm_paddr_t);
 void	identify_cpu(void);
 void	initializecpu(void);
 void	initializecpucache(void);
 bool	fix_cpuid(void);
 void	fillw(int /*u_short*/ pat, void *base, size_t cnt);
 int	is_physical_memory(vm_paddr_t addr);
 int	isa_nmi(int cd);
-bool	nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame,
-	    bool panic);
-bool	nmi_call_kdb_smp(u_int type, struct trapframe *frame, bool panic);
-int	nmi_handle_intr(u_int type, struct trapframe *frame, bool panic);
+void	nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame);
+void	nmi_call_kdb_smp(u_int type, struct trapframe *frame);
+void	nmi_handle_intr(u_int type, struct trapframe *frame);
 void	pagecopy(void *from, void *to);
 void	printcpuinfo(void);
 int	user_dbreg_trap(void);
 int	minidumpsys(struct dumperinfo *);
 struct pcb *get_pcb_td(struct thread *td);
 
 #endif
Index: user/alc/PQ_LAUNDRY/sys/x86/x86/cpu_machdep.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/x86/x86/cpu_machdep.c	(revision 307895)
+++ user/alc/PQ_LAUNDRY/sys/x86/x86/cpu_machdep.c	(revision 307896)
@@ -1,574 +1,577 @@
 /*-
  * Copyright (c) 2003 Peter Wemm.
  * Copyright (c) 1992 Terrence R. Lambert.
  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_atpic.h"
 #include "opt_compat.h"
 #include "opt_cpu.h"
 #include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_isa.h"
 #include "opt_kdb.h"
 #include "opt_kstack_pages.h"
 #include "opt_maxmem.h"
 #include "opt_mp_watchdog.h"
 #include "opt_platform.h"
 #ifdef __i386__
 #include "opt_npx.h"
 #include "opt_apic.h"
 #include "opt_xbox.h"
 #endif
 
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/cpu.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #ifdef SMP
 #include <sys/smp.h>
 #endif
 #include <sys/sysctl.h>
 
 #include <machine/clock.h>
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <machine/specialreg.h>
 #include <machine/md_var.h>
 #include <machine/mp_watchdog.h>
 #include <machine/tss.h>
 #ifdef SMP
 #include <machine/smp.h>
 #endif
 #include <x86/acpica_machdep.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_param.h>
 
 #define	STATE_RUNNING	0x0
 #define	STATE_MWAIT	0x1
 #define	STATE_SLEEPING	0x2
 
 /*
  * Machine-dependent boot() routine.
  *
  * Intentionally empty: nothing machine-specific has been needed here so
  * far, so the howto argument is ignored.  Some work may be grafted back
  * here from boot() in the future.
  */
 void
 cpu_boot(int howto)
 {
 }
 
 /*
  * Flush the D-cache for non-DMA I/O so that the I-cache can
  * be made coherent later.
  *
  * This implementation does nothing; both ptr and len are ignored.
  */
 void
 cpu_flush_dcache(void *ptr, size_t len)
 {
 	/* Not applicable */
 }
 
 /*
  * Idle the CPU with a single "sti; hlt" sequence: interrupts are enabled
  * and the CPU is halted by the same asm statement.
  */
 void
 acpi_cpu_c1(void)
 {
 
 	__asm __volatile("sti; hlt");
 }
 
 /*
  * Idle the current CPU with MONITOR/MWAIT, passing mwait_hint through to
  * cpu_mwait().
  *
  * The per-CPU monitorbuf word must read STATE_SLEEPING on entry (this is
  * asserted).  It is set to STATE_MWAIT while waiting and is always reset
  * to STATE_RUNNING before returning.
  */
 void
 acpi_cpu_idle_mwait(uint32_t mwait_hint)
 {
 	int *state;
 
 	/*
 	 * XXXKIB.  Software coordination mode should be supported,
 	 * but all Intel CPUs provide hardware coordination.
 	 */
 
 	state = (int *)PCPU_PTR(monitorbuf);
 	KASSERT(*state == STATE_SLEEPING,
 		("cpu_mwait_cx: wrong monitorbuf state"));
 	*state = STATE_MWAIT;
 	cpu_monitor(state, 0, 0);
 	/* Skip the wait if the state word no longer reads STATE_MWAIT. */
 	if (*state == STATE_MWAIT)
 		cpu_mwait(MWAIT_INTRBREAK, mwait_hint);
 
 	/*
 	 * We should exit on any event that interrupts mwait, because
 	 * that event might be a wanted interrupt.
 	 */
 	*state = STATE_RUNNING;
 }
 
 /*
  * Get current clock frequency for the given cpu id.
  *
  * On success returns 0 and stores the estimate in *rate.  Returns EINVAL
  * when cpu_id has no pcpu entry or rate is NULL, and EOPNOTSUPP when the
  * TSC is missing (checked on i386 only) or when the TSC is invariant but
  * tsc_perf_stat is not set.
  *
  * NOTE(review): the result is presumably in Hz, assuming DELAY(1000)
  * waits one millisecond -- confirm against DELAY(9).
  */
 int
 cpu_est_clockrate(int cpu_id, uint64_t *rate)
 {
 	uint64_t tsc1, tsc2;
 	uint64_t acnt, mcnt, perf;
 	register_t reg;
 
 	if (pcpu_find(cpu_id) == NULL || rate == NULL)
 		return (EINVAL);
 #ifdef __i386__
 	if ((cpu_feature & CPUID_TSC) == 0)
 		return (EOPNOTSUPP);
 #endif
 
 	/*
 	 * If TSC is P-state invariant and APERF/MPERF MSRs do not exist,
 	 * DELAY(9) based logic fails.
 	 */
 	if (tsc_is_invariant && !tsc_perf_stat)
 		return (EOPNOTSUPP);
 
 #ifdef SMP
 	if (smp_cpus > 1) {
 		/* Schedule ourselves on the indicated cpu. */
 		thread_lock(curthread);
 		sched_bind(curthread, cpu_id);
 		thread_unlock(curthread);
 	}
 #endif
 
 	/* Calibrate by measuring a short delay. */
 	reg = intr_disable();
 	if (tsc_is_invariant) {
 		/*
 		 * Zero MPERF/APERF, sample them around the delay and scale
 		 * the TSC delta by the APERF/MPERF ratio (in thousandths).
 		 */
 		wrmsr(MSR_MPERF, 0);
 		wrmsr(MSR_APERF, 0);
 		tsc1 = rdtsc();
 		DELAY(1000);
 		mcnt = rdmsr(MSR_MPERF);
 		acnt = rdmsr(MSR_APERF);
 		tsc2 = rdtsc();
 		intr_restore(reg);
 		perf = 1000 * acnt / mcnt;
 		*rate = (tsc2 - tsc1) * perf;
 	} else {
 		/* Plain TSC delta across the delay, scaled by 1000. */
 		tsc1 = rdtsc();
 		DELAY(1000);
 		tsc2 = rdtsc();
 		intr_restore(reg);
 		*rate = (tsc2 - tsc1) * 1000;
 	}
 
 #ifdef SMP
 	/* Undo the binding made above. */
 	if (smp_cpus > 1) {
 		thread_lock(curthread);
 		sched_unbind(curthread);
 		thread_unlock(curthread);
 	}
 #endif
 
 	return (0);
 }
 
 /*
  * Shut the CPU down as far as possible: call halt() over and over and
  * never return to the caller.
  */
 void
 cpu_halt(void)
 {
 
 	while (1) {
 		halt();
 	}
 }
 
 /*
  * Report whether MONITOR/MWAIT may be used: the CPU must advertise
  * CPUID2_MON, and cpu_mon_mwait_flags must have both
  * CPUID5_MON_MWAIT_EXT and CPUID5_MWAIT_INTRBREAK set.
  */
 bool
 cpu_mwait_usable(void)
 {
 
 	/* Without the MONITOR feature bit nothing else matters. */
 	if ((cpu_feature2 & CPUID2_MON) == 0)
 		return (false);
 
 	return ((cpu_mon_mwait_flags &
 	    (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)) ==
 	    (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK));
 }
 
 void (*cpu_idle_hook)(sbintime_t) = NULL;	/* ACPI idle hook. */
 static int	cpu_ident_amdc1e = 0;	/* AMD C1E supported. */
 static int	idle_mwait = 1;		/* Use MONITOR/MWAIT for short idle. */
 SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait,
     0, "Use MONITOR/MWAIT for short idle");
 
 #ifndef PC98
 /*
  * Idle method that defers to the ACPI idle hook when one is installed.
  *
  * Marks the per-CPU monitorbuf word STATE_SLEEPING, then, with interrupts
  * disabled, either returns straight away because a thread is runnable,
  * calls cpu_idle_hook(sbt), or falls back to acpi_cpu_c1() when no hook
  * is set.  The state word is STATE_RUNNING again on return.
  */
 static void
 cpu_idle_acpi(sbintime_t sbt)
 {
 	int *state;
 
 	state = (int *)PCPU_PTR(monitorbuf);
 	*state = STATE_SLEEPING;
 
 	/* See comments in cpu_idle_hlt(). */
 	disable_intr();
 	if (sched_runnable())
 		enable_intr();
 	else if (cpu_idle_hook)
 		cpu_idle_hook(sbt);
 	else
 		acpi_cpu_c1();
 	*state = STATE_RUNNING;
 }
 #endif /* !PC98 */
 
 /*
  * Idle method based on the hlt instruction (via acpi_cpu_c1()).
  *
  * The sbt argument is not used.  The per-CPU monitorbuf word is
  * STATE_SLEEPING while idling and STATE_RUNNING on return.
  */
 static void
 cpu_idle_hlt(sbintime_t sbt)
 {
 	int *state;
 
 	state = (int *)PCPU_PTR(monitorbuf);
 	*state = STATE_SLEEPING;
 
 	/*
 	 * Since we may be in a critical section from cpu_idle(), if
 	 * an interrupt fires during that critical section we may have
 	 * a pending preemption.  If the CPU halts, then that thread
 	 * may not execute until a later interrupt awakens the CPU.
 	 * To handle this race, check for a runnable thread after
 	 * disabling interrupts and immediately return if one is
 	 * found.  Also, we must absolutely guarantee that hlt is
 	 * the next instruction after sti.  This ensures that any
 	 * interrupt that fires after the call to disable_intr() will
 	 * immediately awaken the CPU from hlt.  Finally, please note
 	 * that on x86 this works fine because interrupts are enabled only
 	 * after the instruction following sti takes place, while IF is set
 	 * to 1 immediately, allowing the hlt instruction to acknowledge the
 	 * interrupt.
 	 */
 	disable_intr();
 	if (sched_runnable())
 		enable_intr();
 	else
 		acpi_cpu_c1();
 	*state = STATE_RUNNING;
 }
 
 /*
  * Idle method based on MONITOR/MWAIT with the MWAIT_C1 hint.
  *
  * The sbt argument is not used.  The per-CPU monitorbuf word is set to
  * STATE_MWAIT, which cpu_idle_wakeup() rewrites to STATE_RUNNING to end
  * the wait; it is STATE_RUNNING on every return path.
  */
 static void
 cpu_idle_mwait(sbintime_t sbt)
 {
 	int *state;
 
 	state = (int *)PCPU_PTR(monitorbuf);
 	*state = STATE_MWAIT;
 
 	/* See comments in cpu_idle_hlt(). */
 	disable_intr();
 	if (sched_runnable()) {
 		enable_intr();
 		*state = STATE_RUNNING;
 		return;
 	}
 	cpu_monitor(state, 0, 0);
 	/*
 	 * Wait only if the state word still reads STATE_MWAIT after the
 	 * monitor was set up; otherwise just re-enable interrupts.
 	 */
 	if (*state == STATE_MWAIT)
 		__asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
 	else
 		enable_intr();
 	*state = STATE_RUNNING;
 }
 
 /*
  * Idle method that busy-waits: poll sched_runnable() up to 1000 times,
  * calling cpu_spinwait() between polls, and return as soon as a thread
  * is runnable.  The sbt argument is not used.
  */
 static void
 cpu_idle_spin(sbintime_t sbt)
 {
 	int *state;
 	int spins_left;
 
 	state = (int *)PCPU_PTR(monitorbuf);
 	*state = STATE_RUNNING;
 
 	/*
 	 * The sched_runnable() check is racy, but because it is repeated
 	 * in a loop, missing it once has little impact, if any (and it is
 	 * much better than not checking at all).
 	 */
 	spins_left = 1000;
 	while (spins_left-- > 0 && !sched_runnable())
 		cpu_spinwait();
 }
 
 /*
  * C1E renders the local APIC timer dead, so we disable it by
  * reading the Interrupt Pending Message register and clearing
  * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27).
  * 
  * Reference:
  *   "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors"
  *   #32559 revision 3.00+
  */
 #define	MSR_AMDK8_IPM		0xc0010055
 #define	AMDK8_SMIONCMPHALT	(1ULL << 27)
 #define	AMDK8_C1EONCMPHALT	(1ULL << 28)
 #define	AMDK8_CMPHALT		(AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)
 
 void
 cpu_probe_amdc1e(void)
 {
 
 	/*
 	 * Detect the presence of C1E capability mostly on latest
 	 * dual-cores (or future) k8 family.  Each test below rejects
 	 * CPUs that do not qualify; only a CPU passing all three sets
 	 * cpu_ident_amdc1e.
 	 */
 	if (cpu_vendor_id != CPU_VENDOR_AMD)
 		return;
 	if ((cpu_id & 0x00000f00) != 0x00000f00)
 		return;
 	if ((cpu_id & 0x0fff0000) < 0x00040000)
 		return;
 
 	cpu_ident_amdc1e = 1;
 }
 
 #if defined(__i386__) && defined(PC98)
 void (*cpu_idle_fn)(sbintime_t) = cpu_idle_hlt;
 #else
 void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi;
 #endif
 
 /*
  * Idle the current CPU.
  *
  * When busy is non-zero and MONITOR/MWAIT is available and enabled by the
  * idle_mwait sysctl, the short cpu_idle_mwait() path is taken directly.
  * Otherwise the method selected in cpu_idle_fn is called; when busy is
  * zero that call is bracketed by cpu_idleclock()/cpu_activeclock() inside
  * a critical section.
  */
 void
 cpu_idle(int busy)
 {
 	uint64_t msr;
 	sbintime_t sbt = -1;
 
 	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d",
 	    busy, curcpu);
 #ifdef MP_WATCHDOG
 	ap_watchdog(PCPU_GET(cpuid));
 #endif
 
 	/* If we are busy - try to use fast methods. */
 	if (busy) {
 		if ((cpu_feature2 & CPUID2_MON) && idle_mwait) {
 			/*
 			 * NOTE(review): busy is passed where an sbintime_t
 			 * is expected; cpu_idle_mwait() does not read its
 			 * argument, so the value has no effect.
 			 */
 			cpu_idle_mwait(busy);
 			goto out;
 		}
 	}
 
 	/* If we have time - switch timers into idle mode. */
 	if (!busy) {
 		critical_enter();
 		sbt = cpu_idleclock();
 	}
 
 	/* Apply AMD APIC timer C1E workaround. */
 	if (cpu_ident_amdc1e && cpu_disable_c3_sleep) {
 		msr = rdmsr(MSR_AMDK8_IPM);
 		if (msr & AMDK8_CMPHALT)
 			wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT);
 	}
 
 	/* Call main idle method. */
 	cpu_idle_fn(sbt);
 
 	/* Switch timers back into active mode. */
 	if (!busy) {
 		cpu_activeclock();
 		critical_exit();
 	}
 out:
 	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done",
 	    busy, curcpu);
 }
 
 /*
  * Try to wake the given CPU by rewriting its monitorbuf state word.
  *
  * Returns 0 when the target's state is STATE_SLEEPING (nothing is written
  * in that case) and 1 otherwise.  When the state is STATE_MWAIT it is
  * first set to STATE_RUNNING.
  *
  * NOTE(review): the result of pcpu_find() is dereferenced without a NULL
  * check, so callers presumably pass only valid CPU ids -- confirm.
  */
 int
 cpu_idle_wakeup(int cpu)
 {
 	struct pcpu *pcpu;
 	int *state;
 
 	pcpu = pcpu_find(cpu);
 	state = (int *)pcpu->pc_monitorbuf;
 	/*
 	 * This doesn't need to be atomic since missing the race will
 	 * simply result in unnecessary IPIs.
 	 */
 	if (*state == STATE_SLEEPING)
 		return (0);
 	if (*state == STATE_MWAIT)
 		*state = STATE_RUNNING;
 	return (1);
 }
 
 /*
  * Table of the selectable idle methods, terminated by a NULL/NULL entry.
  * Ordered by speed/power consumption.
  *
  * id_fn is the idle routine (stored as void * and compared against
  * cpu_idle_fn by idle_sysctl()); id_name is the name exposed through the
  * machdep.idle and machdep.idle_available sysctls.
  */
 struct {
 	void	*id_fn;
 	char	*id_name;
 } idle_tbl[] = {
 	{ cpu_idle_spin, "spin" },
 	{ cpu_idle_mwait, "mwait" },
 	{ cpu_idle_hlt, "hlt" },
 #if !defined(__i386__) || !defined(PC98)
 	{ cpu_idle_acpi, "acpi" },
 #endif
 	{ NULL, NULL }
 };
 
 static int
 idle_sysctl_available(SYSCTL_HANDLER_ARGS)
 {
 	char *avail, *p;
 	int error;
 	int i;
 
 	avail = malloc(256, M_TEMP, M_WAITOK);
 	p = avail;
 	for (i = 0; idle_tbl[i].id_name != NULL; i++) {
 		if (strstr(idle_tbl[i].id_name, "mwait") &&
 		    (cpu_feature2 & CPUID2_MON) == 0)
 			continue;
 #if !defined(__i386__) || !defined(PC98)
 		if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
 		    cpu_idle_hook == NULL)
 			continue;
 #endif
 		p += sprintf(p, "%s%s", p != avail ? ", " : "",
 		    idle_tbl[i].id_name);
 	}
 	error = sysctl_handle_string(oidp, avail, 0, req);
 	free(avail, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD,
     0, 0, idle_sysctl_available, "A", "list of available idle functions");
 
 /*
  * Sysctl handler for machdep.idle.
  *
  * Reports the name of the idle_tbl entry matching cpu_idle_fn ("unknown"
  * if none matches).  When a new value is supplied, cpu_idle_fn is
  * switched to the usable table entry with that name; EINVAL is returned
  * if no usable entry matches.
  */
 static int
 idle_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	char buf[16];
 	char *name;
 	int error, idx;
 
 	/* Find the name of the method currently in use. */
 	name = "unknown";
 	for (idx = 0; idle_tbl[idx].id_name != NULL; idx++) {
 		if (idle_tbl[idx].id_fn != cpu_idle_fn)
 			continue;
 		name = idle_tbl[idx].id_name;
 		break;
 	}
 	strncpy(buf, name, sizeof(buf));
 	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
 	if (error != 0)
 		return (error);
 	if (req->newptr == NULL)
 		return (error);
 
 	/* A new name was written into buf: look for a usable match. */
 	for (idx = 0; idle_tbl[idx].id_name != NULL; idx++) {
 		name = idle_tbl[idx].id_name;
 		if ((cpu_feature2 & CPUID2_MON) == 0 &&
 		    strstr(name, "mwait") != NULL)
 			continue;
 #if !defined(__i386__) || !defined(PC98)
 		if (cpu_idle_hook == NULL && strcmp(name, "acpi") == 0)
 			continue;
 #endif
 		if (strcmp(name, buf) == 0) {
 			cpu_idle_fn = idle_tbl[idx].id_fn;
 			return (0);
 		}
 	}
 	return (EINVAL);
 }
 
 SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0,
     idle_sysctl, "A", "currently selected idle function");
 
+static int panic_on_nmi = 1;
+SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RWTUN,
+    &panic_on_nmi, 0,
+    "Panic on NMI");
 int nmi_is_broadcast = 1;
 SYSCTL_INT(_machdep, OID_AUTO, nmi_is_broadcast, CTLFLAG_RWTUN,
     &nmi_is_broadcast, 0,
     "Chipset NMI is broadcast");
 #ifdef KDB
 int kdb_on_nmi = 1;
 SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RWTUN,
     &kdb_on_nmi, 0,
     "Go to KDB on NMI");
 #endif
 
 #ifdef DEV_ISA
-bool
-nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame, bool do_panic)
+void
+nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame)
 {
 
 	/* machine/parity/power fail/"kitchen sink" faults */
 	if (isa_nmi(frame->tf_err) == 0) {
 #ifdef KDB
 		/*
 		 * NMI can be hooked up to a pushbutton for debugging.
 		 */
 		if (kdb_on_nmi) {
 			printf ("NMI/cpu%d ... going to debugger\n", cpu);
 			kdb_trap(type, 0, frame);
-			return (true);
 		}
-	} else
 #endif /* KDB */
-	if (do_panic)
+	} else if (panic_on_nmi) {
 		panic("NMI indicates hardware failure");
-	return (false);
+	}
 }
 #endif
 
-int
-nmi_handle_intr(u_int type, struct trapframe *frame, bool panic)
+void
+nmi_handle_intr(u_int type, struct trapframe *frame)
 {
 
 #ifdef DEV_ISA
 #ifdef SMP
-	if (nmi_is_broadcast)
-		return (nmi_call_kdb_smp(type, frame, panic));
-	else
+	if (nmi_is_broadcast) {
+		nmi_call_kdb_smp(type, frame);
+		return;
+	}
 #endif
-		return (nmi_call_kdb(0, type, frame, panic));
+	nmi_call_kdb(0, type, frame);
 #endif
 }
Index: user/alc/PQ_LAUNDRY/sys/x86/x86/mp_x86.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/x86/x86/mp_x86.c	(revision 307895)
+++ user/alc/PQ_LAUNDRY/sys/x86/x86/mp_x86.c	(revision 307896)
@@ -1,1589 +1,1587 @@
 /*-
  * Copyright (c) 1996, by Steve Passe
  * Copyright (c) 2003, by Peter Wemm
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. The name of the developer may NOT be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #ifdef __i386__
 #include "opt_apic.h"
 #endif
 #include "opt_cpu.h"
 #include "opt_isa.h"
 #include "opt_kstack_pages.h"
 #include "opt_pmap.h"
 #include "opt_sched.h"
 #include "opt_smp.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/cons.h>	/* cngetc() */
 #include <sys/cpuset.h>
 #ifdef GPROF 
 #include <sys/gmon.h>
 #endif
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/memrange.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 
 #include <x86/apicreg.h>
 #include <machine/clock.h>
 #include <machine/cputypes.h>
 #include <x86/mca.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #include <machine/psl.h>
 #include <machine/smp.h>
 #include <machine/specialreg.h>
 #include <machine/cpu.h>
 
 #define WARMBOOT_TARGET		0
 #define WARMBOOT_OFF		(KERNBASE + 0x0467)
 #define WARMBOOT_SEG		(KERNBASE + 0x0469)
 
 #define CMOS_REG		(0x70)
 #define CMOS_DATA		(0x71)
 #define BIOS_RESET		(0x0f)
 #define BIOS_WARM		(0x0a)
 
 /* lock region used by kernel profiling */
 int	mcount_lock;
 
 int	mp_naps;		/* # of Applications processors */
 int	boot_cpu_id = -1;	/* designated BSP */
 
 extern	struct pcpu __pcpu[];
 
 /* AP uses this during bootstrap.  Do not staticize.  */
 char *bootSTK;
 int bootAP;
 
 /* Free these after use */
 void *bootstacks[MAXCPU];
 void *dpcpu;
 
 struct pcb stoppcbs[MAXCPU];
 struct susppcb **susppcbs;
 
 #ifdef COUNT_IPIS
 /* Interrupt counts. */
 static u_long *ipi_preempt_counts[MAXCPU];
 static u_long *ipi_ast_counts[MAXCPU];
 u_long *ipi_invltlb_counts[MAXCPU];
 u_long *ipi_invlrng_counts[MAXCPU];
 u_long *ipi_invlpg_counts[MAXCPU];
 u_long *ipi_invlcache_counts[MAXCPU];
 u_long *ipi_rendezvous_counts[MAXCPU];
 static u_long *ipi_hardclock_counts[MAXCPU];
 #endif
 
 /* Default cpu_ops implementation. */
 struct cpu_ops cpu_ops;
 
 /*
  * Local data and functions.
  */
 
 static volatile cpuset_t ipi_stop_nmi_pending;
 
 /* used to hold the AP's until we are ready to release them */
 struct mtx ap_boot_mtx;
 
 /* Set to 1 once we're ready to let the APs out of the pen. */
 volatile int aps_ready = 0;
 
 /*
  * Store data from cpu_add() until later in the boot when we actually setup
  * the APs.
  */
 struct cpu_info cpu_info[MAX_APIC_ID + 1];
 int apic_cpuids[MAX_APIC_ID + 1];
 int cpu_apic_ids[MAXCPU];
 
 /* Holds pending bitmap based IPIs per CPU */
 volatile u_int cpu_ipi_pending[MAXCPU];
 
 static void	release_aps(void *dummy);
 static void	cpustop_handler_post(u_int cpu);
 
 static int	hyperthreading_allowed = 1;
 SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN,
 	&hyperthreading_allowed, 0, "Use Intel HTT logical CPUs");
 
 static struct topo_node topo_root;
 
 static int pkg_id_shift;
 static int core_id_shift;
 static int disabled_cpus;
 
 struct cache_info {
 	int	id_shift;
 	int	present;
 } static caches[MAX_CACHE_LEVELS];
 
 /*
  * Call the memory-range driver's initAP method on mem_range_softc, if
  * both the operations table and that method are set.
  */
 void
 mem_range_AP_init(void)
 {
 
 	if (mem_range_softc.mr_op == NULL ||
 	    mem_range_softc.mr_op->initAP == NULL)
 		return;
 
 	mem_range_softc.mr_op->initAP(&mem_range_softc);
 }
 
 /*
  * Round x up to the next power of two, if it is not one already, and
  * return log2 of the result.
  * Returns -1 if the argument is zero.
  */
 static __inline int
 mask_width(u_int x)
 {
 	u_int rounded;
 
 	/*
 	 * Doubling a value that is not a power of two moves its highest
 	 * set bit up by one, which is what rounding up would do.
 	 */
 	rounded = powerof2(x) ? x : x << 1;
 	return (fls(rounded) - 1);
 }
 
 /*
  * Add a cache level to the cache topology description.
  */
 static int
 add_deterministic_cache(int type, int level, int share_count)
 {
 
 	if (type == 0)
 		return (0);
 	if (type > 3) {
 		printf("unexpected cache type %d\n", type);
 		return (1);
 	}
 	if (type == 2) /* ignore instruction cache */
 		return (1);
 	if (level == 0 || level > MAX_CACHE_LEVELS) {
 		printf("unexpected cache level %d\n", type);
 		return (1);
 	}
 
 	if (caches[level - 1].present) {
 		printf("WARNING: multiple entries for L%u data cache\n", level);
 		printf("%u => %u\n", caches[level - 1].id_shift,
 		    mask_width(share_count));
 	}
 	caches[level - 1].id_shift = mask_width(share_count);
 	caches[level - 1].present = 1;
 
 	if (caches[level - 1].id_shift > pkg_id_shift) {
 		printf("WARNING: L%u data cache covers more "
 		    "APIC IDs than a package\n", level);
 		printf("%u > %u\n", caches[level - 1].id_shift, pkg_id_shift);
 		caches[level - 1].id_shift = pkg_id_shift;
 	}
 	if (caches[level - 1].id_shift < core_id_shift) {
 		printf("WARNING: L%u data cache covers less "
 		    "APIC IDs than a core\n", level);
 		printf("%u < %u\n", caches[level - 1].id_shift, core_id_shift);
 		caches[level - 1].id_shift = core_id_shift;
 	}
 
 	return (1);
 }
 
 /*
  * Determine topology of processing units and caches for AMD CPUs.
  * See:
  *  - AMD CPUID Specification (Publication # 25481)
  *  - BKDG For AMD Family 10h Processors (Publication # 31116), section 2.15
  *  - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559)
  * XXX At the moment the code does not recognize grouping of AMD CMT threads,
  * if supported, into cores, so each thread is treated as being in its own
  * core.  In other words, each logical CPU is considered to be a core.
  *
  * Sets pkg_id_shift and fills in caches[]; core_id_shift is not touched.
  */
 static void
 topo_probe_amd(void)
 {
 	u_int p[4];
 	int level;
 	int share_count;
 	int type;
 	int i;
 
 	/* No multi-core capability. */
 	if ((amd_feature2 & AMDID2_CMP) == 0)
 		return;
 
 	/* For families 10h and newer. */
 	pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
 	    AMDID_COREID_SIZE_SHIFT;
 
 	/* For 0Fh family. */
 	if (pkg_id_shift == 0)
 		pkg_id_shift =
 		    mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1);
 
 	if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) {
 		/*
 		 * Walk the subleaves of CPUID 0x8000001d until
 		 * add_deterministic_cache() reports the end of the list.
 		 */
 		for (i = 0; ; i++) {
 			cpuid_count(0x8000001d, i, p);
 			/* p[0]: type in bits 4:0, level in bits 7:5. */
 			type = p[0] & 0x1f;
 			level = (p[0] >> 5) & 0x7;
 			/* p[0] bits 25:14 hold the sharing count minus one. */
 			share_count = 1 + ((p[0] >> 14) & 0xfff);
 
 			if (!add_deterministic_cache(type, level, share_count))
 				break;
 		}
 	} else {
 		/*
 		 * Without the topology extension, only record which cache
 		 * levels exist, based on non-zero fields in CPUID
 		 * 0x80000005 and 0x80000006.
 		 */
 		if (cpu_exthigh >= 0x80000005) {
 			cpuid_count(0x80000005, 0, p);
 			if (((p[2] >> 24) & 0xff) != 0) {
 				caches[0].id_shift = 0;
 				caches[0].present = 1;
 			}
 		}
 		if (cpu_exthigh >= 0x80000006) {
 			cpuid_count(0x80000006, 0, p);
 			if (((p[2] >> 16) & 0xffff) != 0) {
 				caches[1].id_shift = 0;
 				caches[1].present = 1;
 			}
 			if (((p[3] >> 18) & 0x3fff) != 0) {
 
 				/*
 				 * TODO: Account for dual-node processors
 				 * where each node within a package has its own
 				 * L3 cache.
 				 */
 				caches[2].id_shift = pkg_id_shift;
 				caches[2].present = 1;
 			}
 		}
 	}
 }
 
 /*
  * Determine topology of processing units for Intel CPUs
  * using CPUID Leaf 1 and Leaf 4, if supported.
  * See:
  *  - Intel 64 Architecture Processor Topology Enumeration
  *  - Intel 64 and IA-32 Architectures Software Developer’s Manual,
  *    Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
  *    FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
  *
  * Sets core_id_shift and pkg_id_shift; leaves both untouched when the
  * package reports at most one logical processor.
  */
 static void
 topo_probe_intel_0x4(void)
 {
 	u_int p[4];
 	int max_cores;
 	int max_logical;
 
 	/* Both zero and one here mean one logical processor per package. */
 	max_logical = (cpu_feature & CPUID_HTT) != 0 ?
 	    (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
 	if (max_logical <= 1)
 		return;
 
 	if (cpu_high >= 0x4) {
 		/* Bits 31:26 of p[0] hold the core count minus one. */
 		cpuid_count(0x04, 0, p);
 		max_cores = ((p[0] >> 26) & 0x3f) + 1;
 	} else
 		max_cores = 1;
 
 	/*
 	 * mask_width() returns -1 for a zero argument, which happens
 	 * here when max_cores exceeds max_logical.
 	 */
 	core_id_shift = mask_width(max_logical/max_cores);
 	KASSERT(core_id_shift >= 0,
 	    ("intel topo: max_cores > max_logical\n"));
 	pkg_id_shift = core_id_shift + mask_width(max_cores);
 }
 
 /*
  * Determine topology of processing units for Intel CPUs
  * using CPUID Leaf 11, if supported.
  * See:
  *  - Intel 64 Architecture Processor Topology Enumeration
  *  - Intel 64 and IA-32 Architectures Software Developer’s Manual,
  *    Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
  *    FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
  *
  * Sets core_id_shift and pkg_id_shift from the SMT and core levels, or
  * delegates to topo_probe_intel_0x4() when leaf 11 returns no data.
  */
 static void
 topo_probe_intel_0xb(void)
 {
 	u_int p[4];
 	int bits;
 	int type;
 	int i;
 
 	/* Fall back if CPU leaf 11 doesn't really exist. */
 	cpuid_count(0x0b, 0, p);
 	if (p[1] == 0) {
 		topo_probe_intel_0x4();
 		return;
 	}
 
 	/* We only support three levels for now. */
 	for (i = 0; ; i++) {
 		cpuid_count(0x0b, i, p);
 
 		/* Shift width in p[0] bits 4:0, level type in p[2] bits 15:8. */
 		bits = p[0] & 0x1f;
 		type = (p[2] >> 8) & 0xff;
 
 		/* A level type of zero ends the enumeration. */
 		if (type == 0)
 			break;
 
 		/* TODO: check for duplicate (re-)assignment */
 		if (type == CPUID_TYPE_SMT)
 			core_id_shift = bits;
 		else if (type == CPUID_TYPE_CORE)
 			pkg_id_shift = bits;
 		else
 			printf("unknown CPU level type %d\n", type);
 	}
 
 	/* Keep the invariant pkg_id_shift >= core_id_shift. */
 	if (pkg_id_shift < core_id_shift) {
 		printf("WARNING: core covers more APIC IDs than a package\n");
 		core_id_shift = pkg_id_shift;
 	}
 }
 
 /*
  * Determine topology of caches for Intel CPUs.
  * See:
  *  - Intel 64 Architecture Processor Topology Enumeration
  *  - Intel 64 and IA-32 Architectures Software Developer’s Manual
  *    Volume 2A: Instruction Set Reference, A-M,
  *    CPUID instruction
  *
  * Fills in caches[], either from CPUID leaf 4 or, when that leaf is not
  * available, with a fixed two-level assumption.
  */
 static void
 topo_probe_intel_caches(void)
 {
 	u_int p[4];
 	int level;
 	int share_count;
 	int type;
 	int i;
 
 	if (cpu_high < 0x4) {
 		/*
 		 * Available cache level and sizes can be determined
 		 * via CPUID leaf 2, but that requires a huge table of hardcoded
 		 * values, so for now just assume L1 and L2 caches potentially
 		 * shared only by HTT processing units, if HTT is present.
 		 */
 		caches[0].id_shift = pkg_id_shift;
 		caches[0].present = 1;
 		caches[1].id_shift = pkg_id_shift;
 		caches[1].present = 1;
 		return;
 	}
 
 	/*
 	 * Feed successive leaf 4 subleaves to add_deterministic_cache()
 	 * until it returns zero.
 	 */
 	i = 0;
 	do {
 		cpuid_count(0x4, i, p);
 		i++;
 		type = p[0] & 0x1f;
 		level = (p[0] >> 5) & 0x7;
 		share_count = 1 + ((p[0] >> 14) & 0xfff);
 	} while (add_deterministic_cache(type, level, share_count));
 }
 
 /*
  * Determine topology of processing units and caches for Intel CPUs.
  * See:
  *  - Intel 64 Architecture Processor Topology Enumeration
  */
 static void
 topo_probe_intel(void)
 {
 
 	/*
 	 * Leaf 11 is preferred when the CPU reports it; any CPU with
 	 * at least leaf 1 uses the leaf 1/4 probe instead.
 	 *
 	 * Note that 0x1 <= cpu_high < 4 case should be
 	 * compatible with topo_probe_intel_0x4() logic when
 	 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
 	 * or it should trigger the fallback otherwise.
 	 */
 	if (cpu_high >= 0x1) {
 		if (cpu_high >= 0xb)
 			topo_probe_intel_0xb();
 		else
 			topo_probe_intel_0x4();
 	}
 
 	/* The cache topology is probed regardless of the path above. */
 	topo_probe_intel_caches();
 }
 
 /*
  * Topology information is queried only on BSP, on which this
  * code runs and for which it can query CPUID information.
  * Then topology is extrapolated on all packages using an
  * assumption that APIC ID to hardware component ID mapping is
  * homogeneous.
  * That doesn't necessarily imply that the topology is uniform.
  *
  * The work is done once; later calls return immediately.  The result is
  * the tree rooted at topo_root.
  */
 void
 topo_probe(void)
 {
 	static int cpu_topo_probed = 0;
 	/*
 	 * One entry per tree level, outermost first: the package, each
 	 * present cache, an optional core level and the processing unit.
 	 */
 	struct x86_topo_layer {
 		int type;
 		int subtype;
 		int id_shift;
 	} topo_layers[MAX_CACHE_LEVELS + 3];
 	struct topo_node *parent;
 	struct topo_node *node;
 	int layer;
 	int nlayers;
 	int node_id;
 	int i;
 
 	if (cpu_topo_probed)
 		return;
 
 	CPU_ZERO(&logical_cpus_mask);
 
 	/* Vendor-specific probing sets the ID shifts and caches[]. */
 	if (mp_ncpus <= 1)
 		; /* nothing */
 	else if (cpu_vendor_id == CPU_VENDOR_AMD)
 		topo_probe_amd();
 	else if (cpu_vendor_id == CPU_VENDOR_INTEL)
 		topo_probe_intel();
 
 	KASSERT(pkg_id_shift >= core_id_shift,
 	    ("bug in APIC topology discovery"));
 
 	nlayers = 0;
 	bzero(topo_layers, sizeof(topo_layers));
 
 	topo_layers[nlayers].type = TOPO_TYPE_PKG;
 	topo_layers[nlayers].id_shift = pkg_id_shift;
 	if (bootverbose)
 		printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift);
 	nlayers++;
 
 	/*
 	 * Consider all caches to be within a package/chip
 	 * and "in front" of all sub-components like
 	 * cores and hardware threads.
 	 */
 	for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) {
 		if (caches[i].present) {
 			KASSERT(caches[i].id_shift <= pkg_id_shift,
 				("bug in APIC topology discovery"));
 			KASSERT(caches[i].id_shift >= core_id_shift,
 				("bug in APIC topology discovery"));
 
 			topo_layers[nlayers].type = TOPO_TYPE_CACHE;
 			topo_layers[nlayers].subtype = i + 1;
 			topo_layers[nlayers].id_shift = caches[i].id_shift;
 			if (bootverbose)
 				printf("L%u cache ID shift: %u\n",
 				    topo_layers[nlayers].subtype,
 				    topo_layers[nlayers].id_shift);
 			nlayers++;
 		}
 	}
 
 	/* A separate core level exists only if it differs from the package. */
 	if (pkg_id_shift > core_id_shift) {
 		topo_layers[nlayers].type = TOPO_TYPE_CORE;
 		topo_layers[nlayers].id_shift = core_id_shift;
 		if (bootverbose)
 			printf("Core ID shift: %u\n",
 			    topo_layers[nlayers].id_shift);
 		nlayers++;
 	}
 
 	topo_layers[nlayers].type = TOPO_TYPE_PU;
 	topo_layers[nlayers].id_shift = 0;
 	nlayers++;
 
 	/*
 	 * Insert every present APIC ID into the tree, descending one
 	 * layer at a time; the node ID at each layer is the APIC ID
 	 * shifted right by that layer's ID shift.
 	 */
 	topo_init_root(&topo_root);
 	for (i = 0; i <= MAX_APIC_ID; ++i) {
 		if (!cpu_info[i].cpu_present)
 			continue;
 
 		parent = &topo_root;
 		for (layer = 0; layer < nlayers; ++layer) {
 			node_id = i >> topo_layers[layer].id_shift;
 			parent = topo_add_node_by_hwid(parent, node_id,
 			    topo_layers[layer].type,
 			    topo_layers[layer].subtype);
 		}
 	}
 
 	/*
 	 * Walk the path from the root to the boot CPU and call
 	 * topo_promote_child() on each node along it.
 	 */
 	parent = &topo_root;
 	for (layer = 0; layer < nlayers; ++layer) {
 		node_id = boot_cpu_id >> topo_layers[layer].id_shift;
 		node = topo_find_node_by_hwid(parent, node_id,
 		    topo_layers[layer].type,
 		    topo_layers[layer].subtype);
 		topo_promote_child(node);
 		parent = node;
 	}
 
 	cpu_topo_probed = 1;
 }
 
 /*
  * Assign logical CPU IDs to local APICs.
  *
  * Walks the processing-unit nodes of topo_root, marks hyperthreads and
  * disabled CPUs in cpu_info[], and numbers the remaining ones from zero.
  * On return mp_ncpus holds the number of CPUs that were assigned an ID
  * and disabled_cpus has been increased by the number skipped.
  */
 void
 assign_cpu_ids(void)
 {
 	struct topo_node *node;
 	u_int smt_mask;
 
 	/* Mask covering the APIC ID bits below the core level. */
 	smt_mask = (1u << core_id_shift) - 1;
 
 	/*
 	 * Assign CPU IDs to local APIC IDs and disable any CPUs
 	 * beyond MAXCPU.  CPU 0 is always assigned to the BSP.
 	 */
 	mp_ncpus = 0;
 	TOPO_FOREACH(node, &topo_root) {
 		if (node->type != TOPO_TYPE_PU)
 			continue;
 
 		/*
 		 * A unit whose low (SMT) bits differ from the boot CPU's
 		 * is flagged as a hyperthread.
 		 */
 		if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask))
 			cpu_info[node->hwid].cpu_hyperthread = 1;
 
 		/* Honor a "lapic" resource disable for any CPU but the BSP. */
 		if (resource_disabled("lapic", node->hwid)) {
 			if (node->hwid != boot_cpu_id)
 				cpu_info[node->hwid].cpu_disabled = 1;
 			else
 				printf("Cannot disable BSP, APIC ID = %d\n",
 				    node->hwid);
 		}
 
 		if (!hyperthreading_allowed &&
 		    cpu_info[node->hwid].cpu_hyperthread)
 			cpu_info[node->hwid].cpu_disabled = 1;
 
 		if (mp_ncpus >= MAXCPU)
 			cpu_info[node->hwid].cpu_disabled = 1;
 
 		if (cpu_info[node->hwid].cpu_disabled) {
 			disabled_cpus++;
 			continue;
 		}
 
 		/* Record the mapping in both directions and in the tree. */
 		cpu_apic_ids[mp_ncpus] = node->hwid;
 		apic_cpuids[node->hwid] = mp_ncpus;
 		topo_set_pu_id(node, mp_ncpus);
 		mp_ncpus++;
 	}
 
 	KASSERT(mp_maxid >= mp_ncpus - 1,
 	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
 	    mp_ncpus));
 }
 
 /*
  * Print various information about the SMP system hardware and setup.
  */
 void
 cpu_mp_announce(void)
 {
 	struct topo_node *node;
 	const char *hyperthread;
 	int pkg_count;
 	int cores_per_pkg;
 	int thrs_per_core;
 
 	printf("FreeBSD/SMP: ");
 	if (topo_analyze(&topo_root, 1, &pkg_count,
 	    &cores_per_pkg, &thrs_per_core)) {
 		printf("%d package(s)", pkg_count);
 		if (cores_per_pkg > 0)
 			printf(" x %d core(s)", cores_per_pkg);
 		if (thrs_per_core > 1)
 		    printf(" x %d hardware threads", thrs_per_core);
 	} else {
 		printf("Non-uniform topology");
 	}
 	printf("\n");
 
 	if (disabled_cpus) {
 		printf("FreeBSD/SMP Online: ");
 		if (topo_analyze(&topo_root, 0, &pkg_count,
 		    &cores_per_pkg, &thrs_per_core)) {
 			printf("%d package(s)", pkg_count);
 			if (cores_per_pkg > 0)
 				printf(" x %d core(s)", cores_per_pkg);
 			if (thrs_per_core > 1)
 			    printf(" x %d hardware threads", thrs_per_core);
 		} else {
 			printf("Non-uniform topology");
 		}
 		printf("\n");
 	}
 
 	if (!bootverbose)
 		return;
 
 	TOPO_FOREACH(node, &topo_root) {
 		switch (node->type) {
 		case TOPO_TYPE_PKG:
 			printf("Package HW ID = %u\n", node->hwid);
 			break;
 		case TOPO_TYPE_CORE:
 			printf("\tCore HW ID = %u\n", node->hwid);
 			break;
 		case TOPO_TYPE_PU:
 			if (cpu_info[node->hwid].cpu_hyperthread)
 				hyperthread = "/HT";
 			else
 				hyperthread = "";
 
 			if (node->subtype == 0)
 				printf("\t\tCPU (AP%s): APIC ID: %u"
 				    "(disabled)\n", hyperthread, node->hwid);
 			else if (node->id == 0)
 				printf("\t\tCPU0 (BSP): APIC ID: %u\n",
 				    node->hwid);
 			else
 				printf("\t\tCPU%u (AP%s): APIC ID: %u\n",
 				    node->id, hyperthread, node->hwid);
 			break;
 		default:
 			/* ignored */
 			break;
 		}
 	}
 }
 
 /*
  * Fill in the scheduling group cg_root from the hardware topology node
  * root, which must be the system node or a cache node.  A scheduling
  * group is a set of logical processors sharing a particular cache (and
  * thus having an affinity).
  *
  * Child groups are created for the outermost cache nodes below root that
  * do not simply cover the same processors as root; each child is filled
  * in by a recursive call.
  */
 static void
 x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root)
 {
 	struct topo_node *cur;
 	struct cpu_group *child;
 	int cache_cnt;
 	int core_cnt;
 	int idx;
 
 	KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE,
 	    ("x86topo_add_sched_group: bad type: %u", root->type));
 	CPU_COPY(&root->cpuset, &cg_root->cg_mask);
 	cg_root->cg_count = root->cpu_count;
 	if (root->type != TOPO_TYPE_SYSTEM)
 		cg_root->cg_level = root->subtype;
 	else
 		cg_root->cg_level = CG_SHARE_NONE;
 
 	/*
 	 * Count the core nodes below root without descending into them.
 	 * Several logical processors under fewer than two cores can only
 	 * be hardware threads.
 	 */
 	core_cnt = 0;
 	for (cur = root; cur != NULL; ) {
 		if (cur->type == TOPO_TYPE_CORE) {
 			core_cnt++;
 			cur = topo_next_nonchild_node(root, cur);
 		} else {
 			cur = topo_next_node(root, cur);
 		}
 	}
 
 	if (cg_root->cg_level != CG_SHARE_NONE &&
 	    root->cpu_count > 1 && core_cnt < 2)
 		cg_root->cg_flags = CG_FLAG_SMT;
 
 	/*
 	 * Count the top-level "non-redundant" caches below root: cache
 	 * nodes covering exactly root's processors are passed over, and
 	 * the walk does not go below a cache that has been counted.
 	 */
 	cache_cnt = 0;
 	for (cur = root; cur != NULL; ) {
 		if (cur->type == TOPO_TYPE_CACHE &&
 		    (root->type == TOPO_TYPE_SYSTEM ||
 		    CPU_CMP(&cur->cpuset, &root->cpuset) != 0)) {
 			cache_cnt++;
 			cur = topo_next_nonchild_node(root, cur);
 		} else {
 			cur = topo_next_node(root, cur);
 		}
 	}
 
 	cg_root->cg_child = smp_topo_alloc(cache_cnt);
 	cg_root->cg_children = cache_cnt;
 
 	/*
 	 * Repeat the same walk, this time building a child scheduling
 	 * group for each cache found.
 	 */
 	idx = 0;
 	for (cur = root; cur != NULL; ) {
 		if (cur->type == TOPO_TYPE_CACHE &&
 		    (root->type == TOPO_TYPE_SYSTEM ||
 		    CPU_CMP(&cur->cpuset, &root->cpuset) != 0)) {
 			child = &cg_root->cg_child[idx];
 			child->cg_parent = cg_root;
 			x86topo_add_sched_group(cur, child);
 			idx++;
 			cur = topo_next_nonchild_node(root, cur);
 		} else {
 			cur = topo_next_node(root, cur);
 		}
 	}
 }
 
 /*
  * Build the MI scheduling topology from the discovered hardware topology.
  * With at most one CPU the result of smp_topo_none() is returned instead.
  */
 struct cpu_group *
 cpu_topo(void)
 {
 	struct cpu_group *top;
 
 	if (mp_ncpus > 1) {
 		top = smp_topo_alloc(1);
 		x86topo_add_sched_group(&topo_root, top);
 		return (top);
 	}
 	return (smp_topo_none());
 }
 
 
 /*
  * Add a logical CPU to the topology.
  *
  * apic_id:  local APIC ID of the CPU being registered.  IDs above
  *           MAX_APIC_ID cause a panic; registering the same ID twice
  *           trips an assertion.
  * boot_cpu: non-zero if this CPU is the bootstrap processor (BSP).
  *
  * Marks the CPU present in cpu_info[], records the BSP's APIC ID in
  * boot_cpu_id, and bumps mp_ncpus/mp_maxid as long as fewer than MAXCPU
  * CPUs have been counted.
  */
 void
 cpu_add(u_int apic_id, char boot_cpu)
 {
 
 	/* panic() does not return, so nothing more is needed on this path. */
 	if (apic_id > MAX_APIC_ID)
 		panic("SMP: APIC ID %u too high", apic_id);
 	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %u added twice",
 	    apic_id));
 	cpu_info[apic_id].cpu_present = 1;
 	if (boot_cpu) {
 		KASSERT(boot_cpu_id == -1,
 		    ("CPU %u claims to be BSP, but CPU %d already is", apic_id,
 		    boot_cpu_id));
 		boot_cpu_id = apic_id;
 		cpu_info[apic_id].cpu_bsp = 1;
 	}
 	/* CPUs beyond MAXCPU stay marked present but are not counted. */
 	if (mp_ncpus < MAXCPU) {
 		mp_ncpus++;
 		mp_maxid = mp_ncpus - 1;
 	}
 	if (bootverbose)
 		printf("SMP: Added CPU %u (%s)\n", apic_id, boot_cpu ? "BSP" :
 		    "AP");
 }
 
 /*
  * Make sure the CPU count is sane: if mp_ncpus is still 0, fall back to
  * a single CPU.  mp_maxid is not modified here.
  */
 void
 cpu_mp_setmaxid(void)
 {
 
 	/*
 	 * mp_ncpus and mp_maxid should be already set by calls to cpu_add().
 	 * If there were no calls to cpu_add() assume this is a UP system.
 	 */
 	if (mp_ncpus == 0)
 		mp_ncpus = 1;
 }
 
 /*
  * Report whether this is a multiprocessor system.
  *
  * Returns non-zero when more than one CPU has been counted in mp_ncpus.
  * As a side effect, CPU 0 is recorded in all_cpus.
  */
 int
 cpu_mp_probe(void)
 {
 
 	/*
 	 * Always record BSP in CPU map so that the mbuf init code works
 	 * correctly.
 	 */
 	CPU_SETOF(0, &all_cpus);
 	return (mp_ncpus > 1);
 }
 
 /*
  * AP CPUs call this to initialize themselves.
  *
  * Sets up per-CPU hardware state (APIC mode, PAT, registers, FPU, MCA),
  * checks that the APIC ID stored in the per-CPU data matches the one
  * read from the local APIC, counts itself in smp_cpus while holding
  * ap_boot_mtx, spins until smp_started is set and finally hands the CPU
  * to the scheduler via sched_throw().  Control is not expected to return
  * from sched_throw(); if it does, the function panics.
  */
 void
 init_secondary_tail(void)
 {
 	u_int cpuid;
 
 	/*
 	 * On real hardware, switch to x2apic mode if possible.  Do it
 	 * after aps_ready was signalled, to avoid manipulating the
 	 * mode while BSP might still want to send some IPI to us
 	 * (second startup IPI is ignored on modern hardware etc).
 	 */
 	lapic_xapic_mode();
 
 	/* Initialize the PAT MSR. */
 	pmap_init_pat();
 
 	/* set up CPU registers and state */
 	cpu_setregs();
 
 	/* set up SSE/NX */
 	initializecpu();
 
 	/* set up FPU state on the AP */
 #ifdef __amd64__
 	fpuinit();
 #else
 	npxinit(false);
 #endif
 
 	/* Optional per-CPU init hook supplied through cpu_ops. */
 	if (cpu_ops.cpu_init)
 		cpu_ops.cpu_init();
 
 	/* A quick check from sanity claus */
 	cpuid = PCPU_GET(cpuid);
 	if (PCPU_GET(apic_id) != lapic_id()) {
 		printf("SMP: cpuid = %d\n", cpuid);
 		printf("SMP: actual apic_id = %d\n", lapic_id());
 		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
 		panic("cpuid mismatch! boom!!");
 	}
 
 	/* Initialize curthread. */
 	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
 	PCPU_SET(curthread, PCPU_GET(idlethread));
 
 	mca_init();
 
 	/* Everything up to the unlock below runs on one AP at a time. */
 	mtx_lock_spin(&ap_boot_mtx);
 
 	/* Init local apic for irq's */
 	lapic_setup(1);
 
 	/* Set memory range attributes for this CPU to match the BSP */
 	mem_range_AP_init();
 
 	smp_cpus++;
 
 	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid);
 	printf("SMP: AP CPU #%d Launched!\n", cpuid);
 
 	/* Determine if we are a logical CPU. */
 	if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread)
 		CPU_SET(cpuid, &logical_cpus_mask);
 
 	if (bootverbose)
 		lapic_dump("AP");
 
 	/*
 	 * Once the running count reaches mp_ncpus, set smp_started; that
 	 * releases every AP spinning in the wait loop further down.
 	 */
 	if (smp_cpus == mp_ncpus) {
 		/* enable IPI's, tlb shootdown, freezes etc */
 		atomic_store_rel_int(&smp_started, 1);
 	}
 
 #ifdef __amd64__
 	/*
 	 * Enable global pages TLB extension
 	 * This also implicitly flushes the TLB 
 	 */
 	load_cr4(rcr4() | CR4_PGE);
 	if (pmap_pcid_enabled)
 		load_cr4(rcr4() | CR4_PCIDE);
 	load_ds(_udatasel);
 	load_es(_udatasel);
 	load_fs(_ufssel);
 #endif
 
 	mtx_unlock_spin(&ap_boot_mtx);
 
 	/* Wait until all the AP's are up. */
 	while (atomic_load_acq_int(&smp_started) == 0)
 		ia32_pause();
 
 #ifndef EARLY_AP_STARTUP
 	/* Start per-CPU event timers. */
 	cpu_initclocks_ap();
 #endif
 
 	sched_throw(NULL);
 
 	panic("scheduler returned us to %s", __func__);
 	/* NOTREACHED */
 }
 
 /*******************************************************************
  * local functions and data
  */
 
 /*
  * Tell the I/O APIC code which CPUs should receive interrupts.  A CPU
  * that is left out here is simply never offered IRQs.  The BSP is
  * deliberately omitted as well: the I/O APIC code registers it on its
  * own so that UP kernels and UP machines keep working.
  */
 void
 set_interrupt_apic_ids(void)
 {
 	u_int cpu, id;
 
 	for (cpu = 0; cpu < MAXCPU; cpu++) {
 		id = cpu_apic_ids[cpu];
 
 		/* No APIC ID recorded for this CPU slot. */
 		if (id == -1)
 			continue;
 
 		/*
 		 * Skip the BSP and disabled CPUs, and don't let
 		 * hyperthreads service interrupts.
 		 */
 		if (!cpu_info[id].cpu_bsp && !cpu_info[id].cpu_disabled &&
 		    !cpu_info[id].cpu_hyperthread)
 			intr_add_cpu(cpu);
 	}
 }
 
 
 #ifdef COUNT_XINVLTLB_HITS
 /*
  * Optional TLB shootdown statistics, exported under the debug.xhits
  * sysctl node.  The xhits_* arrays are indexed by CPU id and bumped by
  * the invalidation IPI handlers; the ipi_* scalars are bumped by the
  * smp_masked_inv*() senders.
  */
 u_int xhits_gbl[MAXCPU];
 u_int xhits_pg[MAXCPU];
 u_int xhits_rng[MAXCPU];
 static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
     sizeof(xhits_gbl), "IU", "");
 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
     sizeof(xhits_pg), "IU", "");
 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
     sizeof(xhits_rng), "IU", "");
 
 /* Counts of shootdown requests issued, by kind, plus pages per range. */
 u_int ipi_global;
 u_int ipi_page;
 u_int ipi_range;
 u_int ipi_range_size;
 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
     0, "");
 #endif /* COUNT_XINVLTLB_HITS */
 
 /*
  * Init and startup IPI.
  *
  * Sends the INIT / INIT-deassert / STARTUP / STARTUP sequence to the CPU
  * whose local APIC ID is apic_id.  "vector" is OR-ed into the command of
  * both STARTUP IPIs.  Panics if either STARTUP IPI is not reported as
  * delivered by lapic_ipi_wait().
  */
 void
 ipi_startup(int apic_id, int vector)
 {
 
 	/*
 	 * This attempts to follow the algorithm described in the
 	 * Intel Multiprocessor Specification v1.4 in section B.4.
 	 * For each IPI, we allow the local APIC ~20us to deliver the
 	 * IPI.  If that times out, we panic.
 	 *
 	 * NOTE(review): the calls below pass 100 to lapic_ipi_wait(), and
 	 * the result of the wait after the INIT IPI is ignored, so the
 	 * "~20us" and "we panic" statements may not match the code for
 	 * every IPI -- confirm against lapic_ipi_wait().
 	 */
 
 	/*
 	 * first we do an INIT IPI: this INIT IPI might be run, resetting
 	 * and running the target CPU. OR this INIT IPI might be latched (P5
 	 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be
 	 * ignored.
 	 */
 	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
 	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
 	lapic_ipi_wait(100);
 
 	/* Explicitly deassert the INIT IPI. */
 	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
 	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT,
 	    apic_id);
 
 	DELAY(10000);		/* wait ~10mS */
 
 	/*
 	 * next we do a STARTUP IPI: the previous INIT IPI might still be
 	 * latched, (P5 bug) this 1st STARTUP would then terminate
 	 * immediately, and the previously started INIT IPI would continue. OR
 	 * the previous INIT IPI has already run. and this STARTUP IPI will
 	 * run. OR the previous INIT IPI was ignored. and this STARTUP IPI
 	 * will run.
 	 */
 	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
 	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
 	    vector, apic_id);
 	if (!lapic_ipi_wait(100))
 		panic("Failed to deliver first STARTUP IPI to APIC %d",
 		    apic_id);
 	DELAY(200);		/* wait ~200uS */
 
 	/*
 	 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
 	 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
 	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
 	 * recognized after hardware RESET or INIT IPI.
 	 */
 	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
 	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
 	    vector, apic_id);
 	if (!lapic_ipi_wait(100))
 		panic("Failed to deliver second STARTUP IPI to APIC %d",
 		    apic_id);
 
 	DELAY(200);		/* wait ~200uS */
 }
 
 /*
  * Send an IPI to specified CPU handling the bitmap logic.
  *
  * cpu is a logical CPU id (an index into cpu_apic_ids[]); ipi is the IPI
  * to deliver.  Bitmapped IPIs are recorded as a bit in
  * cpu_ipi_pending[cpu] and share the single IPI_BITMAP_VECTOR interrupt;
  * all other IPIs are sent directly with their own vector.
  */
 void
 ipi_send_cpu(int cpu, u_int ipi)
 {
 	u_int bitmap, old_pending, new_pending;
 
 	KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu));
 
 	if (IPI_IS_BITMAPED(ipi)) {
 		bitmap = 1 << ipi;
 		ipi = IPI_BITMAP_VECTOR;
 		/* Atomically OR our bit into the target's pending word. */
 		do {
 			old_pending = cpu_ipi_pending[cpu];
 			new_pending = old_pending | bitmap;
 		} while  (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
 		    old_pending, new_pending));	
 		/*
 		 * If bits were already pending, no new interrupt is sent;
 		 * presumably the earlier sender's interrupt is still
 		 * outstanding and will pick up our bit too.
 		 */
 		if (old_pending)
 			return;
 	}
 	lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
 }
 
 /*
  * Handler for the shared bitmap IPI vector.
  *
  * Atomically fetches and clears this CPU's cpu_ipi_pending word and
  * services every IPI whose bit was set: IPI_PREEMPT calls
  * sched_preempt(), IPI_AST needs no action here, and IPI_HARDCLOCK calls
  * hardclockintr().  The whole handler runs inside a critical section
  * with the thread's interrupt nesting level raised and td_intr_frame
  * temporarily pointing at the passed-in frame.
  */
 void
 ipi_bitmap_handler(struct trapframe frame)
 {
 	struct trapframe *oldframe;
 	struct thread *td;
 	int cpu = PCPU_GET(cpuid);
 	u_int ipi_bitmap;
 
 	critical_enter();
 	td = curthread;
 	td->td_intr_nesting_level++;
 	/* Remember the previous frame so it can be restored on exit. */
 	oldframe = td->td_intr_frame;
 	td->td_intr_frame = &frame;
 	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
 	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
 #ifdef COUNT_IPIS
 		(*ipi_preempt_counts[cpu])++;
 #endif
 		sched_preempt(td);
 	}
 	if (ipi_bitmap & (1 << IPI_AST)) {
 #ifdef COUNT_IPIS
 		(*ipi_ast_counts[cpu])++;
 #endif
 		/* Nothing to do for AST */
 	}
 	if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
 #ifdef COUNT_IPIS
 		(*ipi_hardclock_counts[cpu])++;
 #endif
 		hardclockintr();
 	}
 	td->td_intr_frame = oldframe;
 	td->td_intr_nesting_level--;
 	critical_exit();
 }
 
 /*
  * Send an IPI to every CPU contained in the set "cpus".
  */
 void
 ipi_selected(cpuset_t cpus, u_int ipi)
 {
 	int bit, target;
 
 	/*
 	 * IPI_STOP_HARD is delivered as an NMI, and the trap handler cannot
 	 * tell where an NMI came from.  Record the receiving CPUs in
 	 * ipi_stop_nmi_pending so that it can.
 	 */
 	if (ipi == IPI_STOP_HARD)
 		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus);
 
 	/* Peel CPUs off our private copy of the set, lowest first. */
 	for (;;) {
 		bit = CPU_FFS(&cpus);
 		if (bit == 0)
 			break;
 		target = bit - 1;
 		CPU_CLR(target, &cpus);
 		CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, target, ipi);
 		ipi_send_cpu(target, ipi);
 	}
 }
 
 /*
  * send an IPI to a specific CPU.
  *
  * cpu is the logical CPU id of the target; ipi is the IPI to deliver.
  * The actual delivery is done by ipi_send_cpu().
  */
 void
 ipi_cpu(int cpu, u_int ipi)
 {
 
 	/*
 	 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
 	 * of help in order to understand what is the source.
 	 * Set the mask of receiving CPUs for this purpose.
 	 */
 	if (ipi == IPI_STOP_HARD)
 		CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending);
 
 	CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
 	ipi_send_cpu(cpu, ipi);
 }
 
 /*
  * Send an IPI to all CPUs except the one we are running on.
  *
  * Bitmapped IPIs are handed to ipi_selected() with the set of all other
  * CPUs; every other IPI is sent once with the APIC_IPI_DEST_OTHERS
  * destination.
  */
 void
 ipi_all_but_self(u_int ipi)
 {
 	cpuset_t others;
 
 	others = all_cpus;
 	CPU_CLR(PCPU_GET(cpuid), &others);
 
 	if (!IPI_IS_BITMAPED(ipi)) {
 		/*
 		 * IPI_STOP_HARD is delivered as an NMI, and the trap handler
 		 * cannot tell where an NMI came from.  Record the receiving
 		 * CPUs in ipi_stop_nmi_pending so that it can.
 		 */
 		if (ipi == IPI_STOP_HARD)
 			CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &others);
 
 		CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
 		lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
 	} else {
 		ipi_selected(others, ipi);
 	}
 }
 
 int
 ipi_nmi_handler(void)
 {
 	u_int cpuid;
 
 	/*
 	 * As long as there is not a simple way to know about a NMI's
 	 * source, if the bitmask for the current CPU is present in
 	 * the global pending bitword an IPI_STOP_HARD has been issued
 	 * and should be handled.
 	 */
 	cpuid = PCPU_GET(cpuid);
 	if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending))
 		return (1);
 
 	CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending);
 	cpustop_handler();
 	return (0);
 }
 
 #ifdef DEV_ISA
 int nmi_kdb_lock;
 
 /*
  * SMP wrapper around nmi_call_kdb().
  *
  * nmi_kdb_lock is used so that one CPU at a time makes the
  * nmi_call_kdb() call.  A CPU that fails to take the lock immediately
  * saves its context in stoppcbs[], marks itself in stopped_cpus, spins
  * until it obtains the lock, and afterwards runs cpustop_handler_post()
  * instead of calling nmi_call_kdb() itself.
  */
-bool
-nmi_call_kdb_smp(u_int type, struct trapframe *frame, bool do_panic)
+void
+nmi_call_kdb_smp(u_int type, struct trapframe *frame)
 {
 	int cpu;
-	bool call_post, ret;
+	bool call_post;
 
 	cpu = PCPU_GET(cpuid);
 	if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) {
-		ret = nmi_call_kdb(cpu, type, frame, do_panic);
+		nmi_call_kdb(cpu, type, frame);
 		call_post = false;
 	} else {
-		ret = true;
 		savectx(&stoppcbs[cpu]);
 		CPU_SET_ATOMIC(cpu, &stopped_cpus);
 		while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1))
 			ia32_pause();
 		call_post = true;
 	}
 	/* Both paths hold the lock at this point; drop it. */
 	atomic_store_rel_int(&nmi_kdb_lock, 0);
 	if (call_post)
 		cpustop_handler_post(cpu);
-	return (ret);
 }
 #endif
 
 /*
  * Handle an IPI_STOP by saving our current context and spinning until we
  * are resumed.
  */
 void
 cpustop_handler(void)
 {
 	u_int cpu;
 
 	cpu = PCPU_GET(cpuid);
 
 	savectx(&stoppcbs[cpu]);
 
 	/* Indicate that we are stopped */
 	CPU_SET_ATOMIC(cpu, &stopped_cpus);
 
 	/* Wait for restart */
 	while (!CPU_ISSET(cpu, &started_cpus))
 	    ia32_pause();
 
 	cpustop_handler_post(cpu);
 }
 
 /*
  * Common tail for a CPU leaving the stopped state.
  *
  * Clears the CPU's bits in started_cpus and stopped_cpus.  On CPU 0
  * only, a pending cpustop_restartfunc hook is called once and then
  * reset to NULL.
  */
 static void
 cpustop_handler_post(u_int cpu)
 {
 
 	CPU_CLR_ATOMIC(cpu, &started_cpus);
 	CPU_CLR_ATOMIC(cpu, &stopped_cpus);
 
 #if defined(__amd64__) && defined(DDB)
 	amd64_db_resume_dbreg();
 #endif
 
 	if (cpu == 0 && cpustop_restartfunc != NULL) {
 		cpustop_restartfunc();
 		cpustop_restartfunc = NULL;
 	}
 }
 
 /*
  * Handle an IPI_SUSPEND by saving our current context and spinning until we
  * are resumed.
  *
  * Must be called without smp_ipi_mtx held.  The savectx() return value
  * selects the path: non-zero right after saving the context (suspend
  * path), zero when execution comes back through the saved context
  * (resume path).
  */
 void
 cpususpend_handler(void)
 {
 	u_int cpu;
 
 	mtx_assert(&smp_ipi_mtx, MA_NOTOWNED);
 
 	cpu = PCPU_GET(cpuid);
 	if (savectx(&susppcbs[cpu]->sp_pcb)) {
 		/* Suspend path: save FPU state, flush caches, report in. */
 #ifdef __amd64__
 		fpususpend(susppcbs[cpu]->sp_fpususpend);
 #else
 		npxsuspend(susppcbs[cpu]->sp_fpususpend);
 #endif
 		wbinvd();
 		CPU_SET_ATOMIC(cpu, &suspended_cpus);
 	} else {
 		/* Resume path: restore FPU state and basic CPU setup. */
 #ifdef __amd64__
 		fpuresume(susppcbs[cpu]->sp_fpususpend);
 #else
 		npxresume(susppcbs[cpu]->sp_fpususpend);
 #endif
 		pmap_init_pat();
 		initializecpu();
 		PCPU_SET(switchtime, 0);
 		PCPU_SET(switchticks, ticks);
 
 		/* Indicate that we are resumed */
 		CPU_CLR_ATOMIC(cpu, &suspended_cpus);
 	}
 
 	/* Wait for resume */
 	while (!CPU_ISSET(cpu, &started_cpus))
 		ia32_pause();
 
 	if (cpu_ops.cpu_resume)
 		cpu_ops.cpu_resume();
 #ifdef __amd64__
 	if (vmm_resume_p)
 		vmm_resume_p();
 #endif
 
 	/* Resume MCA and local APIC */
 	lapic_xapic_mode();
 	mca_resume();
 	lapic_setup(0);
 
 	/*
 	 * Indicate that we are resumed
 	 * NOTE(review): suspended_cpus was already cleared in the resume
 	 * branch above; this second clear looks redundant on that path --
 	 * confirm before changing.
 	 */
 	CPU_CLR_ATOMIC(cpu, &suspended_cpus);
 	CPU_CLR_ATOMIC(cpu, &started_cpus);
 }
 
 
 /*
  * IPI handler that writes back and invalidates this CPU's caches with
  * wbinvd, then publishes the generation number it read in the per-CPU
  * smp_tlb_done field so the initiator can see that it has finished.
  */
 void
 invlcache_handler(void)
 {
 	uint32_t generation;
 
 #ifdef COUNT_IPIS
 	(*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
 	/*
 	 * Reading the generation here allows greater parallelism
 	 * since wbinvd is a serializing instruction.  Without the
 	 * temporary, we'd wait for wbinvd to complete, then the read
-	 * would execute, then the dependent write, whuch must then
+	 * would execute, then the dependent write, which must then
 	 * complete before return from interrupt.
 	 */
 	generation = smp_tlb_generation;
 	wbinvd();
 	PCPU_SET(smp_tlb_done, generation);
 }
 
 /*
  * Run as a SYSINIT once the rest of the system is up: let the APs out
  * of the pen by setting aps_ready, then wait for smp_started to become
  * non-zero.  Nothing is done on a single-CPU system.
  */
 static void
 release_aps(void *dummy __unused)
 {
 
 	if (mp_ncpus != 1) {
 		atomic_store_rel_int(&aps_ready, 1);
 		while (smp_started == 0)
 			ia32_pause();
 	}
 }
 SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
 
 #ifdef COUNT_IPIS
 /*
  * Setup interrupt counters for IPI handlers.
  *
  * For every CPU visited by CPU_FOREACH, registers one named counter per
  * IPI type with intrcnt_add(); the names have the form "cpu<N>:<ipi>".
  * The dummy argument is unused.
  */
 static void
 mp_ipi_intrcnt(void *dummy)
 {
 	char buf[64];
 	int i;
 
 	CPU_FOREACH(i) {
 		snprintf(buf, sizeof(buf), "cpu%d:invltlb", i);
 		intrcnt_add(buf, &ipi_invltlb_counts[i]);
 		snprintf(buf, sizeof(buf), "cpu%d:invlrng", i);
 		intrcnt_add(buf, &ipi_invlrng_counts[i]);
 		snprintf(buf, sizeof(buf), "cpu%d:invlpg", i);
 		intrcnt_add(buf, &ipi_invlpg_counts[i]);
 		snprintf(buf, sizeof(buf), "cpu%d:invlcache", i);
 		intrcnt_add(buf, &ipi_invlcache_counts[i]);
 		snprintf(buf, sizeof(buf), "cpu%d:preempt", i);
 		intrcnt_add(buf, &ipi_preempt_counts[i]);
 		snprintf(buf, sizeof(buf), "cpu%d:ast", i);
 		intrcnt_add(buf, &ipi_ast_counts[i]);
 		snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i);
 		intrcnt_add(buf, &ipi_rendezvous_counts[i]);
 		snprintf(buf, sizeof(buf), "cpu%d:hardclock", i);
 		intrcnt_add(buf, &ipi_hardclock_counts[i]);
 	}		
 }
 SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
 #endif
 
 /*
  * Flush the TLB on other CPUs
  */
 
 /*
  * Variables needed for SMP tlb shootdown.
  *
  * smp_targeted_tlb_shootdown() fills these in while holding smp_ipi_mtx
  * and bumps smp_tlb_generation for every request.  The IPI handlers read
  * the addresses/pmap and copy the generation into their per-CPU
  * smp_tlb_done field when they are finished.
  */
 static vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
 pmap_t smp_tlb_pmap;
 volatile uint32_t smp_tlb_generation;
 
 /* On amd64 the flags register is read with read_rflags(). */
 #ifdef __amd64__
 #define	read_eflags() read_rflags()
 #endif
 
 /*
  * Send an invalidation IPI to a set of CPUs and wait for all of them to
  * finish.
  *
  * mask:   target CPUs.  A full set means "all other CPUs"; otherwise the
  *         current CPU is removed from the set first.
  * vector: IPI to send.
  * pmap, addr1, addr2: request parameters published to the handlers via
  *         smp_tlb_pmap, smp_tlb_addr1 and smp_tlb_addr2.
  *
  * Returns immediately if there is no other CPU to notify.  Panics if
  * called with interrupts disabled.  The request is issued and awaited
  * with smp_ipi_mtx held.
  */
 static void
 smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
     vm_offset_t addr1, vm_offset_t addr2)
 {
 	cpuset_t other_cpus;
 	volatile uint32_t *p_cpudone;
 	uint32_t generation;
 	int cpu;
 
 	/*
 	 * Check for other cpus.  Return if none.
 	 */
 	if (CPU_ISFULLSET(&mask)) {
 		if (mp_ncpus <= 1)
 			return;
 	} else {
 		CPU_CLR(PCPU_GET(cpuid), &mask);
 		if (CPU_EMPTY(&mask))
 			return;
 	}
 
 	if (!(read_eflags() & PSL_I))
 		panic("%s: interrupts disabled", __func__);
 	mtx_lock_spin(&smp_ipi_mtx);
 	/* Publish the request before any IPI is sent. */
 	smp_tlb_addr1 = addr1;
 	smp_tlb_addr2 = addr2;
 	smp_tlb_pmap = pmap;
 	generation = ++smp_tlb_generation;
 	if (CPU_ISFULLSET(&mask)) {
 		ipi_all_but_self(vector);
 		other_cpus = all_cpus;
 		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
 	} else {
 		/* Keep a copy of the targets; the loop below consumes mask. */
 		other_cpus = mask;
 		while ((cpu = CPU_FFS(&mask)) != 0) {
 			cpu--;
 			CPU_CLR(cpu, &mask);
 			CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__,
 			    cpu, vector);
 			ipi_send_cpu(cpu, vector);
 		}
 	}
 	/* Wait until every target has reported the current generation. */
 	while ((cpu = CPU_FFS(&other_cpus)) != 0) {
 		cpu--;
 		CPU_CLR(cpu, &other_cpus);
 		p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done;
 		while (*p_cpudone != generation)
 			ia32_pause();
 	}
 	mtx_unlock_spin(&smp_ipi_mtx);
 }
 
 /*
  * Ask the CPUs in "mask" to invalidate their TLBs on behalf of "pmap".
  * Does nothing until smp_started is set.
  */
 void
 smp_masked_invltlb(cpuset_t mask, pmap_t pmap)
 {
 
 	if (!smp_started)
 		return;
 
 	smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0);
 #ifdef COUNT_XINVLTLB_HITS
 	ipi_global++;
 #endif
 }
 
 /*
  * Ask the CPUs in "mask" to invalidate the TLB entry for the page at
  * "addr".  Does nothing until smp_started is set.
  */
 void
 smp_masked_invlpg(cpuset_t mask, vm_offset_t addr)
 {
 
 	if (!smp_started)
 		return;
 
 	smp_targeted_tlb_shootdown(mask, IPI_INVLPG, NULL, addr, 0);
 #ifdef COUNT_XINVLTLB_HITS
 	ipi_page++;
 #endif
 }
 
 /*
  * Ask the CPUs in "mask" to invalidate the TLB entries for the pages
  * from addr1 up to addr2.  Does nothing until smp_started is set.
  */
 void
 smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2)
 {
 
 	if (!smp_started)
 		return;
 
 	smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, NULL,
 	    addr1, addr2);
 #ifdef COUNT_XINVLTLB_HITS
 	ipi_range++;
 	ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
 #endif
 }
 
 /*
  * Send IPI_INVLCACHE to the CPUs in all_cpus through the shootdown
  * machinery.  Does nothing until smp_started is set.
  */
 void
 smp_cache_flush(void)
 {
 
 	if (!smp_started)
 		return;
 
 	smp_targeted_tlb_shootdown(all_cpus, IPI_INVLCACHE, NULL,
 	    0, 0);
 }
 
 /*
  * Handlers for TLB related IPIs
  */
 /*
  * IPI_INVLTLB handler: invalidate this CPU's TLB.
  *
  * Uses invltlb_glob() when the request is for kernel_pmap and invltlb()
  * otherwise, then reports completion by storing the generation number in
  * the per-CPU smp_tlb_done field.
  */
 void
 invltlb_handler(void)
 {
 	uint32_t generation;
   
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_gbl[PCPU_GET(cpuid)]++;
 #endif /* COUNT_XINVLTLB_HITS */
 #ifdef COUNT_IPIS
 	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
 	/*
 	 * Reading the generation here allows greater parallelism
 	 * since invalidating the TLB is a serializing operation.
 	 */
 	generation = smp_tlb_generation;
 	if (smp_tlb_pmap == kernel_pmap)
 		invltlb_glob();
 	else
 		invltlb();
 	PCPU_SET(smp_tlb_done, generation);
 }
 
 /*
  * IPI_INVLPG handler: invalidate the TLB entry for the single page at
  * smp_tlb_addr1, then report completion by storing the generation number
  * in the per-CPU smp_tlb_done field.
  */
 void
 invlpg_handler(void)
 {
 	uint32_t generation;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_pg[PCPU_GET(cpuid)]++;
 #endif /* COUNT_XINVLTLB_HITS */
 #ifdef COUNT_IPIS
 	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
 	generation = smp_tlb_generation;	/* Overlap with serialization */
 	invlpg(smp_tlb_addr1);
 	PCPU_SET(smp_tlb_done, generation);
 }
 
 /*
  * IPI_INVLRNG handler: invalidate the TLB entries for the pages starting
  * at smp_tlb_addr1, stepping by PAGE_SIZE while the address is below
  * smp_tlb_addr2, then report completion by storing the generation number
  * in the per-CPU smp_tlb_done field.
  */
 void
 invlrng_handler(void)
 {
 	vm_offset_t addr, addr2;
 	uint32_t generation;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_rng[PCPU_GET(cpuid)]++;
 #endif /* COUNT_XINVLTLB_HITS */
 #ifdef COUNT_IPIS
 	(*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
 	/* Snapshot the request before starting to invalidate. */
 	addr = smp_tlb_addr1;
 	addr2 = smp_tlb_addr2;
 	generation = smp_tlb_generation;	/* Overlap with serialization */
 	/* do/while: the first page is always invalidated. */
 	do {
 		invlpg(addr);
 		addr += PAGE_SIZE;
 	} while (addr < addr2);
 
 	PCPU_SET(smp_tlb_done, generation);
 }
Index: user/alc/PQ_LAUNDRY/usr.sbin/mountd/mountd.8
===================================================================
--- user/alc/PQ_LAUNDRY/usr.sbin/mountd/mountd.8	(revision 307895)
+++ user/alc/PQ_LAUNDRY/usr.sbin/mountd/mountd.8	(revision 307896)
@@ -1,198 +1,198 @@
 .\" Copyright (c) 1989, 1991, 1993
 .\"	The Regents of the University of California.  All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\" 4. Neither the name of the University nor the names of its contributors
 .\"    may be used to endorse or promote products derived from this software
 .\"    without specific prior written permission.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\"     @(#)mountd.8	8.4 (Berkeley) 4/28/95
 .\" $FreeBSD$
 .\"
-.Dd October 14, 2012
+.Dd October 24, 2016
 .Dt MOUNTD 8
 .Os
 .Sh NAME
 .Nm mountd
 .Nd service remote
 .Tn NFS
 mount requests
 .Sh SYNOPSIS
 .Nm
 .Op Fl 2delnrS
 .Op Fl h Ar bindip
 .Op Fl p Ar port
 .Op Ar exportsfile ...
 .Sh DESCRIPTION
 The
 .Nm
 utility is the server for
 .Tn NFS
 mount requests from other client machines.
 It listens for service requests at the port indicated in the
 .Tn NFS
 server specification; see
 .%T "Network File System Protocol Specification" ,
 RFC1094, Appendix A and
 .%T "NFS: Network File System Version 3 Protocol Specification" ,
 Appendix I.
 .Pp
 The following options are available:
 .Bl -tag -width indent
 .It Fl 2
 Allow the administrator to force clients to use only the
 version 2
 .Tn NFS
 protocol to mount file systems from this server.
 .It Fl d
 Output debugging information.
 .Nm
 will not detach from the controlling terminal and will print
 debugging messages to stderr.
 .It Fl e
 Ignored; included for backward compatibility.
 .It Fl h Ar bindip
 Specify specific IP addresses to bind to for TCP and UDP requests.
 This option may be specified multiple times.
 If no
 .Fl h
 option is specified,
 .Nm
 will bind to
 .Dv INADDR_ANY .
 Note that when specifying IP addresses with
 .Fl h ,
 .Nm
 will automatically add
 .Li 127.0.0.1
 and if IPv6 is enabled,
 .Li ::1
 to the list.
 .It Fl l
 Cause all successful
 .Nm
 requests to be logged.
 .It Fl n
 Allow non-root mount requests to be served.
 This should only be specified if there are clients, such as PCs,
 that require it.
-It will automatically clear the vfs.nfsrv.nfs_privport sysctl flag, which
+It will automatically clear the vfs.nfsd.nfs_privport sysctl flag, which
 controls if the kernel will accept NFS requests from reserved ports only.
 .It Fl p Ar port
 Force
 .Nm
 to bind to the specified port, for both
 .Dv AF_INET
 and
 .Dv AF_INET6
 address families.
 This is typically done to ensure that the port which
 .Nm
 binds to is a known quantity which can be used in firewall rulesets.
 If
 .Nm
 cannot bind to this port, an appropriate error will be recorded in
 the system log, and the daemon will then exit.
 .It Fl r
 Allow mount RPC requests for regular files to be served.
 Although this seems to violate the mount protocol specification,
 some diskless workstations do mount requests for
 their swapfiles and expect them to be regular files.
 Since a regular file cannot be specified in
 .Pa /etc/exports ,
 the entire file system in which the swapfiles reside
 will have to be exported with the
 .Fl alldirs
 flag.
 .It Ar exportsfile
 Specify an alternate location
 for the exports file.
 More than one exports file can be specified.
 .It Fl S
 Tell mountd to suspend/resume execution of the nfsd threads whenever
 the exports list is being reloaded.
 This avoids intermittent access
 errors for clients that do NFS RPCs while the exports are being
 reloaded, but introduces a delay in RPC response while the reload
 is in progress.
 If
 .Nm
 crashes while an exports load is in progress,
 .Nm
 must be restarted to get the nfsd threads running again, if this
 option is used.
 .El
 .Pp
 When
 .Nm
 is started,
 it loads the export host addresses and options into the kernel
 using the
 .Xr mount 2
 system call.
 After changing the exports file,
 a hangup signal should be sent to the
 .Nm
 daemon
 to get it to reload the export information.
 After sending the SIGHUP
 (kill \-s HUP `cat /var/run/mountd.pid`),
 check the syslog output to see if
 .Nm
 logged any parsing
 errors in the exports file.
 .Pp
 If
 .Nm
 detects that the running kernel does not include
 .Tn NFS
 support, it will attempt to load a loadable kernel module containing
 .Tn NFS
 code, using
 .Xr kldload 2 .
 If this fails, or no
 .Tn NFS
 KLD was available,
 .Nm
 exits with an error.
 .Sh FILES
 .Bl -tag -width /var/run/mountd.pid -compact
 .It Pa /etc/exports
 the list of exported file systems
 .It Pa /var/run/mountd.pid
 the pid of the currently running mountd
 .It Pa /var/db/mountdtab
 the current list of remote mounted file systems
 .El
 .Sh SEE ALSO
 .Xr nfsstat 1 ,
 .Xr kldload 2 ,
 .Xr nfsv4 4 ,
 .Xr exports 5 ,
 .Xr nfsd 8 ,
 .Xr rpcbind 8 ,
 .Xr showmount 8
 .Sh HISTORY
 The
 .Nm
 utility first appeared in
 .Bx 4.4 .
Index: user/alc/PQ_LAUNDRY/usr.sbin/mountd/mountd.c
===================================================================
--- user/alc/PQ_LAUNDRY/usr.sbin/mountd/mountd.c	(revision 307895)
+++ user/alc/PQ_LAUNDRY/usr.sbin/mountd/mountd.c	(revision 307896)
@@ -1,3290 +1,3290 @@
 /*
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Herb Hasler and Rick Macklem at The University of Guelph.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #ifndef lint
 static const char copyright[] =
 "@(#) Copyright (c) 1989, 1993\n\
 	The Regents of the University of California.  All rights reserved.\n";
 #endif /*not lint*/
 
 #if 0
 #ifndef lint
 static char sccsid[] = "@(#)mountd.c	8.15 (Berkeley) 5/1/95";
 #endif /*not lint*/
 #endif
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/fcntl.h>
 #include <sys/linker.h>
 #include <sys/module.h>
 #include <sys/mount.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 
 #include <rpc/rpc.h>
 #include <rpc/rpc_com.h>
 #include <rpc/pmap_clnt.h>
 #include <rpc/pmap_prot.h>
 #include <rpcsvc/mount.h>
 #include <nfs/nfsproto.h>
 #include <nfs/nfssvc.h>
 #include <nfsserver/nfs.h>
 
 #include <fs/nfs/nfsport.h>
 
 #include <arpa/inet.h>
 
 #include <ctype.h>
 #include <err.h>
 #include <errno.h>
 #include <grp.h>
 #include <libutil.h>
 #include <limits.h>
 #include <netdb.h>
 #include <pwd.h>
 #include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 #include "pathnames.h"
 #include "mntopts.h"
 
 #ifdef DEBUG
 #include <stdarg.h>
 #endif
 
 /*
  * Structures for keeping the mount list and export list
  */
 /*
  * One entry of the mount list whose head is the file-scope mlhead.
  * xdr_mlist() walks the list and encodes ml_host with xdr_string().
  */
 struct mountlist {
 	struct mountlist *ml_next;	/* Next entry in the list */
 	char	ml_host[MNTNAMLEN+1];	/* Host string, up to MNTNAMLEN chars */
 	char	ml_dirp[MNTPATHLEN+1];	/* Directory path, up to MNTPATHLEN chars */
 };
 
 /*
  * Node describing one exported directory.
  * NOTE(review): dp_left/dp_right suggest a binary tree keyed on dp_dirp;
  * confirm against dirp_search() and add_dlist().
  */
 struct dirlist {
 	struct dirlist	*dp_left;	/* Left child */
 	struct dirlist	*dp_right;	/* Right child */
 	int		dp_flag;	/* DP_xx bits, see below */
 	struct hostlist	*dp_hosts;	/* List of hosts this dir exported to */
 	char		dp_dirp[1];	/* Actually malloc'd to size of dir */
 };
 /* dp_flag bits */
 #define	DP_DEFSET	0x1
 #define DP_HOSTSET	0x2
 
 /*
  * One entry of the export list (head: exphead).  mntsrv() obtains an
  * entry from ex_search() using a file system id and then checks the
  * caller against ex_defdir and ex_dirl.
  */
 struct exportlist {
 	struct exportlist *ex_next;	/* Next entry in the list */
 	struct dirlist	*ex_dirl;	/* Directories, searched with dirp_search() */
 	struct dirlist	*ex_defdir;	/* Checked first by mntsrv() via chk_host() */
 	int		ex_flag;	/* EX_xx bits, see below */
 	fsid_t		ex_fs;		/* File system id */
 	char		*ex_fsdir;
 	char		*ex_indexfile;
 	int		ex_numsecflavors;	/* Entries used in ex_secflavors */
 	int		ex_secflavors[MAXSECFLAVORS];
 	/*
 	 * Flavors returned by mntsrv() when the match did not set
 	 * DP_HOSTSET.
 	 */
 	int		ex_defnumsecflavors;
 	int		ex_defsecflavors[MAXSECFLAVORS];
 };
 /* ex_flag bits */
 #define	EX_LINKED	0x1
 
 /*
  * A network, stored as an address/mask pair plus a name string.
  * NOTE(review): presumably filled in by get_net() — confirm.
  */
 struct netmsk {
 	struct sockaddr_storage nt_net;		/* Network address */
 	struct sockaddr_storage nt_mask;	/* Network mask */
 	char		*nt_name;		/* Network name */
 };
 
 /*
  * Payload of a grouplist entry: either an addrinfo pointer or a netmsk.
  * NOTE(review): presumably selected by gr_type (GT_HOST vs. GT_NET) —
  * confirm against the users of struct grouplist.
  */
 union grouptypes {
 	struct addrinfo *gt_addrinfo;
 	struct netmsk	gt_net;
 };
 
 /*
  * One entry of a group list (the file-scope head is grphead), carrying
  * its own list of security flavors.
  */
 struct grouplist {
 	int gr_type;			/* One of the GT_xx values below */
 	union grouptypes gr_ptr;	/* Host or network payload */
 	struct grouplist *gr_next;	/* Next entry in the list */
 	int gr_numsecflavors;		/* Entries used in gr_secflavors */
 	int gr_secflavors[MAXSECFLAVORS];
 };
 /* Group types */
 #define	GT_NULL		0x0
 #define	GT_HOST		0x1
 #define	GT_NET		0x2
 #define	GT_DEFAULT	0x3
 #define GT_IGNORE	0x5
 
 /*
  * One entry of the host list hanging off dirlist.dp_hosts; each entry
  * points at a grouplist and carries its own flag word.
  */
 struct hostlist {
 	int		 ht_flag;	/* Uses DP_xx bits */
 	struct grouplist *ht_grp;	/* Group this entry refers to */
 	struct hostlist	 *ht_next;	/* Next entry in the list */
 };
 
 /*
  * Reply to a successful MOUNTPROC_MNT request.  mntsrv() fills it in and
  * xdr_fhs() encodes it.
  */
 struct fhreturn {
 	int	fhr_flag;		/* hostset or defset from chk_host() */
 	int	fhr_vers;		/* rq_vers of the request; xdr_fhs() handles 1 and 3 */
 	nfsfh_t	fhr_fh;			/* File handle obtained with getfh() */
 	int	fhr_numsecflavors;	/* Entries in fhr_secflavors; 0 makes xdr_fhs() send AUTH_SYS */
 	int	*fhr_secflavors;	/* Security flavors sent for version 3 */
 };
 
 #define	GETPORT_MAXTRY	20	/* Max tries to get a port # */
 
 /* Global defs */
 static char	*add_expdir(struct dirlist **, char *, int);
 static void	add_dlist(struct dirlist **, struct dirlist *,
 		    struct grouplist *, int, struct exportlist *);
 static void	add_mlist(char *, char *);
 static int	check_dirpath(char *);
 static int	check_options(struct dirlist *);
 static int	checkmask(struct sockaddr *sa);
 static int	chk_host(struct dirlist *, struct sockaddr *, int *, int *,
 		    int *, int **);
 static char	*strsep_quote(char **stringp, const char *delim);
 static int	create_service(struct netconfig *nconf);
 static void	complete_service(struct netconfig *nconf, char *port_str);
 static void	clearout_service(void);
 static void	del_mlist(char *hostp, char *dirp);
 static struct dirlist	*dirp_search(struct dirlist *, char *);
 static int	do_mount(struct exportlist *, struct grouplist *, int,
 		    struct xucred *, char *, int, struct statfs *);
 static int	do_opt(char **, char **, struct exportlist *,
 		    struct grouplist *, int *, int *, struct xucred *);
 static struct exportlist	*ex_search(fsid_t *);
 static struct exportlist	*get_exp(void);
 static void	free_dir(struct dirlist *);
 static void	free_exp(struct exportlist *);
 static void	free_grp(struct grouplist *);
 static void	free_host(struct hostlist *);
 static void	get_exportlist(void);
 static int	get_host(char *, struct grouplist *, struct grouplist *);
 static struct hostlist *get_ht(void);
 static int	get_line(void);
 static void	get_mountlist(void);
 static int	get_net(char *, struct netmsk *, int);
 static void	getexp_err(struct exportlist *, struct grouplist *);
 static struct grouplist	*get_grp(void);
 static void	hang_dirp(struct dirlist *, struct grouplist *,
 				struct exportlist *, int);
 static void	huphandler(int sig);
 static int	makemask(struct sockaddr_storage *ssp, int bitlen);
 static void	mntsrv(struct svc_req *, SVCXPRT *);
 static void	nextfield(char **, char **);
 static void	out_of_mem(void);
 static void	parsecred(char *, struct xucred *);
 static int	parsesec(char *, struct exportlist *);
 static int	put_exlist(struct dirlist *, XDR *, struct dirlist *,
 		    int *, int);
 static void	*sa_rawaddr(struct sockaddr *sa, int *nbytes);
 static int	sacmp(struct sockaddr *sa1, struct sockaddr *sa2,
 		    struct sockaddr *samask);
 static int	scan_tree(struct dirlist *, struct sockaddr *);
 static void	usage(void);
 static int	xdr_dir(XDR *, char *);
 static int	xdr_explist(XDR *, caddr_t);
 static int	xdr_explist_brief(XDR *, caddr_t);
 static int	xdr_explist_common(XDR *, caddr_t, int);
 static int	xdr_fhs(XDR *, caddr_t);
 static int	xdr_mlist(XDR *, caddr_t);
 static void	terminate(int);
 
 /* List heads; main() resets all three to NULL before loading anything. */
 static struct exportlist *exphead;
 static struct mountlist *mlhead;
 static struct grouplist *grphead;
 /* Exports file names: the non-option arguments, or exnames_default. */
 static char *exnames_default[2] = { _PATH_EXPORTS, NULL };
 static char **exnames;
 /* Bind addresses collected from -h; main() adds "*" or the loopbacks. */
 static char **hosts = NULL;
 /* NOTE(review): presumably the default anonymous credential — confirm use. */
 static struct xucred def_anon = {
 	XUCRED_VERSION,
 	(uid_t)-2,
 	1,
 	{ (gid_t)-2 },
 	NULL
 };
 static int force_v2 = 0;	/* -2: complete_service() skips MOUNTVERS3 */
 static int resvport_only = 1;	/* Cleared by -n; gates the reserved port checks in mntsrv() */
 static int nhosts = 0;		/* Number of entries in hosts */
 static int dir_only = 1;	/* Cleared by -r; mntsrv() then also accepts regular files */
 static int dolog = 0;		/* -l: log successful requests */
 static int got_sighup = 0;	/* When set, main()'s loop re-runs get_exportlist() */
 static int xcreated = 0;	/* Incremented by complete_service(); 0 is fatal in main() */
 
 static char *svcport_str = NULL;	/* Service port string (-p or chosen at bind time) */
 static int mallocd_svcport = 0;	/* Set when create_service() allocated svcport_str */
 static int *sock_fd;		/* Descriptors created by create_service(), -1 when unused */
 static int sock_fdcnt;		/* Number of entries in sock_fd */
 static int sock_fdpos;		/* Next sock_fd entry consumed by complete_service() */
 static int suspend_nfsd = 0;	/* -S */
 
 static int opt_flags;		/* OP_xx bits */
 static int have_v6 = 1;	/* Cleared by main() when an AF_INET6 socket cannot be created */
 
 static int v4root_phase = 0;
 static char v4root_dirpath[PATH_MAX + 1];
 static int has_publicfh = 0;
 
 static struct pidfh *pfh = NULL;	/* Pidfile handle from pidfile_open() */
 /* Bits for opt_flags above */
 #define	OP_MAPROOT	0x01
 #define	OP_MAPALL	0x02
 /* 0x4 free */
 #define	OP_MASK		0x08
 #define	OP_NET		0x10
 #define	OP_ALLDIRS	0x40
 #define	OP_HAVEMASK	0x80	/* A mask was specified or inferred. */
 #define	OP_QUIET	0x100
 #define OP_MASKLEN	0x200
 #define OP_SEC		0x400
 
 #ifdef DEBUG
 static int debug = 1;
 static void	SYSLOG(int, const char *, ...) __printflike(2, 3);
 #define syslog SYSLOG
 #else
 static int debug = 0;
 #endif
 
 /*
  * Similar to strsep(), but it allows for quoted strings
  * and escaped characters.
  *
  * stringp: in/out cursor.  On return it points just past the delimiter
  *          that ended the token, or is set to NULL when the end of the
  *          string was reached.
  * delim:   set of delimiter characters.
  *
  * It returns the token (or NULL, if stringp or *stringp is NULL),
  * which is a de-quoted version of the string if necessary.
  *
  * It modifies the string *stringp points to in place.
  */
 static char *
 strsep_quote(char **stringp, const char *delim)
 {
 	char *srcptr, *dstptr, *retval;
 	char quot = 0;
 
 	if (stringp == NULL || *stringp == NULL)
 		return (NULL);
 
 	srcptr = dstptr = retval = *stringp;
 
 	while (*srcptr) {
 		/*
 		 * We're looking for several edge cases here.
 		 * First:  if the current character is a backslash,
 		 * we take the next character as-is, without checking
 		 * for delim, quote, or backslash.  Exception:  if the
 		 * next character is a NUL, that's the end of the string.
 		 * Second:  if the character is a quote character, we toggle
 		 * quote state; while in quote state (quot != 0) the delim
 		 * characters are copied like any other character.
 		 * Otherwise:  end the token if the character is in delim,
 		 * else copy it.
 		 */
 		if (*srcptr == '\\') {
 			srcptr++;
 			/*
 			 * The edge case here is if the next character
 			 * is NUL, we want to stop processing.  But if
 			 * it's not NUL, then we simply want to copy it.
 			 */
 			if (*srcptr) {
 				*dstptr++ = *srcptr++;
 			}
 			continue;
 		}
 		if (quot == 0 && (*srcptr == '\'' || *srcptr == '"')) {
 			quot = *srcptr++;
 			continue;
 		}
 		if (quot && *srcptr == quot) {
 			/* End of the quoted part */
 			quot = 0;
 			srcptr++;
 			continue;
 		}
 		if (!quot && strchr(delim, *srcptr))
 			break;
 		*dstptr++ = *srcptr++;
 	}
 
 	/*
 	 * Work out where the next token starts BEFORE writing the
 	 * terminator.  When nothing was de-quoted, dstptr == srcptr, so
 	 * the terminator lands on the delimiter itself; testing *srcptr
 	 * afterwards would mistake that for the end of the string and
 	 * drop everything after the first token.
 	 */
 	*stringp = (*srcptr == '\0') ? NULL : srcptr + 1;
 	*dstptr = 0; /* Terminate the string */
 	return (retval);
 }
 
 /*
  * Mountd server for NFS mount protocol as described in:
  * NFS: Network File System Protocol Specification, RFC1094, Appendix A
  * The optional arguments are the exports file name
  * default: _PATH_EXPORTS
  * and "-n" to allow nonroot mount.
  *
  * Startup sequence: take the pidfile, parse the options, make sure the
  * "nfsd" kernel module is present, load the export and mount lists,
  * daemonize unless debugging, bind the sockets with create_service(),
  * finish the RPC setup with complete_service(), and then serve requests
  * from a hand-expanded svc_run() loop so that get_exportlist() can be
  * re-run whenever got_sighup is set.  Does not return.
  */
 int
 main(int argc, char **argv)
 {
 	fd_set readfds;
 	struct netconfig *nconf;
 	char *endptr, **hosts_bak;
 	void *nc_handle;
 	pid_t otherpid;
 	in_port_t svcport;
 	unsigned long portval;
 	int c, k, s;
 	int maxrec = RPC_MAXDATASIZE;
 	int attempt_cnt, port_len, port_pos, ret;
 	char **port_list;
 
 	/* Check that another mountd isn't already running. */
 	pfh = pidfile_open(_PATH_MOUNTDPID, 0600, &otherpid);
 	if (pfh == NULL) {
 		if (errno == EEXIST)
 			errx(1, "mountd already running, pid: %d.", otherpid);
 		warn("cannot open or create pidfile");
 	}
 
 	/* Probe for IPv6; without it the inet6 transports are skipped. */
 	s = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP);
 	if (s < 0)
 		have_v6 = 0;
 	else
 		close(s);
 
 	while ((c = getopt(argc, argv, "2deh:lnp:rS")) != -1)
 		switch (c) {
 		case '2':
 			force_v2 = 1;
 			break;
 		case 'e':
 			/* now a no-op, since this is the default */
 			break;
 		case 'n':
 			resvport_only = 0;
 			break;
 		case 'r':
 			dir_only = 0;
 			break;
 		case 'd':
 			debug = debug ? 0 : 1;
 			break;
 		case 'l':
 			dolog = 1;
 			break;
 		case 'p':
 			/*
 			 * Range-check the full strtoul() result before
 			 * narrowing it to in_port_t, so that out-of-range
 			 * or negative values are rejected instead of
 			 * wrapping around to a valid-looking port.
 			 */
 			endptr = NULL;
 			portval = strtoul(optarg, &endptr, 10);
 			if (endptr == NULL || *endptr != '\0' ||
 			    portval == 0 || portval >= IPPORT_MAX)
 				usage();
 			svcport = (in_port_t)portval;
 			svcport_str = strdup(optarg);
 			if (svcport_str == NULL)
 				out_of_mem();
 			break;
 		case 'h':
 			/*
 			 * Grow the array by one slot.  nhosts is only
 			 * bumped once the new slot holds a valid string,
 			 * so the cleanup below never touches a slot that
 			 * was not filled in, and a failure on the very
 			 * first -h (hosts == NULL) is reported as well.
 			 */
 			hosts_bak = realloc(hosts,
 			    (nhosts + 1) * sizeof(char *));
 			if (hosts_bak == NULL) {
 				for (k = 0; k < nhosts; k++)
 					free(hosts[k]);
 				free(hosts);
 				out_of_mem();
 			}
 			hosts = hosts_bak;
 			hosts[nhosts] = strdup(optarg);
 			if (hosts[nhosts] == NULL) {
 				for (k = 0; k < nhosts; k++)
 					free(hosts[k]);
 				free(hosts);
 				out_of_mem();
 			}
 			++nhosts;
 			break;
 		case 'S':
 			suspend_nfsd = 1;
 			break;
 		default:
 			usage();
 		}
 
 	if (modfind("nfsd") < 0) {
 		/* Not present in kernel, try loading it */
 		if (kldload("nfsd") < 0 || modfind("nfsd") < 0)
 			errx(1, "NFS server is not available");
 	}
 
 	argc -= optind;
 	argv += optind;
 	grphead = (struct grouplist *)NULL;
 	exphead = (struct exportlist *)NULL;
 	mlhead = (struct mountlist *)NULL;
 	if (argc > 0)
 		exnames = argv;
 	else
 		exnames = exnames_default;
 	openlog("mountd", LOG_PID, LOG_DAEMON);
 	if (debug)
 		warnx("getting export list");
 	get_exportlist();
 	if (debug)
 		warnx("getting mount list");
 	get_mountlist();
 	if (debug)
 		warnx("here we go");
 	if (debug == 0) {
 		daemon(0, 0);
 		signal(SIGINT, SIG_IGN);
 		signal(SIGQUIT, SIG_IGN);
 	}
 	signal(SIGHUP, huphandler);
 	signal(SIGTERM, terminate);
 	signal(SIGPIPE, SIG_IGN);
 
 	pidfile_write(pfh);
 
 	rpcb_unset(MOUNTPROG, MOUNTVERS, NULL);
 	rpcb_unset(MOUNTPROG, MOUNTVERS3, NULL);
 	rpc_control(RPC_SVC_CONNMAXREC_SET, &maxrec);
 
 	if (!resvport_only) {
-		if (sysctlbyname("vfs.nfsrv.nfs_privport", NULL, NULL,
+		if (sysctlbyname("vfs.nfsd.nfs_privport", NULL, NULL,
 		    &resvport_only, sizeof(resvport_only)) != 0 &&
 		    errno != ENOENT) {
 			syslog(LOG_ERR, "sysctl: %m");
 			exit(1);
 		}
 	}
 
 	/*
 	 * If no hosts were specified, add a wildcard entry to bind to
 	 * INADDR_ANY. Otherwise make sure 127.0.0.1 and ::1 are added to the
 	 * list.
 	 */
 	if (nhosts == 0) {
 		hosts = malloc(sizeof(char *));
 		if (hosts == NULL)
 			out_of_mem();
 		hosts[0] = "*";
 		nhosts = 1;
 	} else {
 		hosts_bak = hosts;
 		if (have_v6) {
 			hosts_bak = realloc(hosts, (nhosts + 2) *
 			    sizeof(char *));
 			if (hosts_bak == NULL) {
 				for (k = 0; k < nhosts; k++)
 					free(hosts[k]);
 				free(hosts);
 				out_of_mem();
 			} else
 				hosts = hosts_bak;
 			nhosts += 2;
 			hosts[nhosts - 2] = "::1";
 		} else {
 			hosts_bak = realloc(hosts, (nhosts + 1) * sizeof(char *));
 			if (hosts_bak == NULL) {
 				for (k = 0; k < nhosts; k++)
 					free(hosts[k]);
 				free(hosts);
 				out_of_mem();
 			} else {
 				nhosts += 1;
 				hosts = hosts_bak;
 			}
 		}
 
 		hosts[nhosts - 1] = "127.0.0.1";
 	}
 
 	/*
 	 * First pass over the transports: create and bind the sockets.
 	 * When the port was picked dynamically and a later bind reports
 	 * it as in use, everything is torn down and the pass restarts,
 	 * up to GETPORT_MAXTRY attempts.
 	 */
 	attempt_cnt = 1;
 	sock_fdcnt = 0;
 	sock_fd = NULL;
 	port_list = NULL;
 	port_len = 0;
 	nc_handle = setnetconfig();
 	while ((nconf = getnetconfig(nc_handle))) {
 		if (nconf->nc_flag & NC_VISIBLE) {
 			if (have_v6 == 0 && strcmp(nconf->nc_protofmly,
 			    "inet6") == 0) {
 				/* DO NOTHING */
 			} else {
 				ret = create_service(nconf);
 				if (ret == 1)
 					/* Ignore this call */
 					continue;
 				if (ret < 0) {
 					/*
 					 * Failed to bind port, so close off
 					 * all sockets created and try again
 					 * if the port# was dynamically
 					 * assigned via bind(2).
 					 */
 					clearout_service();
 					if (mallocd_svcport != 0 &&
 					    attempt_cnt < GETPORT_MAXTRY) {
 						free(svcport_str);
 						svcport_str = NULL;
 						mallocd_svcport = 0;
 					} else {
 						errno = EADDRINUSE;
 						syslog(LOG_ERR,
 						    "bindresvport_sa: %m");
 						exit(1);
 					}
 
 					/*
 					 * Start over at the first service.
 					 * Release the old netconfig handle
 					 * first so it is not leaked on
 					 * every retry.
 					 */
 					free(sock_fd);
 					sock_fdcnt = 0;
 					sock_fd = NULL;
 					endnetconfig(nc_handle);
 					nc_handle = setnetconfig();
 					attempt_cnt++;
 				} else if (mallocd_svcport != 0 &&
 				    attempt_cnt == GETPORT_MAXTRY) {
 					/*
 					 * For the last attempt, allow
 					 * different port #s for each nconf
 					 * by saving the svcport_str and
 					 * setting it back to NULL.
 					 */
 					port_list = realloc(port_list,
 					    (port_len + 1) * sizeof(char *));
 					if (port_list == NULL)
 						out_of_mem();
 					port_list[port_len++] = svcport_str;
 					svcport_str = NULL;
 					mallocd_svcport = 0;
 				}
 			}
 		}
 	}
 	/* Done with the first pass; the second pass opens its own handle. */
 	endnetconfig(nc_handle);
 
 	/*
 	 * Successfully bound the ports, so call complete_service() to
 	 * do the rest of the setup on the service(s).
 	 */
 	sock_fdpos = 0;
 	port_pos = 0;
 	nc_handle = setnetconfig();
 	while ((nconf = getnetconfig(nc_handle))) {
 		if (nconf->nc_flag & NC_VISIBLE) {
 			if (have_v6 == 0 && strcmp(nconf->nc_protofmly,
 			    "inet6") == 0) {
 				/* DO NOTHING */
 			} else if (port_list != NULL) {
 				if (port_pos >= port_len) {
 					syslog(LOG_ERR, "too many port#s");
 					exit(1);
 				}
 				complete_service(nconf, port_list[port_pos++]);
 			} else
 				complete_service(nconf, svcport_str);
 		}
 	}
 	endnetconfig(nc_handle);
 	free(sock_fd);
 	if (port_list != NULL) {
 		for (port_pos = 0; port_pos < port_len; port_pos++)
 			free(port_list[port_pos]);
 		free(port_list);
 	}
 
 	if (xcreated == 0) {
 		syslog(LOG_ERR, "could not create any services");
 		exit(1);
 	}
 
 	/* Expand svc_run() here so that we can call get_exportlist(). */
 	for (;;) {
 		if (got_sighup) {
 			get_exportlist();
 			got_sighup = 0;
 		}
 		readfds = svc_fdset;
 		switch (select(svc_maxfd + 1, &readfds, NULL, NULL, NULL)) {
 		case -1:
 			/* An interrupted select() just re-checks got_sighup. */
 			if (errno == EINTR)
 				continue;
 			syslog(LOG_ERR, "mountd died: select: %m");
 			exit(1);
 		case 0:
 			continue;
 		default:
 			svc_getreqset(&readfds);
 		}
 	}
 }
 
 /*
  * This routine creates and binds sockets on the appropriate
  * addresses. It gets called one time for each transport.
  * It returns 0 upon success, 1 for ignore the call and -1 to indicate
  * bind failed with EADDRINUSE.
  * Any file descriptors that have been created are stored in sock_fd and
  * the total count of them is maintained in sock_fdcnt.
  *
  * One sock_fd slot is appended per entry of hosts; a slot stays -1 when
  * that host is skipped for this transport.  If svcport_str is NULL on
  * entry, the port obtained by the first successful bind is stored in a
  * newly allocated svcport_str and mallocd_svcport is set.
  */
 static int
 create_service(struct netconfig *nconf)
 {
 	struct addrinfo hints, *res = NULL;
 	struct sockaddr_in *sin;
 	struct sockaddr_in6 *sin6;
 	struct __rpc_sockinfo si;
 	int aicode;
 	int fd;
 	int nhostsbak;
 	int one = 1;
 	int r;
 	u_int32_t host_addr[4];  /* IPv4 or IPv6 */
 	int mallocd_res;
 
 	if ((nconf->nc_semantics != NC_TPI_CLTS) &&
 	    (nconf->nc_semantics != NC_TPI_COTS) &&
 	    (nconf->nc_semantics != NC_TPI_COTS_ORD))
 		return (1);	/* not my type */
 
 	/*
 	 * XXX - using RPC library internal functions.
 	 */
 	if (!__rpc_nconf2sockinfo(nconf, &si)) {
 		syslog(LOG_ERR, "cannot get information for %s",
 		    nconf->nc_netid);
 		return (1);
 	}
 
 	/* Get mountd's address on this transport */
 	memset(&hints, 0, sizeof hints);
 	hints.ai_family = si.si_af;
 	hints.ai_socktype = si.si_socktype;
 	hints.ai_protocol = si.si_proto;
 
 	/*
 	 * Bind to specific IPs if asked to
 	 */
 	nhostsbak = nhosts;
 	while (nhostsbak > 0) {
 		--nhostsbak;
 		sock_fd = realloc(sock_fd, (sock_fdcnt + 1) * sizeof(int));
 		if (sock_fd == NULL)
 			out_of_mem();
 		sock_fd[sock_fdcnt++] = -1;	/* Set invalid for now. */
 		mallocd_res = 0;
 
 		hints.ai_flags = AI_PASSIVE;
 
 		/*
 		 * XXX - using RPC library internal functions.
 		 */
 		if ((fd = __rpc_nconf2fd(nconf)) < 0) {
 			int non_fatal = 0;
 			if (errno == EAFNOSUPPORT &&
 			    nconf->nc_semantics != NC_TPI_CLTS)
 				non_fatal = 1;
 
 			syslog(non_fatal ? LOG_DEBUG : LOG_ERR,
 			    "cannot create socket for %s", nconf->nc_netid);
 			if (non_fatal != 0)
 				continue;
 			exit(1);
 		}
 
 		switch (hints.ai_family) {
 		case AF_INET:
 			if (inet_pton(AF_INET, hosts[nhostsbak],
 			    host_addr) == 1) {
 				hints.ai_flags |= AI_NUMERICHOST;
 			} else {
 				/*
 				 * Skip if we have an AF_INET6 address.
 				 */
 				if (inet_pton(AF_INET6, hosts[nhostsbak],
 				    host_addr) == 1) {
 					close(fd);
 					continue;
 				}
 			}
 			break;
 		case AF_INET6:
 			if (inet_pton(AF_INET6, hosts[nhostsbak],
 			    host_addr) == 1) {
 				hints.ai_flags |= AI_NUMERICHOST;
 			} else {
 				/*
 				 * Skip if we have an AF_INET address.
 				 */
 				if (inet_pton(AF_INET, hosts[nhostsbak],
 				    host_addr) == 1) {
 					close(fd);
 					continue;
 				}
 			}
 
 			/*
 			 * We're doing host-based access checks here, so don't
 			 * allow v4-in-v6 to confuse things. The kernel will
 			 * disable it by default on NFS sockets too.
 			 */
 			if (setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &one,
 			    sizeof one) < 0) {
 				syslog(LOG_ERR,
 				    "can't disable v4-in-v6 on IPv6 socket");
 				exit(1);
 			}
 			break;
 		default:
 			break;
 		}
 
 		/*
 		 * If no hosts were specified, just bind to INADDR_ANY
 		 */
 		if (strcmp("*", hosts[nhostsbak]) == 0) {
 			if (svcport_str == NULL) {
 				/*
 				 * Build the wildcard address by hand.  The
 				 * structures are zero-filled so that every
 				 * member not set below (ai_next, sin_zero,
 				 * sin6_flowinfo, sin6_scope_id, ...) has a
 				 * defined value when the address is handed
 				 * to bindresvport_sa().
 				 */
 				res = calloc(1, sizeof(struct addrinfo));
 				if (res == NULL)
 					out_of_mem();
 				mallocd_res = 1;
 				res->ai_flags = hints.ai_flags;
 				res->ai_family = hints.ai_family;
 				res->ai_protocol = hints.ai_protocol;
 				switch (res->ai_family) {
 				case AF_INET:
 					sin = calloc(1,
 					    sizeof(struct sockaddr_in));
 					if (sin == NULL)
 						out_of_mem();
 					sin->sin_family = AF_INET;
 					sin->sin_port = htons(0);
 					sin->sin_addr.s_addr = htonl(INADDR_ANY);
 					res->ai_addr = (struct sockaddr*) sin;
 					res->ai_addrlen = (socklen_t)
 					    sizeof(struct sockaddr_in);
 					break;
 				case AF_INET6:
 					sin6 = calloc(1,
 					    sizeof(struct sockaddr_in6));
 					if (sin6 == NULL)
 						out_of_mem();
 					sin6->sin6_family = AF_INET6;
 					sin6->sin6_port = htons(0);
 					sin6->sin6_addr = in6addr_any;
 					res->ai_addr = (struct sockaddr*) sin6;
 					res->ai_addrlen = (socklen_t)
 					    sizeof(struct sockaddr_in6);
 					break;
 				default:
 					syslog(LOG_ERR, "bad addr fam %d",
 					    res->ai_family);
 					exit(1);
 				}
 			} else {
 				if ((aicode = getaddrinfo(NULL, svcport_str,
 				    &hints, &res)) != 0) {
 					syslog(LOG_ERR,
 					    "cannot get local address for %s: %s",
 					    nconf->nc_netid,
 					    gai_strerror(aicode));
 					close(fd);
 					continue;
 				}
 			}
 		} else {
 			if ((aicode = getaddrinfo(hosts[nhostsbak], svcport_str,
 			    &hints, &res)) != 0) {
 				syslog(LOG_ERR,
 				    "cannot get local address for %s: %s",
 				    nconf->nc_netid, gai_strerror(aicode));
 				close(fd);
 				continue;
 			}
 		}
 
 		/* Store the fd. */
 		sock_fd[sock_fdcnt - 1] = fd;
 
 		/* Now, attempt the bind. */
 		r = bindresvport_sa(fd, res->ai_addr);
 		if (r != 0) {
 			/*
 			 * Only a dynamically chosen port that turned out to
 			 * be in use is reported back for a retry; the fd
 			 * stays in sock_fd for clearout_service().
 			 */
 			if (errno == EADDRINUSE && mallocd_svcport != 0) {
 				if (mallocd_res != 0) {
 					free(res->ai_addr);
 					free(res);
 				} else
 					freeaddrinfo(res);
 				return (-1);
 			}
 			syslog(LOG_ERR, "bindresvport_sa: %m");
 			exit(1);
 		}
 
 		if (svcport_str == NULL) {
 			svcport_str = malloc(NI_MAXSERV * sizeof(char));
 			if (svcport_str == NULL)
 				out_of_mem();
 			mallocd_svcport = 1;
 
 			/*
 			 * Only the service (port) string is wanted, so no
 			 * host buffer is supplied and its length is 0.
 			 */
 			if (getnameinfo(res->ai_addr,
 			    res->ai_addr->sa_len, NULL, 0,
 			    svcport_str, NI_MAXSERV * sizeof(char),
 			    NI_NUMERICHOST | NI_NUMERICSERV))
 				errx(1, "Cannot get port number");
 		}
 		if (mallocd_res != 0) {
 			free(res->ai_addr);
 			free(res);
 		} else
 			freeaddrinfo(res);
 		res = NULL;
 	}
 	return (0);
 }
 
 /*
  * Called after all the create_service() calls have succeeded, to complete
  * the setup and registration.
  *
  * nconf:    transport being completed.
  * port_str: service port string used to build the address passed to
  *           rpcb_set().
  *
  * Consumes one sock_fd entry (at sock_fdpos) per entry of hosts, creates
  * an RPC transport on every valid descriptor and registers mntsrv() on
  * it.  The rpcb_set() registration is done once per call, after the
  * first valid descriptor has been processed.
  */
 static void
 complete_service(struct netconfig *nconf, char *port_str)
 {
 	struct addrinfo hints, *res = NULL;
 	struct __rpc_sockinfo si;
 	struct netbuf servaddr;
 	SVCXPRT	*transp = NULL;
 	int aicode, fd, nhostsbak;
 	int registered = 0;
 
 	if ((nconf->nc_semantics != NC_TPI_CLTS) &&
 	    (nconf->nc_semantics != NC_TPI_COTS) &&
 	    (nconf->nc_semantics != NC_TPI_COTS_ORD))
 		return;	/* not my type */
 
 	/*
 	 * XXX - using RPC library internal functions.
 	 */
 	if (!__rpc_nconf2sockinfo(nconf, &si)) {
 		syslog(LOG_ERR, "cannot get information for %s",
 		    nconf->nc_netid);
 		return;
 	}
 
 	nhostsbak = nhosts;
 	while (nhostsbak > 0) {
 		--nhostsbak;
 		if (sock_fdpos >= sock_fdcnt) {
 			/* Should never happen. */
 			syslog(LOG_ERR, "Ran out of socket fd's");
 			return;
 		}
 		fd = sock_fd[sock_fdpos++];
 		if (fd < 0)
 			continue;
 
 		if (nconf->nc_semantics != NC_TPI_CLTS)
 			listen(fd, SOMAXCONN);
 
 		if (nconf->nc_semantics == NC_TPI_CLTS )
 			transp = svc_dg_create(fd, 0, 0);
 		else
 			transp = svc_vc_create(fd, RPC_MAXDATASIZE,
 			    RPC_MAXDATASIZE);
 
 		if (transp != (SVCXPRT *) NULL) {
 			if (!svc_reg(transp, MOUNTPROG, MOUNTVERS, mntsrv,
 			    NULL))
 				syslog(LOG_ERR,
 				    "can't register %s MOUNTVERS service",
 				    nconf->nc_netid);
 			if (!force_v2) {
 				if (!svc_reg(transp, MOUNTPROG, MOUNTVERS3,
 				    mntsrv, NULL))
 					syslog(LOG_ERR,
 					    "can't register %s MOUNTVERS3 service",
 					    nconf->nc_netid);
 			}
 		} else
 			syslog(LOG_WARNING, "can't create %s services",
 			    nconf->nc_netid);
 
 		if (registered == 0) {
 			registered = 1;
 			memset(&hints, 0, sizeof hints);
 			hints.ai_flags = AI_PASSIVE;
 			hints.ai_family = si.si_af;
 			hints.ai_socktype = si.si_socktype;
 			hints.ai_protocol = si.si_proto;
 
 			if ((aicode = getaddrinfo(NULL, port_str, &hints,
 			    &res)) != 0) {
 				syslog(LOG_ERR, "cannot get local address: %s",
 				    gai_strerror(aicode));
 				exit(1);
 			}
 
 			/* Do not copy into a failed allocation. */
 			servaddr.buf = malloc(res->ai_addrlen);
 			if (servaddr.buf == NULL)
 				out_of_mem();
 			memcpy(servaddr.buf, res->ai_addr, res->ai_addrlen);
 			servaddr.len = res->ai_addrlen;
 
 			rpcb_set(MOUNTPROG, MOUNTVERS, nconf, &servaddr);
 			rpcb_set(MOUNTPROG, MOUNTVERS3, nconf, &servaddr);
 
 			xcreated++;
 			/* The copy is only needed for the two calls above. */
 			free(servaddr.buf);
 			freeaddrinfo(res);
 		}
 	} /* end while */
 }
 
 /*
  * Shut down and close every valid descriptor recorded in sock_fd (the
  * first sock_fdcnt entries), so that the socket creation/binding cycle
  * can be started again after a failed bind.  Entries holding a negative
  * value are skipped; the array itself is left for the caller to release.
  */
 static void
 clearout_service(void)
 {
 	int idx, fd;
 
 	idx = 0;
 	while (idx < sock_fdcnt) {
 		fd = sock_fd[idx++];
 		if (fd < 0)
 			continue;
 		shutdown(fd, SHUT_RDWR);
 		close(fd);
 	}
 }
 
 /*
  * Print the command line synopsis on stderr and exit with status 1.
  */
 static void
 usage(void)
 {
 	static const char synopsis[] =
 	    "usage: mountd [-2] [-d] [-e] [-l] [-n] [-p <port>] [-r] "
 	    "[-S] [-h <bindip>] [export_file ...]\n";
 
 	fputs(synopsis, stderr);
 	exit(1);
 }
 
 /*
  * The mount rpc service
  *
  * Dispatch routine registered with svc_reg() for MOUNTVERS and
  * MOUNTVERS3.  Handles NULLPROC, MOUNTPROC_MNT, MOUNTPROC_DUMP,
  * MOUNTPROC_UMNT, MOUNTPROC_UMNTALL and MOUNTPROC_EXPORT; any other
  * procedure is answered with svcerr_noproc().
  *
  * rqstp:  the request (procedure and version numbers are read from it).
  * transp: transport the request arrived on and the reply is sent to.
  *
  * MNT, UMNT and UMNTALL are refused with svcerr_weakauth() when the
  * caller's port is >= IPPORT_RESERVED and resvport_only is set.
  */
 void
 mntsrv(struct svc_req *rqstp, SVCXPRT *transp)
 {
 	struct exportlist *ep;
 	struct dirlist *dp;
 	struct fhreturn fhr;
 	struct stat stb;
 	struct statfs fsb;
 	char host[NI_MAXHOST], numerichost[NI_MAXHOST];
 	int lookup_failed = 1;
 	struct sockaddr *saddr;
 	u_short sport;
 	char rpcpath[MNTPATHLEN + 1], dirpath[MAXPATHLEN];
 	int bad = 0, defset, hostset;
 	sigset_t sighup_mask;
 	int numsecflavors, *secflavorsp;
 
 	/*
 	 * Give these a defined value up front.  On the MNT error path the
 	 * checks below short-circuit before statfs() runs, yet fsb.f_fsid
 	 * is still handed to ex_search(); and dirpath is logged and passed
 	 * on after a failed realpath().
 	 */
 	memset(&fsb, 0, sizeof(fsb));
 	dirpath[0] = '\0';
 
 	sigemptyset(&sighup_mask);
 	sigaddset(&sighup_mask, SIGHUP);
 	saddr = svc_getrpccaller(transp)->buf;
 	switch (saddr->sa_family) {
 	case AF_INET6:
 		sport = ntohs(((struct sockaddr_in6 *)saddr)->sin6_port);
 		break;
 	case AF_INET:
 		sport = ntohs(((struct sockaddr_in *)saddr)->sin_port);
 		break;
 	default:
 		syslog(LOG_ERR, "request from unknown address family");
 		return;
 	}
 	/* Resolved name for the mount list, numeric form for the log. */
 	lookup_failed = getnameinfo(saddr, saddr->sa_len, host, sizeof host,
 	    NULL, 0, 0);
 	getnameinfo(saddr, saddr->sa_len, numerichost,
 	    sizeof numerichost, NULL, 0, NI_NUMERICHOST);
 	switch (rqstp->rq_proc) {
 	case NULLPROC:
 		if (!svc_sendreply(transp, (xdrproc_t)xdr_void, NULL))
 			syslog(LOG_ERR, "can't send reply");
 		return;
 	case MOUNTPROC_MNT:
 		if (sport >= IPPORT_RESERVED && resvport_only) {
 			syslog(LOG_NOTICE,
 			    "mount request from %s from unprivileged port",
 			    numerichost);
 			svcerr_weakauth(transp);
 			return;
 		}
 		if (!svc_getargs(transp, (xdrproc_t)xdr_dir, rpcpath)) {
 			syslog(LOG_NOTICE, "undecodable mount request from %s",
 			    numerichost);
 			svcerr_decode(transp);
 			return;
 		}
 
 		/*
 		 * Get the real pathname and make sure it is a directory
 		 * or a regular file if the -r option was specified
 		 * and it exists.
 		 */
 		if (realpath(rpcpath, dirpath) == NULL ||
 		    stat(dirpath, &stb) < 0 ||
 		    (!S_ISDIR(stb.st_mode) &&
 		    (dir_only || !S_ISREG(stb.st_mode))) ||
 		    statfs(dirpath, &fsb) < 0) {
 			chdir("/");	/* Just in case realpath doesn't */
 			syslog(LOG_NOTICE,
 			    "mount request from %s for non existent path %s",
 			    numerichost, dirpath);
 			if (debug)
 				warnx("stat failed on %s", dirpath);
 			bad = ENOENT;	/* We will send error reply later */
 		}
 
 		/*
 		 * Check in the exports list.  SIGHUP stays blocked until
 		 * the reply has been sent.
 		 */
 		sigprocmask(SIG_BLOCK, &sighup_mask, NULL);
 		ep = ex_search(&fsb.f_fsid);
 		hostset = defset = 0;
 		if (ep && (chk_host(ep->ex_defdir, saddr, &defset, &hostset,
 		    &numsecflavors, &secflavorsp) ||
 		    ((dp = dirp_search(ep->ex_dirl, dirpath)) &&
 		      chk_host(dp, saddr, &defset, &hostset, &numsecflavors,
 		       &secflavorsp)) ||
 		    (defset && scan_tree(ep->ex_defdir, saddr) == 0 &&
 		     scan_tree(ep->ex_dirl, saddr) == 0))) {
 			if (bad) {
 				if (!svc_sendreply(transp, (xdrproc_t)xdr_long,
 				    (caddr_t)&bad))
 					syslog(LOG_ERR, "can't send reply");
 				sigprocmask(SIG_UNBLOCK, &sighup_mask, NULL);
 				return;
 			}
 			/* Host-specific flavors win over the export defaults. */
 			if (hostset & DP_HOSTSET) {
 				fhr.fhr_flag = hostset;
 				fhr.fhr_numsecflavors = numsecflavors;
 				fhr.fhr_secflavors = secflavorsp;
 			} else {
 				fhr.fhr_flag = defset;
 				fhr.fhr_numsecflavors = ep->ex_defnumsecflavors;
 				fhr.fhr_secflavors = ep->ex_defsecflavors;
 			}
 			fhr.fhr_vers = rqstp->rq_vers;
 			/* Get the file handle */
 			memset(&fhr.fhr_fh, 0, sizeof(nfsfh_t));
 			if (getfh(dirpath, (fhandle_t *)&fhr.fhr_fh) < 0) {
 				bad = errno;
 				syslog(LOG_ERR, "can't get fh for %s", dirpath);
 				if (!svc_sendreply(transp, (xdrproc_t)xdr_long,
 				    (caddr_t)&bad))
 					syslog(LOG_ERR, "can't send reply");
 				sigprocmask(SIG_UNBLOCK, &sighup_mask, NULL);
 				return;
 			}
 			if (!svc_sendreply(transp, (xdrproc_t)xdr_fhs,
 			    (caddr_t)&fhr))
 				syslog(LOG_ERR, "can't send reply");
 			if (!lookup_failed)
 				add_mlist(host, dirpath);
 			else
 				add_mlist(numerichost, dirpath);
 			if (debug)
 				warnx("mount successful");
 			if (dolog)
 				syslog(LOG_NOTICE,
 				    "mount request succeeded from %s for %s",
 				    numerichost, dirpath);
 		} else {
 			/* No matching export: EACCES replaces any ENOENT. */
 			bad = EACCES;
 			syslog(LOG_NOTICE,
 			    "mount request denied from %s for %s",
 			    numerichost, dirpath);
 		}
 
 		if (bad && !svc_sendreply(transp, (xdrproc_t)xdr_long,
 		    (caddr_t)&bad))
 			syslog(LOG_ERR, "can't send reply");
 		sigprocmask(SIG_UNBLOCK, &sighup_mask, NULL);
 		return;
 	case MOUNTPROC_DUMP:
 		if (!svc_sendreply(transp, (xdrproc_t)xdr_mlist, (caddr_t)NULL))
 			syslog(LOG_ERR, "can't send reply");
 		else if (dolog)
 			syslog(LOG_NOTICE,
 			    "dump request succeeded from %s",
 			    numerichost);
 		return;
 	case MOUNTPROC_UMNT:
 		if (sport >= IPPORT_RESERVED && resvport_only) {
 			syslog(LOG_NOTICE,
 			    "umount request from %s from unprivileged port",
 			    numerichost);
 			svcerr_weakauth(transp);
 			return;
 		}
 		if (!svc_getargs(transp, (xdrproc_t)xdr_dir, rpcpath)) {
 			syslog(LOG_NOTICE, "undecodable umount request from %s",
 			    numerichost);
 			svcerr_decode(transp);
 			return;
 		}
 		/* A path that does not resolve is logged but still answered. */
 		if (realpath(rpcpath, dirpath) == NULL) {
 			syslog(LOG_NOTICE, "umount request from %s "
 			    "for non existent path %s",
 			    numerichost, dirpath);
 		}
 		if (!svc_sendreply(transp, (xdrproc_t)xdr_void, (caddr_t)NULL))
 			syslog(LOG_ERR, "can't send reply");
 		/* Drop the entry under both the resolved and numeric name. */
 		if (!lookup_failed)
 			del_mlist(host, dirpath);
 		del_mlist(numerichost, dirpath);
 		if (dolog)
 			syslog(LOG_NOTICE,
 			    "umount request succeeded from %s for %s",
 			    numerichost, dirpath);
 		return;
 	case MOUNTPROC_UMNTALL:
 		if (sport >= IPPORT_RESERVED && resvport_only) {
 			syslog(LOG_NOTICE,
 			    "umountall request from %s from unprivileged port",
 			    numerichost);
 			svcerr_weakauth(transp);
 			return;
 		}
 		if (!svc_sendreply(transp, (xdrproc_t)xdr_void, (caddr_t)NULL))
 			syslog(LOG_ERR, "can't send reply");
 		if (!lookup_failed)
 			del_mlist(host, NULL);
 		del_mlist(numerichost, NULL);
 		if (dolog)
 			syslog(LOG_NOTICE,
 			    "umountall request succeeded from %s",
 			    numerichost);
 		return;
 	case MOUNTPROC_EXPORT:
 		/* Fall back to the brief encoding if the full one fails. */
 		if (!svc_sendreply(transp, (xdrproc_t)xdr_explist, (caddr_t)NULL))
 			if (!svc_sendreply(transp, (xdrproc_t)xdr_explist_brief,
 			    (caddr_t)NULL))
 				syslog(LOG_ERR, "can't send reply");
 		if (dolog)
 			syslog(LOG_NOTICE,
 			    "export request succeeded from %s",
 			    numerichost);
 		return;
 	default:
 		svcerr_noproc(transp);
 		return;
 	}
 }
 
 /*
  * Xdr filter for a directory path: hands dirp to xdr_string() with an
  * upper bound of MNTPATHLEN bytes and returns its result.
  */
 static int
 xdr_dir(XDR *xdrsp, char *dirp)
 {
 	char *path = dirp;
 
 	return (xdr_string(xdrsp, &path, MNTPATHLEN));
 }
 
 /*
  * Xdr routine that writes the file handle reply described by the
  * struct fhreturn passed in cp.
  *
  * A zero status word is emitted first.  For fhr_vers 1 the handle follows
  * as NFSX_V2FH opaque bytes.  For fhr_vers 3 the reply carries the handle
  * length (NFSX_V3FH), the handle, and then the security flavor list; when
  * the list is empty a single AUTH_SYS entry is sent instead.
  * Returns non-zero on success, 0 on failure or for any other version.
  */
 static int
 xdr_fhs(XDR *xdrsp, caddr_t cp)
 {
 	struct fhreturn *reply = (struct fhreturn *)cp;
 	u_long status = 0, count, flavor;
 	int n;
 
 	if (!xdr_long(xdrsp, &status))
 		return (0);
 
 	if (reply->fhr_vers == 1)
 		return (xdr_opaque(xdrsp, (caddr_t)&reply->fhr_fh, NFSX_V2FH));
 	if (reply->fhr_vers != 3)
 		return (0);
 
 	/* Version 3: length-prefixed handle. */
 	count = NFSX_V3FH;
 	if (!xdr_long(xdrsp, &count))
 		return (0);
 	if (!xdr_opaque(xdrsp, (caddr_t)&reply->fhr_fh, count))
 		return (0);
 
 	if (reply->fhr_numsecflavors == 0) {
 		/* No flavors recorded: advertise AUTH_SYS only. */
 		flavor = AUTH_SYS;
 		count = 1;
 		if (!xdr_long(xdrsp, &count))
 			return (0);
 		return (xdr_long(xdrsp, &flavor));
 	}
 
 	if (!xdr_int(xdrsp, &reply->fhr_numsecflavors))
 		return (0);
 	for (n = 0; n < reply->fhr_numsecflavors; n++) {
 		if (!xdr_int(xdrsp, &reply->fhr_secflavors[n]))
 			return (0);
 	}
 	return (1);
 }
 
 /*
  * Xdr routine for the mount list.  Every entry reachable from mlhead is
  * written as a TRUE marker followed by its host and directory strings;
  * a final FALSE marker terminates the list.
  * Returns 1 on success and 0 as soon as any xdr call fails.
  */
 static int
 xdr_mlist(XDR *xdrsp, caddr_t cp __unused)
 {
 	struct mountlist *entry;
 	int more = 1;
 	int end = 0;
 	char *str;
 
 	for (entry = mlhead; entry != NULL; entry = entry->ml_next) {
 		if (!xdr_bool(xdrsp, &more))
 			return (0);
 		str = entry->ml_host;
 		if (!xdr_string(xdrsp, &str, MNTNAMLEN))
 			return (0);
 		str = entry->ml_dirp;
 		if (!xdr_string(xdrsp, &str, MNTPATHLEN))
 			return (0);
 	}
 	return (xdr_bool(xdrsp, &end) ? 1 : 0);
 }
 
 /*
  * Xdr conversion for the export list.
  *
  * Walks exphead with SIGHUP blocked.  For each export the directory tree
  * is emitted through put_exlist(); the default directory is emitted
  * separately only when the tree walk did not already cover it (putdef
  * still 0).  "brief" is passed straight through to put_exlist().
  * A closing FALSE marker is written after SIGHUP has been unblocked.
  * Returns 1 on success, 0 on failure.
  */
 static int
 xdr_explist_common(XDR *xdrsp, caddr_t cp __unused, int brief)
 {
 	struct exportlist *ep;
 	sigset_t sighup_mask;
 	int end = 0;
 	int putdef;
 	int failed = 0;
 
 	sigemptyset(&sighup_mask);
 	sigaddset(&sighup_mask, SIGHUP);
 	sigprocmask(SIG_BLOCK, &sighup_mask, NULL);
 
 	for (ep = exphead; ep != NULL; ep = ep->ex_next) {
 		putdef = 0;
 		if (put_exlist(ep->ex_dirl, xdrsp, ep->ex_defdir, &putdef,
 		    brief)) {
 			failed = 1;
 			break;
 		}
 		if (ep->ex_defdir != NULL && putdef == 0 &&
 		    put_exlist(ep->ex_defdir, xdrsp, NULL, &putdef, brief)) {
 			failed = 1;
 			break;
 		}
 	}
 
 	sigprocmask(SIG_UNBLOCK, &sighup_mask, NULL);
 	if (failed)
 		return (0);
 	return (xdr_bool(xdrsp, &end) ? 1 : 0);
 }
 
 /*
  * Called from xdr_explist_common() to traverse the directory tree in
  * order (left, node, right) and emit each exported path.
  *
  * For every node a TRUE marker and the path are written, followed by the
  * group entries and a FALSE terminator.  In brief mode the groups are
  * replaced by the single placeholder "(...)".  Otherwise host and network
  * names are listed unless the node (or the matching default directory
  * adp) has DP_DEFSET set.  When the node's path equals adp's path,
  * *putdefp is set to 1 and adp's hosts are appended after the node's own.
  *
  * Note the inverted convention: returns 1 on failure and 0 on success.
  */
 static int
 put_exlist(struct dirlist *dp, XDR *xdrsp, struct dirlist *adp, int *putdefp,
 	int brief)
 {
 	struct grouplist *grp;
 	struct hostlist *hp;
 	int more = 1;
 	int end = 0;
 	int gotalldir = 0;
 	char *name;
 
 	if (dp == NULL)
 		return (0);
 
 	if (put_exlist(dp->dp_left, xdrsp, adp, putdefp, brief))
 		return (1);
 
 	if (!xdr_bool(xdrsp, &more))
 		return (1);
 	name = dp->dp_dirp;
 	if (!xdr_string(xdrsp, &name, MNTPATHLEN))
 		return (1);
 
 	if (adp != NULL && strcmp(dp->dp_dirp, adp->dp_dirp) == 0) {
 		gotalldir = 1;
 		*putdefp = 1;
 	}
 
 	if (brief) {
 		if (!xdr_bool(xdrsp, &more))
 			return (1);
 		name = "(...)";
 		if (!xdr_string(xdrsp, &name, MNTPATHLEN))
 			return (1);
 	} else if ((dp->dp_flag & DP_DEFSET) == 0 &&
 	    (gotalldir == 0 || (adp->dp_flag & DP_DEFSET) == 0)) {
 		hp = dp->dp_hosts;
 		while (hp != NULL) {
 			grp = hp->ht_grp;
 			if (grp->gr_type == GT_HOST ||
 			    grp->gr_type == GT_NET) {
 				if (!xdr_bool(xdrsp, &more))
 					return (1);
 				if (grp->gr_type == GT_HOST)
 					name = grp->gr_ptr.gt_addrinfo->ai_canonname;
 				else
 					name = grp->gr_ptr.gt_net.nt_name;
 				if (!xdr_string(xdrsp, &name, MNTNAMLEN))
 					return (1);
 			}
 			hp = hp->ht_next;
 			/* Own hosts exhausted: continue once with adp's. */
 			if (gotalldir && hp == NULL) {
 				hp = adp->dp_hosts;
 				gotalldir = 0;
 			}
 		}
 	}
 
 	if (!xdr_bool(xdrsp, &end))
 		return (1);
 
 	return (put_exlist(dp->dp_right, xdrsp, adp, putdefp, brief) ? 1 : 0);
 }
 
 /*
  * Xdr conversion for the export list in full (non-brief) form.
  */
 static int
 xdr_explist(XDR *xdrsp, caddr_t cp)
 {
 	const int brief = 0;
 
 	return (xdr_explist_common(xdrsp, cp, brief));
 }
 
 /*
  * Xdr conversion for the export list in brief form.
  */
 static int
 xdr_explist_brief(XDR *xdrsp, caddr_t cp)
 {
 	const int brief = 1;
 
 	return (xdr_explist_common(xdrsp, cp, brief));
 }
 
 /* Exports-file line currently being parsed; read by get_exportlist_one(). */
 static char *line;
 /* NOTE(review): presumably the allocated size of "line" -- confirm in get_line(). */
 static size_t linesize;
 /* Exports file currently open; set by get_exportlist() before each parse. */
 static FILE *exp_file;
 
 /*
  * Get the export list from one, currently open file
  */
 static void
 get_exportlist_one(void)
 {
 	struct exportlist *ep, *ep2;
 	struct grouplist *grp, *tgrp;
 	struct exportlist **epp;
 	struct dirlist *dirhead;
 	struct statfs fsb;
 	struct xucred anon;
 	char *cp, *endcp, *dirp, *hst, *usr, *dom, savedc;
 	int len, has_host, exflags, got_nondir, dirplen, netgrp;
 
 	v4root_phase = 0;
 	dirhead = (struct dirlist *)NULL;
 	while (get_line()) {
 		if (debug)
 			warnx("got line %s", line);
 		cp = line;
 		nextfield(&cp, &endcp);
 		if (*cp == '#')
 			goto nextline;
 
 		/*
 		 * Set defaults.
 		 */
 		has_host = FALSE;
 		anon = def_anon;
 		exflags = MNT_EXPORTED;
 		got_nondir = 0;
 		opt_flags = 0;
 		ep = (struct exportlist *)NULL;
 		dirp = NULL;
 
 		/*
 		 * Handle the V4 root dir.
 		 */
 		if (*cp == 'V' && *(cp + 1) == '4' && *(cp + 2) == ':') {
 			/*
 			 * V4: just indicates that it is the v4 root point,
 			 * so skip over that and set v4root_phase.
 			 */
 			if (v4root_phase > 0) {
 				syslog(LOG_ERR, "V4:duplicate line, ignored");
 				goto nextline;
 			}
 			v4root_phase = 1;
 			cp += 3;
 			nextfield(&cp, &endcp);
 		}
 
 		/*
 		 * Create new exports list entry
 		 */
 		len = endcp-cp;
 		tgrp = grp = get_grp();
 		while (len > 0) {
 			if (len > MNTNAMLEN) {
 			    getexp_err(ep, tgrp);
 			    goto nextline;
 			}
 			if (*cp == '-') {
 			    if (ep == (struct exportlist *)NULL) {
 				getexp_err(ep, tgrp);
 				goto nextline;
 			    }
 			    if (debug)
 				warnx("doing opt %s", cp);
 			    got_nondir = 1;
 			    if (do_opt(&cp, &endcp, ep, grp, &has_host,
 				&exflags, &anon)) {
 				getexp_err(ep, tgrp);
 				goto nextline;
 			    }
 			} else if (*cp == '/') {
 			    savedc = *endcp;
 			    *endcp = '\0';
 			    if (v4root_phase > 1) {
 				    if (dirp != NULL) {
 					syslog(LOG_ERR, "Multiple V4 dirs");
 					getexp_err(ep, tgrp);
 					goto nextline;
 				    }
 			    }
 			    if (check_dirpath(cp) &&
 				statfs(cp, &fsb) >= 0) {
 				if ((fsb.f_flags & MNT_AUTOMOUNTED) != 0)
 				    syslog(LOG_ERR, "Warning: exporting of "
 					"automounted fs %s not supported", cp);
 				if (got_nondir) {
 				    syslog(LOG_ERR, "dirs must be first");
 				    getexp_err(ep, tgrp);
 				    goto nextline;
 				}
 				if (v4root_phase == 1) {
 				    if (dirp != NULL) {
 					syslog(LOG_ERR, "Multiple V4 dirs");
 					getexp_err(ep, tgrp);
 					goto nextline;
 				    }
 				    if (strlen(v4root_dirpath) == 0) {
 					strlcpy(v4root_dirpath, cp,
 					    sizeof (v4root_dirpath));
 				    } else if (strcmp(v4root_dirpath, cp)
 					!= 0) {
 					syslog(LOG_ERR,
 					    "different V4 dirpath %s", cp);
 					getexp_err(ep, tgrp);
 					goto nextline;
 				    }
 				    dirp = cp;
 				    v4root_phase = 2;
 				    got_nondir = 1;
 				    ep = get_exp();
 				} else {
 				    if (ep) {
 					if (ep->ex_fs.val[0] !=
 					    fsb.f_fsid.val[0] ||
 					    ep->ex_fs.val[1] !=
 					    fsb.f_fsid.val[1]) {
 						getexp_err(ep, tgrp);
 						goto nextline;
 					}
 				    } else {
 					/*
 					 * See if this directory is already
 					 * in the list.
 					 */
 					ep = ex_search(&fsb.f_fsid);
 					if (ep == (struct exportlist *)NULL) {
 					    ep = get_exp();
 					    ep->ex_fs = fsb.f_fsid;
 					    ep->ex_fsdir = (char *)malloc
 					        (strlen(fsb.f_mntonname) + 1);
 					    if (ep->ex_fsdir)
 						strcpy(ep->ex_fsdir,
 						    fsb.f_mntonname);
 					    else
 						out_of_mem();
 					    if (debug)
 						warnx(
 						  "making new ep fs=0x%x,0x%x",
 						  fsb.f_fsid.val[0],
 						  fsb.f_fsid.val[1]);
 					} else if (debug)
 					    warnx("found ep fs=0x%x,0x%x",
 						fsb.f_fsid.val[0],
 						fsb.f_fsid.val[1]);
 				    }
 
 				    /*
 				     * Add dirpath to export mount point.
 				     */
 				    dirp = add_expdir(&dirhead, cp, len);
 				    dirplen = len;
 				}
 			    } else {
 				getexp_err(ep, tgrp);
 				goto nextline;
 			    }
 			    *endcp = savedc;
 			} else {
 			    savedc = *endcp;
 			    *endcp = '\0';
 			    got_nondir = 1;
 			    if (ep == (struct exportlist *)NULL) {
 				getexp_err(ep, tgrp);
 				goto nextline;
 			    }
 
 			    /*
 			     * Get the host or netgroup.
 			     */
 			    setnetgrent(cp);
 			    netgrp = getnetgrent(&hst, &usr, &dom);
 			    do {
 				if (has_host) {
 				    grp->gr_next = get_grp();
 				    grp = grp->gr_next;
 				}
 				if (netgrp) {
 				    if (hst == 0) {
 					syslog(LOG_ERR,
 				"null hostname in netgroup %s, skipping", cp);
 					grp->gr_type = GT_IGNORE;
 				    } else if (get_host(hst, grp, tgrp)) {
 					syslog(LOG_ERR,
 			"bad host %s in netgroup %s, skipping", hst, cp);
 					grp->gr_type = GT_IGNORE;
 				    }
 				} else if (get_host(cp, grp, tgrp)) {
 				    syslog(LOG_ERR, "bad host %s, skipping", cp);
 				    grp->gr_type = GT_IGNORE;
 				}
 				has_host = TRUE;
 			    } while (netgrp && getnetgrent(&hst, &usr, &dom));
 			    endnetgrent();
 			    *endcp = savedc;
 			}
 			cp = endcp;
 			nextfield(&cp, &endcp);
 			len = endcp - cp;
 		}
 		if (check_options(dirhead)) {
 			getexp_err(ep, tgrp);
 			goto nextline;
 		}
 		if (!has_host) {
 			grp->gr_type = GT_DEFAULT;
 			if (debug)
 				warnx("adding a default entry");
 
 		/*
 		 * Don't allow a network export coincide with a list of
 		 * host(s) on the same line.
 		 */
 		} else if ((opt_flags & OP_NET) && tgrp->gr_next) {
 			syslog(LOG_ERR, "network/host conflict");
 			getexp_err(ep, tgrp);
 			goto nextline;
 
 		/*
 		 * If an export list was specified on this line, make sure
 		 * that we have at least one valid entry, otherwise skip it.
 		 */
 		} else {
 			grp = tgrp;
 			while (grp && grp->gr_type == GT_IGNORE)
 				grp = grp->gr_next;
 			if (! grp) {
 			    getexp_err(ep, tgrp);
 			    goto nextline;
 			}
 		}
 
 		if (v4root_phase == 1) {
 			syslog(LOG_ERR, "V4:root, no dirp, ignored");
 			getexp_err(ep, tgrp);
 			goto nextline;
 		}
 
 		/*
 		 * Loop through hosts, pushing the exports into the kernel.
 		 * After loop, tgrp points to the start of the list and
 		 * grp points to the last entry in the list.
 		 */
 		grp = tgrp;
 		do {
 			if (do_mount(ep, grp, exflags, &anon, dirp, dirplen,
 			    &fsb)) {
 				getexp_err(ep, tgrp);
 				goto nextline;
 			}
 		} while (grp->gr_next && (grp = grp->gr_next));
 
 		/*
 		 * For V4: don't enter in mount lists.
 		 */
 		if (v4root_phase > 0 && v4root_phase <= 2) {
 			/*
 			 * Since these structures aren't used by mountd,
 			 * free them up now.
 			 */
 			if (ep != NULL)
 				free_exp(ep);
 			while (tgrp != NULL) {
 				grp = tgrp;
 				tgrp = tgrp->gr_next;
 				free_grp(grp);
 			}
 			goto nextline;
 		}
 
 		/*
 		 * Success. Update the data structures.
 		 */
 		if (has_host) {
 			hang_dirp(dirhead, tgrp, ep, opt_flags);
 			grp->gr_next = grphead;
 			grphead = tgrp;
 		} else {
 			hang_dirp(dirhead, (struct grouplist *)NULL, ep,
 				opt_flags);
 			free_grp(grp);
 		}
 		dirhead = (struct dirlist *)NULL;
 		if ((ep->ex_flag & EX_LINKED) == 0) {
 			ep2 = exphead;
 			epp = &exphead;
 
 			/*
 			 * Insert in the list in alphabetical order.
 			 */
 			while (ep2 && strcmp(ep2->ex_fsdir, ep->ex_fsdir) < 0) {
 				epp = &ep2->ex_next;
 				ep2 = ep2->ex_next;
 			}
 			if (ep2)
 				ep->ex_next = ep2;
 			*epp = ep;
 			ep->ex_flag |= EX_LINKED;
 		}
 nextline:
 		v4root_phase = 0;
 		if (dirhead) {
 			free_dir(dirhead);
 			dirhead = (struct dirlist *)NULL;
 		}
 	}
 }
 
 /*
  * Get the export list from all specified files
  */
 static void
 get_exportlist(void)
 {
 	struct exportlist *ep, *ep2;
 	struct grouplist *grp, *tgrp;
 	struct export_args export;
 	struct iovec *iov;
 	struct statfs *fsp, *mntbufp;
 	struct xvfsconf vfc;
 	char errmsg[255];
 	int num, i;
 	int iovlen;
 	int done;
 	struct nfsex_args eargs;
 
 	if (suspend_nfsd != 0)
 		(void)nfssvc(NFSSVC_SUSPENDNFSD, NULL);
 	v4root_dirpath[0] = '\0';
 	bzero(&export, sizeof(export));
 	export.ex_flags = MNT_DELEXPORT;
 	iov = NULL;
 	iovlen = 0;
 	bzero(errmsg, sizeof(errmsg));
 
 	/*
 	 * First, get rid of the old list
 	 */
 	ep = exphead;
 	while (ep) {
 		ep2 = ep;
 		ep = ep->ex_next;
 		free_exp(ep2);
 	}
 	exphead = (struct exportlist *)NULL;
 
 	grp = grphead;
 	while (grp) {
 		tgrp = grp;
 		grp = grp->gr_next;
 		free_grp(tgrp);
 	}
 	grphead = (struct grouplist *)NULL;
 
 	/*
 	 * and the old V4 root dir.
 	 */
 	bzero(&eargs, sizeof (eargs));
 	eargs.export.ex_flags = MNT_DELEXPORT;
 	if (nfssvc(NFSSVC_V4ROOTEXPORT, (caddr_t)&eargs) < 0 &&
 	    errno != ENOENT)
 		syslog(LOG_ERR, "Can't delete exports for V4:");
 
 	/*
 	 * and clear flag that notes if a public fh has been exported.
 	 */
 	has_publicfh = 0;
 
 	/*
 	 * And delete exports that are in the kernel for all local
 	 * filesystems.
 	 * XXX: Should know how to handle all local exportable filesystems.
 	 */
 	num = getmntinfo(&mntbufp, MNT_NOWAIT);
 
 	if (num > 0) {
 		build_iovec(&iov, &iovlen, "fstype", NULL, 0);
 		build_iovec(&iov, &iovlen, "fspath", NULL, 0);
 		build_iovec(&iov, &iovlen, "from", NULL, 0);
 		build_iovec(&iov, &iovlen, "update", NULL, 0);
 		build_iovec(&iov, &iovlen, "export", &export, sizeof(export));
 		build_iovec(&iov, &iovlen, "errmsg", errmsg, sizeof(errmsg));
 	}
 
 	for (i = 0; i < num; i++) {
 		fsp = &mntbufp[i];
 		if (getvfsbyname(fsp->f_fstypename, &vfc) != 0) {
 			syslog(LOG_ERR, "getvfsbyname() failed for %s",
 			    fsp->f_fstypename);
 			continue;
 		}
 
 		/*
 		 * We do not need to delete "export" flag from
 		 * filesystems that do not have it set.
 		 */
 		if (!(fsp->f_flags & MNT_EXPORTED))
 		    continue;
 		/*
 		 * Do not delete export for network filesystem by
 		 * passing "export" arg to nmount().
 		 * It only makes sense to do this for local filesystems.
 		 */
 		if (vfc.vfc_flags & VFCF_NETWORK)
 			continue;
 
 		iov[1].iov_base = fsp->f_fstypename;
 		iov[1].iov_len = strlen(fsp->f_fstypename) + 1;
 		iov[3].iov_base = fsp->f_mntonname;
 		iov[3].iov_len = strlen(fsp->f_mntonname) + 1;
 		iov[5].iov_base = fsp->f_mntfromname;
 		iov[5].iov_len = strlen(fsp->f_mntfromname) + 1;
 		errmsg[0] = '\0';
 
 		/*
 		 * EXDEV is returned when path exists but is not a
 		 * mount point.  May happens if raced with unmount.
 		 */
 		if (nmount(iov, iovlen, fsp->f_flags) < 0 &&
 		    errno != ENOENT && errno != ENOTSUP && errno != EXDEV) {
 			syslog(LOG_ERR,
 			    "can't delete exports for %s: %m %s",
 			    fsp->f_mntonname, errmsg);
 		}
 	}
 
 	if (iov != NULL) {
 		/* Free strings allocated by strdup() in getmntopts.c */
 		free(iov[0].iov_base); /* fstype */
 		free(iov[2].iov_base); /* fspath */
 		free(iov[4].iov_base); /* from */
 		free(iov[6].iov_base); /* update */
 		free(iov[8].iov_base); /* export */
 		free(iov[10].iov_base); /* errmsg */
 
 		/* free iov, allocated by realloc() */
 		free(iov);
 		iovlen = 0;
 	}
 
 	/*
 	 * Read in the exports file and build the list, calling
 	 * nmount() as we go along to push the export rules into the kernel.
 	 */
 	done = 0;
 	for (i = 0; exnames[i] != NULL; i++) {
 		if (debug)
 			warnx("reading exports from %s", exnames[i]);
 		if ((exp_file = fopen(exnames[i], "r")) == NULL) {
 			syslog(LOG_WARNING, "can't open %s", exnames[i]);
 			continue;
 		}
 		get_exportlist_one();
 		fclose(exp_file);
 		done++;
 	}
 	if (done == 0) {
 		syslog(LOG_ERR, "can't open any exports file");
 		exit(2);
 	}
 
 	/*
 	 * If there was no public fh, clear any previous one set.
 	 */
 	if (has_publicfh == 0)
 		(void) nfssvc(NFSSVC_NOPUBLICFH, NULL);
 
 	/* Resume the nfsd. If they weren't suspended, this is harmless. */
 	(void)nfssvc(NFSSVC_RESUMENFSD, NULL);
 }
 
 /*
  * Allocate a zero-filled export list element.
  * Does not return NULL: allocation failure goes through out_of_mem().
  */
 static struct exportlist *
 get_exp(void)
 {
 	struct exportlist *ep = calloc(1, sizeof(*ep));
 
 	if (ep == NULL)
 		out_of_mem();
 	return (ep);
 }
 
 /*
  * Allocate a zero-filled group list element.
  * Does not return NULL: allocation failure goes through out_of_mem().
  */
 static struct grouplist *
 get_grp(void)
 {
 	struct grouplist *gp = calloc(1, sizeof(*gp));
 
 	if (gp == NULL)
 		out_of_mem();
 	return (gp);
 }
 
 /*
  * Clean up after a bad exports line.
  *
  * Logs the offending line unless OP_QUIET is set in opt_flags, frees ep
  * when it has not been linked into the export list yet (EX_LINKED clear),
  * and frees every element of the group list starting at grp.
  */
 static void
 getexp_err(struct exportlist *ep, struct grouplist *grp)
 {
 	struct grouplist *next;
 
 	if ((opt_flags & OP_QUIET) == 0)
 		syslog(LOG_ERR, "bad exports list line %s", line);
 
 	if (ep != NULL && !(ep->ex_flag & EX_LINKED))
 		free_exp(ep);
 
 	for (; grp != NULL; grp = next) {
 		next = grp->gr_next;
 		free_grp(grp);
 	}
 }
 
 /*
  * Search the export list for the entry whose file system id equals *fsid.
  * Returns the matching entry, or NULL when there is none.
  */
 static struct exportlist *
 ex_search(fsid_t *fsid)
 {
 	struct exportlist *ep;
 
 	for (ep = exphead; ep != NULL; ep = ep->ex_next) {
 		if (ep->ex_fs.val[0] != fsid->val[0])
 			continue;
 		if (ep->ex_fs.val[1] != fsid->val[1])
 			continue;
 		return (ep);
 	}
 	return (NULL);
 }
 
 /*
  * Add a directory path to the front of the list at *dpp.
  *
  * A new dirlist node with len extra bytes is allocated, the string cp is
  * copied into its dp_dirp member, and the old head becomes its dp_left.
  * Returns a pointer to the copied path inside the new node.
  */
 static char *
 add_expdir(struct dirlist **dpp, char *cp, int len)
 {
 	struct dirlist *node;
 
 	node = malloc(sizeof(*node) + len);
 	if (node == NULL)
 		out_of_mem();
 
 	node->dp_hosts = NULL;
 	node->dp_flag = 0;
 	node->dp_right = NULL;
 	node->dp_left = *dpp;
 	strcpy(node->dp_dirp, cp);
 
 	*dpp = node;
 	return (node->dp_dirp);
 }
 
 /*
  * Hang the dir list element(s) off the export entry and record the hosts.
  *
  * Without OP_ALLDIRS every element of the dp list (chained through
  * dp_left) is merged into the ep->ex_dirl tree by add_dlist().
  * With OP_ALLDIRS dp becomes ep->ex_defdir, or is freed when a default
  * directory already exists.  Then either the default directory is flagged
  * DP_DEFSET (grp == NULL) or one hostlist entry per group is pushed onto
  * its host list.  In both cases the current security flavors of ep are
  * saved alongside.
  */
 static void
 hang_dirp(struct dirlist *dp, struct grouplist *grp, struct exportlist *ep,
 	int flags)
 {
 	struct hostlist *hp;
 	struct dirlist *next, *defdir;
 
 	if ((flags & OP_ALLDIRS) == 0) {
 		/*
 		 * Add each directory to the tree; fetch the link first,
 		 * add_dlist() may free or relink the node.
 		 */
 		for (; dp != NULL; dp = next) {
 			next = dp->dp_left;
 			add_dlist(&ep->ex_dirl, dp, grp, flags, ep);
 		}
 		return;
 	}
 
 	if (ep->ex_defdir == NULL)
 		ep->ex_defdir = dp;
 	else
 		free(dp);
 	defdir = ep->ex_defdir;
 
 	if (grp == NULL) {
 		defdir->dp_flag |= DP_DEFSET;
 		/* Save the default security flavors list. */
 		ep->ex_defnumsecflavors = ep->ex_numsecflavors;
 		if (ep->ex_numsecflavors > 0)
 			memcpy(ep->ex_defsecflavors, ep->ex_secflavors,
 			    sizeof(ep->ex_secflavors));
 		return;
 	}
 
 	for (; grp != NULL; grp = grp->gr_next) {
 		hp = get_ht();
 		hp->ht_grp = grp;
 		hp->ht_next = defdir->dp_hosts;
 		defdir->dp_hosts = hp;
 		/* Save the security flavors list for this host set. */
 		grp->gr_numsecflavors = ep->ex_numsecflavors;
 		if (ep->ex_numsecflavors > 0)
 			memcpy(grp->gr_secflavors, ep->ex_secflavors,
 			    sizeof(ep->ex_secflavors));
 	}
 }
 
 /*
  * Walk the binary tree rooted at *dpp looking for newdp's path.  If a node
  * with the same path exists, newdp is freed and that node is updated;
  * otherwise newdp is linked in at the empty slot where the walk ended.
  *
  * The chosen node then gets one hostlist entry per group in grp, or is
  * flagged DP_DEFSET when grp is NULL.  The current security flavors of ep
  * are saved with the groups or as the export's defaults, respectively.
  */
 static void
 add_dlist(struct dirlist **dpp, struct dirlist *newdp, struct grouplist *grp,
 	int flags, struct exportlist *ep)
 {
 	struct dirlist *node;
 	struct hostlist *hp;
 	int cmp;
 
 	/* Descend until the path matches or an empty slot is reached. */
 	while ((node = *dpp) != NULL) {
 		cmp = strcmp(node->dp_dirp, newdp->dp_dirp);
 		if (cmp == 0)
 			break;
 		dpp = (cmp > 0) ? &node->dp_left : &node->dp_right;
 	}
 
 	if (node != NULL) {
 		free(newdp);
 	} else {
 		node = newdp;
 		node->dp_left = NULL;
 		*dpp = node;
 	}
 
 	if (grp == NULL) {
 		node->dp_flag |= DP_DEFSET;
 		/* Save the default security flavors list. */
 		ep->ex_defnumsecflavors = ep->ex_numsecflavors;
 		if (ep->ex_numsecflavors > 0)
 			memcpy(ep->ex_defsecflavors, ep->ex_secflavors,
 			    sizeof(ep->ex_secflavors));
 		return;
 	}
 
 	/*
 	 * Hang all of the host(s) off of the directory point.
 	 */
 	for (; grp != NULL; grp = grp->gr_next) {
 		hp = get_ht();
 		hp->ht_grp = grp;
 		hp->ht_next = node->dp_hosts;
 		node->dp_hosts = hp;
 		/* Save the security flavors list for this host set. */
 		grp->gr_numsecflavors = ep->ex_numsecflavors;
 		if (ep->ex_numsecflavors > 0)
 			memcpy(grp->gr_secflavors, ep->ex_secflavors,
 			    sizeof(ep->ex_secflavors));
 	}
 }
 
 /*
  * Search the directory tree rooted at dp for the path dirp.
  * Returns the node whose dp_dirp equals dirp, or NULL if it is absent.
  */
 static struct dirlist *
 dirp_search(struct dirlist *dp, char *dirp)
 {
 	int cmp;
 
 	while (dp != NULL) {
 		cmp = strcmp(dp->dp_dirp, dirp);
 		if (cmp == 0)
 			break;
 		dp = (cmp > 0) ? dp->dp_left : dp->dp_right;
 	}
 	return (dp);
 }
 
 /*
  * Scan the host list of one directory node for an entry matching saddr.
  *
  * If the node has DP_DEFSET, its flags are stored in *defsetp.  GT_HOST
  * groups match when any of their addresses compares equal via sacmp();
  * GT_NET groups match when saddr falls within net/mask.  On a match,
  * *hostsetp receives the host entry's flags plus DP_HOSTSET and, when
  * numsecflavors is non-NULL, the group's security flavors are reported
  * through numsecflavors/secflavorsp.
  * Returns 1 on a match, 0 otherwise (including dp == NULL).
  */
 static int
 chk_host(struct dirlist *dp, struct sockaddr *saddr, int *defsetp,
 	int *hostsetp, int *numsecflavors, int **secflavorsp)
 {
 	struct hostlist *hp;
 	struct grouplist *grp;
 	struct addrinfo *ai;
 	int matched;
 
 	if (dp == NULL)
 		return (0);
 
 	if (dp->dp_flag & DP_DEFSET)
 		*defsetp = dp->dp_flag;
 
 	for (hp = dp->dp_hosts; hp != NULL; hp = hp->ht_next) {
 		grp = hp->ht_grp;
 		matched = 0;
 
 		if (grp->gr_type == GT_HOST) {
 			for (ai = grp->gr_ptr.gt_addrinfo; ai != NULL;
 			    ai = ai->ai_next) {
 				if (sacmp(ai->ai_addr, saddr, NULL) == 0) {
 					matched = 1;
 					break;
 				}
 			}
 		} else if (grp->gr_type == GT_NET) {
 			if (sacmp(saddr,
 			    (struct sockaddr *)&grp->gr_ptr.gt_net.nt_net,
 			    (struct sockaddr *)&grp->gr_ptr.gt_net.nt_mask) == 0)
 				matched = 1;
 		}
 
 		if (!matched)
 			continue;
 
 		*hostsetp = (hp->ht_flag | DP_HOSTSET);
 		if (numsecflavors != NULL) {
 			*numsecflavors = grp->gr_numsecflavors;
 			*secflavorsp = grp->gr_secflavors;
 		}
 		return (1);
 	}
 	return (0);
 }
 
 /*
  * Scan the whole directory tree, in order, for a host entry that matches
  * the address.  Returns 1 as soon as chk_host() reports a match on any
  * node, 0 if no node matches.
  */
 static int
 scan_tree(struct dirlist *dp, struct sockaddr *saddr)
 {
 	int defset, hostset;
 
 	if (dp == NULL)
 		return (0);
 
 	/* Short-circuit evaluation keeps the left, node, right order. */
 	if (scan_tree(dp->dp_left, saddr) ||
 	    chk_host(dp, saddr, &defset, &hostset, NULL, NULL) ||
 	    scan_tree(dp->dp_right, saddr))
 		return (1);
 	return (0);
 }
 
 /*
  * Recursively free a dirlist tree: both subtrees, the node's host list,
  * and finally the node itself.  A NULL tree is a no-op.
  */
 static void
 free_dir(struct dirlist *dp)
 {
 
 	if (dp == NULL)
 		return;
 	free_dir(dp->dp_left);
 	free_dir(dp->dp_right);
 	free_host(dp->dp_hosts);
 	free(dp);
 }
 
 /*
  * Parse a colon separated list of security flavors into ep.
  *
  * Recognized names are "sys", "krb5", "krb5i" and "krb5p".  The list in
  * ep->ex_secflavors is rebuilt from scratch and holds at most
  * MAXSECFLAVORS entries.  seclist is patched in place while a token is
  * compared, but every ':' is restored before returning.
  * Returns 0 on success, 1 (after logging) on an unknown name or overflow.
  */
 static int
 parsesec(char *seclist, struct exportlist *ep)
 {
 	char *tok, *sep;
 	int flavor, known;
 
 	ep->ex_numsecflavors = 0;
 	tok = seclist;
 	do {
 		/* Isolate the current token. */
 		sep = strchr(tok, ':');
 		if (sep != NULL)
 			*sep = '\0';
 
 		known = 1;
 		flavor = 0;
 		if (strcmp(tok, "sys") == 0)
 			flavor = AUTH_SYS;
 		else if (strcmp(tok, "krb5") == 0)
 			flavor = RPCSEC_GSS_KRB5;
 		else if (strcmp(tok, "krb5i") == 0)
 			flavor = RPCSEC_GSS_KRB5I;
 		else if (strcmp(tok, "krb5p") == 0)
 			flavor = RPCSEC_GSS_KRB5P;
 		else
 			known = 0;
 
 		/* Put the separator back before anything is logged. */
 		if (sep != NULL)
 			*sep = ':';
 
 		if (!known) {
 			syslog(LOG_ERR, "bad sec flavor: %s", tok);
 			return (1);
 		}
 		if (ep->ex_numsecflavors == MAXSECFLAVORS) {
 			syslog(LOG_ERR, "too many sec flavors: %s", tok);
 			return (1);
 		}
 		ep->ex_secflavors[ep->ex_numsecflavors] = flavor;
 		ep->ex_numsecflavors++;
 
 		tok = (sep != NULL) ? sep + 1 : NULL;
 	} while (tok != NULL);
 
 	return (0);
 }
 
 /*
  * Parse the option string and update fields.
  * Option arguments may either be -<option>=<value> or
  * -<option> <value>
  *
  * On entry *cpp points at the '-' that starts the option field and *endcpp
  * at the character just past it.  The field may hold several options
  * separated by ','.  Results are stored through exflagsp (MNT_EX* mount
  * flags), cr (credentials for maproot/mapall), grp (network/mask),
  * has_hostp (set when a network is given), ep (index file and security
  * flavors) and the global opt_flags.
  * When the following field is consumed as the argument of the last option,
  * *cpp and *endcpp are advanced to that field.
  * Returns 0 on success and 1 on a bad option or bad option value.
  */
 static int
 do_opt(char **cpp, char **endcpp, struct exportlist *ep, struct grouplist *grp,
 	int *has_hostp, int *exflagsp, struct xucred *cr)
 {
 	char *cpoptarg, *cpoptend;
 	char *cp, *endcp, *cpopt, savedc, savedc2;
 	int allflag, usedarg;
 
 	savedc2 = '\0';
 	/* Skip the leading '-'. */
 	cpopt = *cpp;
 	cpopt++;
 	/* NUL-terminate the option field in place; savedc keeps the old byte. */
 	cp = *endcpp;
 	savedc = *cp;
 	*cp = '\0';
 	while (cpopt && *cpopt) {
 		/* allflag becomes 0 only when the option is "mapall". */
 		allflag = 1;
 		/*
 		 * usedarg: -2 = no separate argument field is available,
 		 * 0 = the next field was picked up as a candidate argument,
 		 * 1 = that candidate was consumed by the option.
 		 */
 		usedarg = -2;
 		if ((cpoptend = strchr(cpopt, ','))) {
 			*cpoptend++ = '\0';
 			if ((cpoptarg = strchr(cpopt, '=')))
 				*cpoptarg++ = '\0';
 		} else {
 			if ((cpoptarg = strchr(cpopt, '=')))
 				*cpoptarg++ = '\0';
 			else {
 				/*
 				 * Last option without '=': peek at the next
 				 * field and use it as the argument unless it
 				 * is empty or starts another option.
 				 */
 				*cp = savedc;
 				nextfield(&cp, &endcp);
 				**endcpp = '\0';
 				if (endcp > cp && *cp != '-') {
 					cpoptarg = cp;
 					savedc2 = *endcp;
 					*endcp = '\0';
 					usedarg = 0;
 				}
 			}
 		}
 		if (!strcmp(cpopt, "ro") || !strcmp(cpopt, "o")) {
 			*exflagsp |= MNT_EXRDONLY;
 		} else if (cpoptarg && (!strcmp(cpopt, "maproot") ||
 		    !(allflag = strcmp(cpopt, "mapall")) ||
 		    !strcmp(cpopt, "root") || !strcmp(cpopt, "r"))) {
 			usedarg++;
 			parsecred(cpoptarg, cr);
 			if (allflag == 0) {
 				*exflagsp |= MNT_EXPORTANON;
 				opt_flags |= OP_MAPALL;
 			} else
 				opt_flags |= OP_MAPROOT;
 		} else if (cpoptarg && (!strcmp(cpopt, "mask") ||
 		    !strcmp(cpopt, "m"))) {
 			if (get_net(cpoptarg, &grp->gr_ptr.gt_net, 1)) {
 				syslog(LOG_ERR, "bad mask: %s", cpoptarg);
 				return (1);
 			}
 			usedarg++;
 			opt_flags |= OP_MASK;
 		} else if (cpoptarg && (!strcmp(cpopt, "network") ||
 			!strcmp(cpopt, "n"))) {
 			if (strchr(cpoptarg, '/') != NULL) {
 				if (debug)
 					fprintf(stderr, "setting OP_MASKLEN\n");
 				opt_flags |= OP_MASKLEN;
 			}
 			/* A group that already has a type cannot become a net. */
 			if (grp->gr_type != GT_NULL) {
 				syslog(LOG_ERR, "network/host conflict");
 				return (1);
 			} else if (get_net(cpoptarg, &grp->gr_ptr.gt_net, 0)) {
 				syslog(LOG_ERR, "bad net: %s", cpoptarg);
 				return (1);
 			}
 			grp->gr_type = GT_NET;
 			*has_hostp = 1;
 			usedarg++;
 			opt_flags |= OP_NET;
 		} else if (!strcmp(cpopt, "alldirs")) {
 			opt_flags |= OP_ALLDIRS;
 		} else if (!strcmp(cpopt, "public")) {
 			*exflagsp |= MNT_EXPUBLIC;
 		} else if (!strcmp(cpopt, "webnfs")) {
 			*exflagsp |= (MNT_EXPUBLIC|MNT_EXRDONLY|MNT_EXPORTANON);
 			opt_flags |= OP_MAPALL;
 		} else if (cpoptarg && !strcmp(cpopt, "index")) {
 			/* usedarg is not bumped here: a separate field is not consumed. */
 			ep->ex_indexfile = strdup(cpoptarg);
 		} else if (!strcmp(cpopt, "quiet")) {
 			opt_flags |= OP_QUIET;
 		} else if (cpoptarg && !strcmp(cpopt, "sec")) {
 			if (parsesec(cpoptarg, ep))
 				return (1);
 			opt_flags |= OP_SEC;
 			usedarg++;
 		} else {
 			syslog(LOG_ERR, "bad opt %s", cpopt);
 			return (1);
 		}
 		if (usedarg >= 0) {
 			/*
 			 * A following field was examined: undo both in-place
 			 * terminators and, if it was consumed, move the
 			 * caller's cursor onto it.
 			 */
 			*endcp = savedc2;
 			**endcpp = savedc;
 			if (usedarg > 0) {
 				*cpp = cp;
 				*endcpp = endcp;
 			}
 			return (0);
 		}
 		cpopt = cpoptend;
 	}
 	**endcpp = savedc;
 	return (0);
 }
 
 /*
  * Translate a character string to the corresponding list of network
  * addresses for a hostname.
  *
  * cp is resolved with getaddrinfo() and the result is stored in grp.
  * Any address without a canonical name gets its numeric form (or "?")
  * as the name.  If one of the addresses is already present in a GT_HOST
  * group on the tgrp list, grp is marked GT_IGNORE instead of GT_HOST.
  * Returns 0 on success (including the duplicate case) and 1 when grp
  * already has a type or the name cannot be resolved.
  */
 static int
 get_host(char *cp, struct grouplist *grp, struct grouplist *tgrp)
 {
 	struct grouplist *checkgrp;
 	struct addrinfo *ai, *tai, hints;
 	int ecode;
 	char host[NI_MAXHOST];
 
 	if (grp->gr_type != GT_NULL) {
 		syslog(LOG_ERR, "Bad netgroup type for ip host %s", cp);
 		return (1);
 	}
 	memset(&hints, 0, sizeof hints);
 	hints.ai_flags = AI_CANONNAME;
 	hints.ai_protocol = IPPROTO_UDP;
 	ecode = getaddrinfo(cp, NULL, &hints, &ai);
 	if (ecode != 0) {
 		syslog(LOG_ERR,"can't get address info for host %s", cp);
 		return 1;
 	}
 	grp->gr_ptr.gt_addrinfo = ai;
 	while (ai != NULL) {
 		if (ai->ai_canonname == NULL) {
 			if (getnameinfo(ai->ai_addr, ai->ai_addrlen, host,
 			    sizeof host, NULL, 0, NI_NUMERICHOST) != 0)
 				strlcpy(host, "?", sizeof(host));
 			ai->ai_canonname = strdup(host);
 			/*
 			 * The name is later printed and sent through
 			 * xdr_string(), so a failed strdup() must not leave
 			 * NULL behind; treat it like every other allocation
 			 * failure in this file.
 			 */
 			if (ai->ai_canonname == NULL)
 				out_of_mem();
 			ai->ai_flags |= AI_CANONNAME;
 		}
 		if (debug)
 			fprintf(stderr, "got host %s\n", ai->ai_canonname);
 		/*
 		 * Sanity check: make sure we don't already have an entry
 		 * for this host in the grouplist.
 		 */
 		for (checkgrp = tgrp; checkgrp != NULL;
 		    checkgrp = checkgrp->gr_next) {
 			if (checkgrp->gr_type != GT_HOST)
 				continue;
 			for (tai = checkgrp->gr_ptr.gt_addrinfo; tai != NULL;
 			    tai = tai->ai_next) {
 				if (sacmp(tai->ai_addr, ai->ai_addr, NULL) != 0)
 					continue;
 				if (debug)
 					fprintf(stderr,
 					    "ignoring duplicate host %s\n",
 					    ai->ai_canonname);
 				grp->gr_type = GT_IGNORE;
 				return (0);
 			}
 		}
 		ai = ai->ai_next;
 	}
 	grp->gr_type = GT_HOST;
 	return (0);
 }
 
 /*
  * Free up an exports list component
  */
 static void
 free_exp(struct exportlist *ep)
 {
 
 	if (ep->ex_defdir) {
 		free_host(ep->ex_defdir->dp_hosts);
 		free((caddr_t)ep->ex_defdir);
 	}
 	if (ep->ex_fsdir)
 		free(ep->ex_fsdir);
 	if (ep->ex_indexfile)
 		free(ep->ex_indexfile);
 	free_dir(ep->ex_dirl);
 	free((caddr_t)ep);
 }
 
 /*
  * Free every element of a host list.  Only the hostlist nodes are
  * released; the groups they point to are left alone.
  */
 static void
 free_host(struct hostlist *hp)
 {
 	struct hostlist *next;
 
 	for (; hp != NULL; hp = next) {
 		next = hp->ht_next;
 		free(hp);
 	}
 }
 
 /*
  * Allocate a host list element with ht_next and ht_flag cleared; ht_grp
  * is left for the caller to fill in.
  * Does not return NULL: allocation failure goes through out_of_mem().
  */
 static struct hostlist *
 get_ht(void)
 {
 	struct hostlist *hp = malloc(sizeof(*hp));
 
 	if (hp == NULL)
 		out_of_mem();
 	hp->ht_flag = 0;
 	hp->ht_next = NULL;
 	return (hp);
 }
 
 /*
  * Fatal out-of-memory handler: logs the condition at LOG_ERR and
  * terminates the process with exit status 2.  Never returns.
  */
 static void
 out_of_mem(void)
 {
 	const int status = 2;
 
 	syslog(LOG_ERR, "out of memory");
 	exit(status);
 }
 
 /*
  * Do the nmount() syscall with the update flag to push the export info into
  * the kernel.
  *
  * ep, grp	export entry and the host/net group it is exported to
  * exflags	export flags copied into the export_args
  * anoncrp	credential copied into ex_anon
  * dirp,	exported path and its length; while a failing nmount() is
  * dirplen	retried the path is shortened in place, and the overwritten
  *		byte is put back before returning
  * fsb		statfs data of the filesystem; supplies the "fstype",
  *		"fspath" and "from" values and the flags passed to nmount()
  *
  * One request is issued per address of a GT_HOST group and a single one
  * for GT_NET and GT_DEFAULT groups; a GT_IGNORE group returns right away.
  * When v4root_phase is 2 the request goes through
  * nfssvc(NFSSVC_V4ROOTEXPORT) instead of nmount().
  *
  * Returns 0 on success, 1 on failure and 2 if the V4 root export failed.
  */
 static int
 do_mount(struct exportlist *ep, struct grouplist *grp, int exflags,
     struct xucred *anoncrp, char *dirp, int dirplen, struct statfs *fsb)
 {
 	struct statfs fsb1;
 	struct addrinfo *ai;
 	struct export_args *eap;
 	char errmsg[255];
 	char *cp;
 	int done;
 	char savedc;
 	struct iovec *iov;
 	int i, iovlen;
 	int ret;
 	struct nfsex_args nfsea;
 
 	eap = &nfsea.export;
 
 	cp = NULL;
 	savedc = '\0';
 	iov = NULL;
 	iovlen = 0;
 	ret = 0;
 
 	bzero(eap, sizeof (struct export_args));
 	bzero(errmsg, sizeof(errmsg));
 	eap->ex_flags = exflags;
 	eap->ex_anon = *anoncrp;
 	eap->ex_indexfile = ep->ex_indexfile;
 	if (grp->gr_type == GT_HOST)
 		ai = grp->gr_ptr.gt_addrinfo;
 	else
 		ai = NULL;
 	eap->ex_numsecflavors = ep->ex_numsecflavors;
 	for (i = 0; i < eap->ex_numsecflavors; i++)
 		eap->ex_secflavors[i] = ep->ex_secflavors[i];
 	/* No flavor requested: fall back to AUTH_SYS. */
 	if (eap->ex_numsecflavors == 0) {
 		eap->ex_numsecflavors = 1;
 		eap->ex_secflavors[0] = AUTH_SYS;
 	}
 	done = FALSE;
 
 	/*
 	 * The name/value pairs land in iov[] as name at even and value at
 	 * odd indices; the first three values are filled in before each
 	 * nmount() call below.
 	 */
 	if (v4root_phase == 0) {
 		build_iovec(&iov, &iovlen, "fstype", NULL, 0);
 		build_iovec(&iov, &iovlen, "fspath", NULL, 0);
 		build_iovec(&iov, &iovlen, "from", NULL, 0);
 		build_iovec(&iov, &iovlen, "update", NULL, 0);
 		build_iovec(&iov, &iovlen, "export", eap,
 		    sizeof (struct export_args));
 		build_iovec(&iov, &iovlen, "errmsg", errmsg, sizeof(errmsg));
 	}
 
 	while (!done) {
 		switch (grp->gr_type) {
 		case GT_HOST:
 			if (ai->ai_addr->sa_family == AF_INET6 && have_v6 == 0)
 				goto skip;
 			eap->ex_addr = ai->ai_addr;
 			eap->ex_addrlen = ai->ai_addrlen;
 			eap->ex_masklen = 0;
 			break;
 		case GT_NET:
 			if (grp->gr_ptr.gt_net.nt_net.ss_family == AF_INET6 &&
 			    have_v6 == 0)
 				goto skip;
 			eap->ex_addr =
 			    (struct sockaddr *)&grp->gr_ptr.gt_net.nt_net;
 			eap->ex_addrlen =
 			    ((struct sockaddr *)&grp->gr_ptr.gt_net.nt_net)->sa_len;
 			eap->ex_mask =
 			    (struct sockaddr *)&grp->gr_ptr.gt_net.nt_mask;
 			eap->ex_masklen = ((struct sockaddr *)&grp->gr_ptr.gt_net.nt_mask)->sa_len;
 			break;
 		case GT_DEFAULT:
 			eap->ex_addr = NULL;
 			eap->ex_addrlen = 0;
 			eap->ex_mask = NULL;
 			eap->ex_masklen = 0;
 			break;
 		case GT_IGNORE:
 			ret = 0;
 			goto error_exit;
 			break;
 		default:
 			syslog(LOG_ERR, "bad grouptype");
 			if (cp)
 				*cp = savedc;
 			ret = 1;
 			goto error_exit;
 		}
 
 		/*
 		 * For V4:, use the nfssvc() syscall, instead of mount().
 		 */
 		if (v4root_phase == 2) {
 			nfsea.fspec = v4root_dirpath;
 			if (nfssvc(NFSSVC_V4ROOTEXPORT, (caddr_t)&nfsea) < 0) {
 				syslog(LOG_ERR, "Exporting V4: failed");
 				/* Leave through the common exit path. */
 				ret = 2;
 				goto error_exit;
 			}
 		} else {
 			/*
 			 * XXX:
 			 * Maybe I should just use the fsb->f_mntonname path
 			 * instead of looping back up the dirp to the mount
 			 * point??
 			 * Also, needs to know how to export all types of local
 			 * exportable filesystems and not just "ufs".
 			 */
 			iov[1].iov_base = fsb->f_fstypename; /* "fstype" */
 			iov[1].iov_len = strlen(fsb->f_fstypename) + 1;
 			iov[3].iov_base = fsb->f_mntonname; /* "fspath" */
 			iov[3].iov_len = strlen(fsb->f_mntonname) + 1;
 			iov[5].iov_base = fsb->f_mntfromname; /* "from" */
 			iov[5].iov_len = strlen(fsb->f_mntfromname) + 1;
 			errmsg[0] = '\0';
 
 			while (nmount(iov, iovlen, fsb->f_flags) < 0) {
 				/*
 				 * Undo the previous truncation of dirp, or
 				 * start at its last character on the first
 				 * failure.
 				 */
 				if (cp)
 					*cp-- = savedc;
 				else
 					cp = dirp + dirplen - 1;
 				if (opt_flags & OP_QUIET) {
 					ret = 1;
 					goto error_exit;
 				}
 				if (errno == EPERM) {
 					if (debug)
 						warnx("can't change attributes for %s: %s",
 						    dirp, errmsg);
 					syslog(LOG_ERR,
 					   "can't change attributes for %s: %s",
 					    dirp, errmsg);
 					ret = 1;
 					goto error_exit;
 				}
 				if (opt_flags & OP_ALLDIRS) {
 					if (errno == EINVAL)
 						syslog(LOG_ERR,
 		"-alldirs requested but %s is not a filesystem mountpoint",
 						    dirp);
 					else
 						syslog(LOG_ERR,
 						    "could not remount %s: %m",
 						    dirp);
 					ret = 1;
 					goto error_exit;
 				}
 				/*
 				 * back up over the last component; the bound
 				 * is tested first so that neither *cp nor
 				 * cp[-1] is read outside of dirp
 				 */
 				while (cp > dirp && *cp == '/')
 					cp--;
 				while (cp > dirp && *(cp - 1) != '/')
 					cp--;
 				if (cp == dirp) {
 					if (debug)
 						warnx("mnt unsucc");
 					syslog(LOG_ERR, "can't export %s %s",
 					    dirp, errmsg);
 					ret = 1;
 					goto error_exit;
 				}
 				savedc = *cp;
 				*cp = '\0';
 				/*
 				 * Check that we're still on the same
 				 * filesystem.
 				 */
 				if (statfs(dirp, &fsb1) != 0 ||
 				    bcmp(&fsb1.f_fsid, &fsb->f_fsid,
 				    sizeof (fsb1.f_fsid)) != 0) {
 					*cp = savedc;
 					syslog(LOG_ERR,
 					    "can't export %s %s", dirp,
 					    errmsg);
 					ret = 1;
 					goto error_exit;
 				}
 			}
 		}
 
 		/*
 		 * For the experimental server:
 		 * If this is the public directory, get the file handle
 		 * and load it into the kernel via the nfssvc() syscall.
 		 */
 		if ((exflags & MNT_EXPUBLIC) != 0) {
 			fhandle_t fh;
 			char *public_name;
 
 			if (eap->ex_indexfile != NULL)
 				public_name = eap->ex_indexfile;
 			else
 				public_name = dirp;
 			if (getfh(public_name, &fh) < 0)
 				syslog(LOG_ERR,
 				    "Can't get public fh for %s", public_name);
 			else if (nfssvc(NFSSVC_PUBLICFH, (caddr_t)&fh) < 0)
 				syslog(LOG_ERR,
 				    "Can't set public fh for %s", public_name);
 			else
 				has_publicfh = 1;
 		}
 skip:
 		/* Only GT_HOST groups have further addresses to walk. */
 		if (ai != NULL)
 			ai = ai->ai_next;
 		if (ai == NULL)
 			done = TRUE;
 	}
 	if (cp)
 		*cp = savedc;
 error_exit:
 	/* free strings allocated by strdup() in getmntopts.c */
 	if (iov != NULL) {
 		free(iov[0].iov_base); /* fstype */
 		free(iov[2].iov_base); /* fspath */
 		free(iov[4].iov_base); /* from */
 		free(iov[6].iov_base); /* update */
 		free(iov[8].iov_base); /* export */
 		free(iov[10].iov_base); /* errmsg */
 
 		/* free iov, allocated by realloc() */
 		free(iov);
 	}
 	return (ret);
 }
 
 /*
  * Translate a net address.
  *
  * If `maskflg' is nonzero, then `cp' is a netmask, not a network address.
  *
  * A network address is stored in net->nt_net together with a strdup()ed
  * name in net->nt_name; a netmask is stored in net->nt_mask.  Whenever a
  * mask has been stored OP_HAVEMASK is set in opt_flags.  With OP_MASKLEN
  * set, a network address is expected in "<net>/<masklen>" form; the string
  * is split at the '/' while it is parsed and is intact again on return.
  *
  * Returns 0 on success and 1 on failure.
  */
 static int
 get_net(char *cp, struct netmsk *net, int maskflg)
 {
 	struct netent *np = NULL;
 	char *name, *p, *prefp;
 	struct sockaddr_in sin;
 	struct sockaddr *sa = NULL;
 	struct addrinfo hints, *ai = NULL;
 	char netname[NI_MAXHOST];
 	long preflen;
 
 	p = prefp = NULL;
 	if ((opt_flags & OP_MASKLEN) && !maskflg) {
 		p = strchr(cp, '/');
 		/* OP_MASKLEN without a '/' in the string: nothing to split. */
 		if (p == NULL)
 			goto fail;
 		*p = '\0';
 		prefp = p + 1;
 	}
 
 	/*
 	 * Check for a numeric address first. We wish to avoid
 	 * possible DNS lookups in getnetbyname().
 	 */
 	if (isxdigit((unsigned char)*cp) || *cp == ':') {
 		memset(&hints, 0, sizeof hints);
 		/* Ensure the mask and the network have the same family. */
 		if (maskflg && (opt_flags & OP_NET))
 			hints.ai_family = net->nt_net.ss_family;
 		else if (!maskflg && (opt_flags & OP_HAVEMASK))
 			hints.ai_family = net->nt_mask.ss_family;
 		else
 			hints.ai_family = AF_UNSPEC;
 		hints.ai_flags = AI_NUMERICHOST;
 		if (getaddrinfo(cp, NULL, &hints, &ai) == 0)
 			sa = ai->ai_addr;
 		if (sa != NULL && ai->ai_family == AF_INET) {
 			/*
 			 * The address in `cp' is really a network address, so
 			 * use inet_network() to re-interpret this correctly.
 			 * e.g. "127.1" means 127.1.0.0, not 127.0.0.1.
 			 */
 			bzero(&sin, sizeof sin);
 			sin.sin_family = AF_INET;
 			sin.sin_len = sizeof sin;
 			sin.sin_addr = inet_makeaddr(inet_network(cp), 0);
 			if (debug)
 				fprintf(stderr, "get_net: v4 addr %s\n",
 				    inet_ntoa(sin.sin_addr));
 			sa = (struct sockaddr *)&sin;
 		}
 	}
 	/* Not numeric (or not parseable): try it as a network name. */
 	if (sa == NULL && (np = getnetbyname(cp)) != NULL) {
 		bzero(&sin, sizeof sin);
 		sin.sin_family = AF_INET;
 		sin.sin_len = sizeof sin;
 		sin.sin_addr = inet_makeaddr(np->n_net, 0);
 		sa = (struct sockaddr *)&sin;
 	}
 	if (sa == NULL)
 		goto fail;
 
 	if (maskflg) {
 		/* The specified sockaddr is a mask. */
 		if (checkmask(sa) != 0)
 			goto fail;
 		bcopy(sa, &net->nt_mask, sa->sa_len);
 		opt_flags |= OP_HAVEMASK;
 	} else {
 		/* The specified sockaddr is a network address. */
 		bcopy(sa, &net->nt_net, sa->sa_len);
 
 		/* Get a network name for the export list. */
 		if (np) {
 			name = np->n_name;
 		} else if (getnameinfo(sa, sa->sa_len, netname, sizeof netname,
 		   NULL, 0, NI_NUMERICHOST) == 0) {
 			name = netname;
 		} else {
 			goto fail;
 		}
 		if ((net->nt_name = strdup(name)) == NULL)
 			out_of_mem();
 
 		/*
 		 * Extract a mask from either a "/<masklen>" suffix, or
 		 * from the class of an IPv4 address.
 		 */
 		if (opt_flags & OP_MASKLEN) {
 			preflen = strtol(prefp, NULL, 10);
 			if (preflen < 0L || preflen == LONG_MAX)
 				goto fail;
 			/* Copy the family/length fields, then fill in the bits. */
 			bcopy(sa, &net->nt_mask, sa->sa_len);
 			if (makemask(&net->nt_mask, (int)preflen) != 0)
 				goto fail;
 			opt_flags |= OP_HAVEMASK;
 			*p = '/';
 		} else if (sa->sa_family == AF_INET &&
 		    (opt_flags & OP_MASK) == 0) {
 			in_addr_t addr;
 
 			addr = ((struct sockaddr_in *)sa)->sin_addr.s_addr;
 			if (IN_CLASSA(addr))
 				preflen = 8;
 			else if (IN_CLASSB(addr))
 				preflen = 16;
 			else if (IN_CLASSC(addr))
 				preflen = 24;
 			else if (IN_CLASSD(addr))
 				preflen = 28;
 			else
 				preflen = 32;	/* XXX */
 
 			bcopy(sa, &net->nt_mask, sa->sa_len);
 			makemask(&net->nt_mask, (int)preflen);
 			opt_flags |= OP_HAVEMASK;
 		}
 	}
 
 	if (ai)
 		freeaddrinfo(ai);
 	return 0;
 
 fail:
 	/* Hand the caller's string back unmodified on failure as well. */
 	if (p != NULL)
 		*p = '/';
 	if (ai)
 		freeaddrinfo(ai);
 	return 1;
 }
 
 /*
  * Parse out the next white space separated field.
  *
  * On entry *cp points into the line.  On return *cp is the first character
  * of the next field and *endcp the character just past its end.  If only
  * blanks remain before the newline or the terminating NUL, both point at
  * that terminator.
  */
 static void
 nextfield(char **cp, char **endcp)
 {
 	char *first, *last;
 
 	/* Step over leading spaces and tabs. */
 	first = *cp + strspn(*cp, " \t");
 	/* The field runs up to the next blank, newline or end of string. */
 	last = first + strcspn(first, " \t\n");
 
 	*cp = first;
 	*endcp = last;
 }
 
 /*
  * Get an exports file line. Skip over blank lines and handle line
  * continuations.
  *
  * The assembled, NUL-terminated text is left in the global `line' buffer,
  * which is grown (and `linesize' updated) as needed.  Returns 1 when a
  * line is available and 0 once fgetln() has nothing more to deliver.
  */
 static int
 get_line(void)
 {
 	char *chunk, *tail;
 	size_t len;
 	int totlen, cont_line;
 
 	totlen = 0;
 	for (;;) {
 		chunk = fgetln(exp_file, &len);
 		if (chunk == NULL)
 			return (0);
 
 		/*
 		 * Trim trailing blanks, newlines and backslashes; any
 		 * backslash among them marks a continuation line.
 		 */
 		cont_line = 0;
 		for (tail = chunk + len - 1; tail >= chunk; tail--, len--) {
 			if (*tail == '\\')
 				cont_line = 1;
 			else if (*tail != ' ' && *tail != '\t' && *tail != '\n')
 				break;
 		}
 		/* Keep one blank between this piece and its continuation. */
 		if (cont_line) {
 			*++tail = ' ';
 			len++;
 		}
 
 		/* Make room for what we have so far plus the terminator. */
 		if (linesize < len + totlen + 1) {
 			linesize = len + totlen + 1;
 			line = realloc(line, linesize);
 			if (line == NULL)
 				out_of_mem();
 		}
 		memcpy(line + totlen, chunk, len);
 		totlen += len;
 		line[totlen] = '\0';
 
 		/* Done once something was collected and nothing follows. */
 		if (totlen != 0 && !cont_line)
 			break;
 	}
 	return (1);
 }
 
 /*
  * Parse a description of a credential.
  *
  * `namelist' holds either a single user (name or numeric uid), in which
  * case the uid and group list are looked up in the password and group
  * databases, or an explicit colon separated "uid:gid:gid:..." list.
  * `cr' is first set up as the unprivileged credential (uid and gid -2)
  * and keeps those values if the user cannot be determined.
  */
 static void
 parsecred(char *namelist, struct xucred *cr)
 {
 	char *name;
 	int cnt;
 	char *names;
 	struct passwd *pw;
 	struct group *gr;
 	gid_t groups[XU_NGROUPS + 1];
 	int ngroups;
 
 	cr->cr_version = XUCRED_VERSION;
 	/*
 	 * Set up the unprivileged user.
 	 */
 	cr->cr_uid = -2;
 	cr->cr_groups[0] = -2;
 	cr->cr_ngroups = 1;
 	/*
 	 * Get the user's password table entry.
 	 */
 	names = strsep_quote(&namelist, " \t\n");
 	name = strsep(&names, ":");
 	/* strsep() hands back NULL when there was nothing to split. */
 	if (name == NULL) {
 		syslog(LOG_ERR, "missing user in credential");
 		return;
 	}
 	if (isdigit((unsigned char)*name) || *name == '-')
 		pw = getpwuid(atoi(name));
 	else
 		pw = getpwnam(name);
 	/*
 	 * Credentials specified as those of a user.
 	 */
 	if (names == NULL) {
 		if (pw == NULL) {
 			syslog(LOG_ERR, "unknown user: %s", name);
 			return;
 		}
 		cr->cr_uid = pw->pw_uid;
 		ngroups = XU_NGROUPS + 1;
 		if (getgrouplist(pw->pw_name, pw->pw_gid, groups, &ngroups)) {
 			syslog(LOG_ERR, "too many groups");
 			/*
 			 * On overflow ngroups reports how many groups were
 			 * found, not how many were stored; clamp it so the
 			 * copy below stays inside groups[] and cr_groups[].
 			 */
 			ngroups = XU_NGROUPS + 1;
 		}
 		/*
 		 * Compress out duplicate.
 		 * NOTE(review): this assumes groups[1] repeats groups[0] --
 		 * confirm against getgrouplist() on the target system.
 		 */
 		cr->cr_ngroups = ngroups - 1;
 		cr->cr_groups[0] = groups[0];
 		for (cnt = 2; cnt < ngroups; cnt++)
 			cr->cr_groups[cnt - 1] = groups[cnt];
 		return;
 	}
 	/*
 	 * Explicit credential specified as a colon separated list:
 	 *	uid:gid:gid:...
 	 */
 	if (pw != NULL)
 		cr->cr_uid = pw->pw_uid;
 	else if (isdigit((unsigned char)*name) || *name == '-')
 		cr->cr_uid = atoi(name);
 	else {
 		syslog(LOG_ERR, "unknown user: %s", name);
 		return;
 	}
 	cr->cr_ngroups = 0;
 	while (names != NULL && *names != '\0' && cr->cr_ngroups < XU_NGROUPS) {
 		name = strsep(&names, ":");
 		if (isdigit((unsigned char)*name) || *name == '-') {
 			cr->cr_groups[cr->cr_ngroups++] = atoi(name);
 		} else {
 			/* Unknown group names are reported and skipped. */
 			if ((gr = getgrnam(name)) == NULL) {
 				syslog(LOG_ERR, "unknown group: %s", name);
 				continue;
 			}
 			cr->cr_groups[cr->cr_ngroups++] = gr->gr_gid;
 		}
 	}
 	if (names != NULL && *names != '\0' && cr->cr_ngroups == XU_NGROUPS)
 		syslog(LOG_ERR, "too many groups");
 }
 
 #define	STRSIZ	(MNTNAMLEN+MNTPATHLEN+50)
 /*
  * Routines that maintain the remote mounttab
  */
 
 /*
  * Load _PATH_RMOUNTLIST into the in-memory list headed by mlhead.  Each
  * line is expected to hold a host and a directory separated by blanks;
  * lines without both fields are ignored.  A missing file is not an error.
  */
 static void
 get_mountlist(void)
 {
 	struct mountlist *entry, **tailp;
 	char *hostname, *dirname, *rest;
 	char linebuf[STRSIZ];
 	FILE *fp;
 
 	fp = fopen(_PATH_RMOUNTLIST, "r");
 	if (fp == NULL) {
 		/* Only complain when the file exists but cannot be read. */
 		if (errno != ENOENT)
 			syslog(LOG_ERR, "can't open %s", _PATH_RMOUNTLIST);
 		return;
 	}
 
 	/* tailp always addresses the link the next entry is hooked into. */
 	for (tailp = &mlhead; fgets(linebuf, STRSIZ, fp) != NULL; ) {
 		rest = linebuf;
 		hostname = strsep(&rest, " \t\n");
 		dirname = strsep(&rest, " \t\n");
 		if (hostname == NULL || dirname == NULL)
 			continue;
 
 		entry = (struct mountlist *)malloc(sizeof (*entry));
 		if (entry == NULL)
 			out_of_mem();
 		entry->ml_next = NULL;
 		strncpy(entry->ml_host, hostname, MNTNAMLEN);
 		entry->ml_host[MNTNAMLEN] = '\0';
 		strncpy(entry->ml_dirp, dirname, MNTPATHLEN);
 		entry->ml_dirp[MNTPATHLEN] = '\0';
 
 		*tailp = entry;
 		tailp = &entry->ml_next;
 	}
 	fclose(fp);
 }
 
 static void
 del_mlist(char *hostp, char *dirp)
 {
 	struct mountlist *mlp, **mlpp;
 	struct mountlist *mlp2;
 	FILE *mlfile;
 	int fnd = 0;
 
 	mlpp = &mlhead;
 	mlp = mlhead;
 	while (mlp) {
 		if (!strcmp(mlp->ml_host, hostp) &&
 		    (!dirp || !strcmp(mlp->ml_dirp, dirp))) {
 			fnd = 1;
 			mlp2 = mlp;
 			*mlpp = mlp = mlp->ml_next;
 			free((caddr_t)mlp2);
 		} else {
 			mlpp = &mlp->ml_next;
 			mlp = mlp->ml_next;
 		}
 	}
 	if (fnd) {
 		if ((mlfile = fopen(_PATH_RMOUNTLIST, "w")) == NULL) {
 			syslog(LOG_ERR,"can't update %s", _PATH_RMOUNTLIST);
 			return;
 		}
 		mlp = mlhead;
 		while (mlp) {
 			fprintf(mlfile, "%s %s\n", mlp->ml_host, mlp->ml_dirp);
 			mlp = mlp->ml_next;
 		}
 		fclose(mlfile);
 	}
 }
 
 /*
  * Record that host `hostp' has mounted `dirp': unless the pair is already
  * on the list headed by mlhead, append it there and to _PATH_RMOUNTLIST.
  */
 static void
 add_mlist(char *hostp, char *dirp)
 {
 	struct mountlist *entry, **linkp;
 	FILE *fp;
 
 	/* Walk to the end of the list, bailing out on an exact duplicate. */
 	for (linkp = &mlhead; *linkp != NULL; linkp = &(*linkp)->ml_next) {
 		if (strcmp((*linkp)->ml_host, hostp) == 0 &&
 		    strcmp((*linkp)->ml_dirp, dirp) == 0)
 			return;
 	}
 
 	entry = (struct mountlist *)malloc(sizeof (*entry));
 	if (entry == NULL)
 		out_of_mem();
 	entry->ml_next = NULL;
 	strncpy(entry->ml_host, hostp, MNTNAMLEN);
 	entry->ml_host[MNTNAMLEN] = '\0';
 	strncpy(entry->ml_dirp, dirp, MNTPATHLEN);
 	entry->ml_dirp[MNTPATHLEN] = '\0';
 	*linkp = entry;
 
 	fp = fopen(_PATH_RMOUNTLIST, "a");
 	if (fp == NULL) {
 		syslog(LOG_ERR, "can't update %s", _PATH_RMOUNTLIST);
 		return;
 	}
 	fprintf(fp, "%s %s\n", entry->ml_host, entry->ml_dirp);
 	fclose(fp);
 }
 
 /*
  * Free up a group list.
  *
  * Releases the addrinfo chain of a GT_HOST entry or the network name of
  * a GT_NET entry, then the entry itself.
  */
 static void
 free_grp(struct grouplist *grp)
 {
 	switch (grp->gr_type) {
 	case GT_HOST:
 		if (grp->gr_ptr.gt_addrinfo != NULL)
 			freeaddrinfo(grp->gr_ptr.gt_addrinfo);
 		break;
 	case GT_NET:
 		/* free() accepts NULL, so no check is needed here. */
 		free(grp->gr_ptr.gt_net.nt_name);
 		break;
 	default:
 		/* Nothing else hangs off the other group types. */
 		break;
 	}
 	free((caddr_t)grp);
 }
 
 #ifdef DEBUG
 /*
  * Logging helper for DEBUG builds: formats the message onto stderr.
  * `pri' is accepted but not used.
  */
 static void
 SYSLOG(int pri, const char *fmt, ...)
 {
 	va_list args;
 
 	(void)pri;
 	va_start(args, fmt);
 	(void)vfprintf(stderr, fmt, args);
 	va_end(args);
 }
 #endif /* DEBUG */
 
 /*
  * Check options for consistency.
  *
  * Looks at the option bits collected in opt_flags.  Returns 0 if they are
  * acceptable and 1 otherwise; every rejection except the missing
  * directory list is reported through syslog().
  */
 static int
 check_options(struct dirlist *dp)
 {
 
 	/* Outside of V4: handling there must be a directory list. */
 	if (dp == NULL && v4root_phase == 0)
 		return (1);
 
 	if ((opt_flags & OP_MAPROOT) != 0 && (opt_flags & OP_MAPALL) != 0) {
 		syslog(LOG_ERR, "-mapall and -maproot mutually exclusive");
 		return (1);
 	}
 
 	if ((opt_flags & OP_MASK) != 0 && (opt_flags & OP_NET) == 0) {
 		syslog(LOG_ERR, "-mask requires -network");
 		return (1);
 	}
 
 	if ((opt_flags & OP_NET) != 0 && (opt_flags & OP_HAVEMASK) == 0) {
 		syslog(LOG_ERR, "-network requires mask specification");
 		return (1);
 	}
 
 	if ((opt_flags & OP_MASK) != 0 && (opt_flags & OP_MASKLEN) != 0) {
 		syslog(LOG_ERR, "-mask and /masklen are mutually exclusive");
 		return (1);
 	}
 
 	/* A V4: line takes nothing beyond security and network options. */
 	if (v4root_phase > 0 &&
 	    (opt_flags &
 	    ~(OP_SEC | OP_MASK | OP_NET | OP_HAVEMASK | OP_MASKLEN)) != 0) {
 		syslog(LOG_ERR,"only -sec,-net,-mask options allowed on V4:");
 		return (1);
 	}
 
 	/*
 	 * This test has to stay last: the ones above are what keep dp
 	 * from being NULL whenever OP_ALLDIRS is set.
 	 */
 	if ((opt_flags & OP_ALLDIRS) != 0 && dp->dp_left) {
 		syslog(LOG_ERR, "-alldirs has multiple directories");
 		return (1);
 	}
 	return (0);
 }
 
 /*
  * Check an absolute directory path for any symbolic links. Return true
  * (1) only if lstat() succeeds on every leading component and on the
  * full path and each of them is a directory; return 0 otherwise.
  * `dirp' is modified while it is examined but is intact on return.
  */
 static int
 check_dirpath(char *dirp)
 {
 	struct stat st;
 	char *sep;
 	int ok;
 
 	/* Test each prefix that ends right before a '/' separator. */
 	ok = 1;
 	for (sep = dirp + 1; ok && *sep != '\0'; sep++) {
 		if (*sep != '/')
 			continue;
 		*sep = '\0';
 		ok = (lstat(dirp, &st) >= 0 && S_ISDIR(st.st_mode));
 		*sep = '/';
 	}
 
 	/* Finally the complete path itself. */
 	if (lstat(dirp, &st) < 0 || !S_ISDIR(st.st_mode))
 		ok = 0;
 	return (ok);
 }
 
 /*
  * Make a netmask according to the specified prefix length. The ss_family
  * and other non-address fields must be initialised before calling this.
  *
  * The raw address bytes of `ssp' are overwritten with `bitlen' leading
  * one bits followed by zero bits.  Returns 0 on success and -1 if the
  * address family is not supported or `bitlen' is negative or larger
  * than the address.
  */
 static int
 makemask(struct sockaddr_storage *ssp, int bitlen)
 {
 	u_char *p;
 	int bits, i, len;
 
 	if ((p = sa_rawaddr((struct sockaddr *)ssp, &len)) == NULL)
 		return (-1);
 	/* A negative length would turn into an out-of-range shift below. */
 	if (bitlen < 0 || bitlen > len * CHAR_BIT)
 		return (-1);
 
 	for (i = 0; i < len; i++) {
 		/* Number of one bits that go into this byte (0..CHAR_BIT). */
 		bits = MIN(CHAR_BIT, bitlen);
 		*p++ = (u_char)~0 << (CHAR_BIT - bits);
 		bitlen -= bits;
 	}
 	return 0;
 }
 
 /*
  * Check that the sockaddr is a valid netmask. Returns 0 if the mask
  * is acceptable (i.e. of the form 1...10....0).
  */
 static int
 checkmask(struct sockaddr *sa)
 {
 	u_char *mask;
 	int i, len;
 
 	if ((mask = sa_rawaddr(sa, &len)) == NULL)
 		return (-1);
 
 	for (i = 0; i < len; i++)
 		if (mask[i] != 0xff)
 			break;
 	if (i < len) {
 		if (~mask[i] & (u_char)(~mask[i] + 1))
 			return (-1);
 		i++;
 	}
 	for (; i < len; i++)
 		if (mask[i] != 0)
 			return (-1);
 	return (0);
 }
 
 /*
  * Compare two sockaddrs according to a specified mask. Return zero if
  * `sa1' matches `sa2' when filtered by the netmask in `samask'.
  * If samask is NULL, perform a full comparison.
  *
  * Addresses of different or unsupported families never match, nor do
  * IPv6 addresses whose scope ids differ.
  */
 static int
 sacmp(struct sockaddr *sa1, struct sockaddr *sa2, struct sockaddr *samask)
 {
 	unsigned char *addr1, *addr2, *maskbytes;
 	int nbytes, idx;
 
 	if (sa1->sa_family != sa2->sa_family)
 		return (1);
 	addr1 = sa_rawaddr(sa1, &nbytes);
 	if (addr1 == NULL)
 		return (1);
 	addr2 = sa_rawaddr(sa2, NULL);
 	if (addr2 == NULL)
 		return (1);
 
 	if (sa1->sa_family == AF_INET6 &&
 	    ((struct sockaddr_in6 *)sa1)->sin6_scope_id !=
 	    ((struct sockaddr_in6 *)sa2)->sin6_scope_id)
 		return (1);
 
 	/* Simple binary comparison if no mask specified. */
 	if (samask == NULL)
 		return (memcmp(addr1, addr2, nbytes));
 
 	/* Set up the mask, and do a mask-based comparison. */
 	if (sa1->sa_family != samask->sa_family)
 		return (1);
 	maskbytes = sa_rawaddr(samask, NULL);
 	if (maskbytes == NULL)
 		return (1);
 
 	/* Any differing bit that the mask lets through is a mismatch. */
 	for (idx = 0; idx < nbytes; idx++) {
 		if (((addr1[idx] ^ addr2[idx]) & maskbytes[idx]) != 0)
 			return (1);
 	}
 	return (0);
 }
 
 /*
  * Return a pointer to the part of the sockaddr that contains the
  * raw address, and set *nbytes to its length in bytes. Returns
  * NULL if the address family is unknown; *nbytes is then set to 0.
  * `nbytes' may be NULL if the length is not wanted.
  */
 static void *
 sa_rawaddr(struct sockaddr *sa, int *nbytes)
 {
 	struct sockaddr_in *sin;
 	struct sockaddr_in6 *sin6;
 	void *addr;
 	int addrlen;
 
 	/* Start from the "unknown family" answer. */
 	addr = NULL;
 	addrlen = 0;
 
 	if (sa->sa_family == AF_INET) {
 		sin = (struct sockaddr_in *)sa;
 		addr = &sin->sin_addr;
 		addrlen = sizeof(sin->sin_addr);
 	} else if (sa->sa_family == AF_INET6) {
 		sin6 = (struct sockaddr_in6 *)sa;
 		addr = &sin6->sin6_addr;
 		addrlen = sizeof(sin6->sin6_addr);
 	}
 
 	if (nbytes != NULL)
 		*nbytes = addrlen;
 	return (addr);
 }
 
 /*
  * Signal handler that only records the event by setting got_sighup; the
  * flag is acted upon elsewhere.  Presumably installed for SIGHUP, going
  * by the name -- the registration is not part of this section.
  */
 static void
 huphandler(int sig __unused)
 {
 
 	got_sighup = 1;
 }
 
 /*
  * Shut the daemon down: remove the pid file tracked by pfh, call
  * rpcb_unset() for MOUNTPROG versions MOUNTVERS and MOUNTVERS3, and
  * exit with status 0.
  *
  * NOTE(review): the signature suggests this is installed as a signal
  * handler; if so, the calls made here are not async-signal-safe --
  * confirm against where it is registered.
  */
 static void
 terminate(int sig __unused)
 {
 	pidfile_remove(pfh);
 	rpcb_unset(MOUNTPROG, MOUNTVERS, NULL);
 	rpcb_unset(MOUNTPROG, MOUNTVERS3, NULL);
 	exit (0);
 }
Index: user/alc/PQ_LAUNDRY
===================================================================
--- user/alc/PQ_LAUNDRY	(revision 307895)
+++ user/alc/PQ_LAUNDRY	(revision 307896)

Property changes on: user/alc/PQ_LAUNDRY
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r307868-307894