Index: projects/building-blocks/sbin/ipfw/nat.c =================================================================== --- projects/building-blocks/sbin/ipfw/nat.c (revision 277723) +++ projects/building-blocks/sbin/ipfw/nat.c (revision 277724) @@ -1,1115 +1,1115 @@ /* * Copyright (c) 2002-2003 Luigi Rizzo * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp * Copyright (c) 1994 Ugen J.S.Antsilevich * * Idea and grammar partially left from: * Copyright (c) 1993 Daniel Boulet * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. * * Redistribution in binary form may occur without any restrictions. * Obviously, it would be nice if you gave credit where credit is due * but requiring it would be too onerous. * * This software is provided ``AS IS'' without any warranties of any kind. * * NEW command line interface for IP firewall facility * * $FreeBSD$ * * In-kernel nat support */ #include #include #include #include "ipfw2.h" #include #include #include #include #include #include #include #include #include #include #include /* def. of struct route */ #include #include #include #include typedef int (nat_cb_t)(struct nat44_cfg_nat *cfg, void *arg); static void nat_show_cfg(struct nat44_cfg_nat *n, void *arg); static void nat_show_log(struct nat44_cfg_nat *n, void *arg); static int nat_show_data(struct nat44_cfg_nat *cfg, void *arg); static int natname_cmp(const void *a, const void *b); static int nat_foreach(nat_cb_t *f, void *arg, int sort); static int nat_get_cmd(char *name, uint16_t cmd, ipfw_obj_header **ooh); static struct _s_x nat_params[] = { { "ip", TOK_IP }, { "if", TOK_IF }, { "log", TOK_ALOG }, { "deny_in", TOK_DENY_INC }, { "same_ports", TOK_SAME_PORTS }, { "unreg_only", TOK_UNREG_ONLY }, { "skip_global", TOK_SKIP_GLOBAL }, { "reset", TOK_RESET_ADDR }, { "reverse", TOK_ALIAS_REV }, { "proxy_only", TOK_PROXY_ONLY }, { "redirect_addr", TOK_REDIR_ADDR }, { "redirect_port", TOK_REDIR_PORT }, { "redirect_proto", TOK_REDIR_PROTO }, { NULL, 0 } /* terminator */ }; /* * Search for interface with name "ifn", and fill n accordingly: * * n->ip ip address of interface "ifn" * n->if_name copy of interface name "ifn" */ static void set_addr_dynamic(const char *ifn, struct nat44_cfg_nat *n) { size_t needed; int mib[6]; char *buf, *lim, *next; struct if_msghdr *ifm; struct ifa_msghdr *ifam; struct sockaddr_dl *sdl; struct sockaddr_in *sin; int ifIndex, ifMTU; mib[0] = CTL_NET; mib[1] = PF_ROUTE; mib[2] = 0; mib[3] = AF_INET; mib[4] = NET_RT_IFLIST; mib[5] = 0; /* * Get interface data. */ if (sysctl(mib, 6, NULL, &needed, NULL, 0) == -1) err(1, "iflist-sysctl-estimate"); buf = safe_calloc(1, needed); if (sysctl(mib, 6, buf, &needed, NULL, 0) == -1) err(1, "iflist-sysctl-get"); lim = buf + needed; /* * Loop through interfaces until one with * given name is found. This is done to * find correct interface index for routing * message processing. */ ifIndex = 0; next = buf; while (next < lim) { ifm = (struct if_msghdr *)next; next += ifm->ifm_msglen; if (ifm->ifm_version != RTM_VERSION) { if (co.verbose) warnx("routing message version %d " "not understood", ifm->ifm_version); continue; } if (ifm->ifm_type == RTM_IFINFO) { sdl = (struct sockaddr_dl *)(ifm + 1); if (strlen(ifn) == sdl->sdl_nlen && strncmp(ifn, sdl->sdl_data, sdl->sdl_nlen) == 0) { ifIndex = ifm->ifm_index; ifMTU = ifm->ifm_data.ifi_mtu; break; } } } if (!ifIndex) errx(1, "unknown interface name %s", ifn); /* * Get interface address. 
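This is not what set_addr_dynamic() does (it walks the NET_RT_IFLIST routing sysctl shown above so it can also learn the interface index and MTU), but a minimal userland sketch of the same address lookup, including the INADDR_ANY fallback this revision introduces when the interface has no IPv4 address yet. The interface name "em0" is only an example.

#include <sys/types.h>
#include <sys/socket.h>
#include <ifaddrs.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>
#include <stdio.h>

/* Return the first IPv4 address bound to "ifn", or INADDR_ANY if none. */
static struct in_addr
first_inet_addr(const char *ifn)
{
    struct ifaddrs *ifap, *ifa;
    struct in_addr ip;

    ip.s_addr = htonl(INADDR_ANY);
    if (getifaddrs(&ifap) != 0)
        return (ip);
    for (ifa = ifap; ifa != NULL; ifa = ifa->ifa_next) {
        if (ifa->ifa_addr == NULL ||
            ifa->ifa_addr->sa_family != AF_INET ||
            strcmp(ifa->ifa_name, ifn) != 0)
            continue;
        ip = ((struct sockaddr_in *)ifa->ifa_addr)->sin_addr;
        break;
    }
    freeifaddrs(ifap);
    return (ip);
}

int
main(void)
{
    struct in_addr a = first_inet_addr("em0");  /* example interface name */

    printf("%s\n", inet_ntoa(a));
    return (0);
}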
*/ sin = NULL; while (next < lim) { ifam = (struct ifa_msghdr *)next; next += ifam->ifam_msglen; if (ifam->ifam_version != RTM_VERSION) { if (co.verbose) warnx("routing message version %d " "not understood", ifam->ifam_version); continue; } if (ifam->ifam_type != RTM_NEWADDR) break; if (ifam->ifam_addrs & RTA_IFA) { int i; char *cp = (char *)(ifam + 1); for (i = 1; i < RTA_IFA; i <<= 1) { if (ifam->ifam_addrs & i) cp += SA_SIZE((struct sockaddr *)cp); } if (((struct sockaddr *)cp)->sa_family == AF_INET) { sin = (struct sockaddr_in *)cp; break; } } } if (sin == NULL) - errx(1, "%s: cannot get interface address", ifn); - - n->ip = sin->sin_addr; + n->ip.s_addr = htonl(INADDR_ANY); + else + n->ip = sin->sin_addr; strncpy(n->if_name, ifn, IF_NAMESIZE); free(buf); } /* * XXX - The following functions, macros and definitions come from natd.c: * it would be better to move them outside natd.c, in a file * (redirect_support.[ch]?) shared by ipfw and natd, but for now i can live * with it. */ /* * Definition of a port range, and macros to deal with values. * FORMAT: HI 16-bits == first port in range, 0 == all ports. * LO 16-bits == number of ports in range * NOTES: - Port values are not stored in network byte order. */ #define port_range u_long #define GETLOPORT(x) ((x) >> 0x10) #define GETNUMPORTS(x) ((x) & 0x0000ffff) #define GETHIPORT(x) (GETLOPORT((x)) + GETNUMPORTS((x))) /* Set y to be the low-port value in port_range variable x. */ #define SETLOPORT(x,y) ((x) = ((x) & 0x0000ffff) | ((y) << 0x10)) /* Set y to be the number of ports in port_range variable x. */ #define SETNUMPORTS(x,y) ((x) = ((x) & 0xffff0000) | (y)) static void StrToAddr (const char* str, struct in_addr* addr) { struct hostent* hp; if (inet_aton (str, addr)) return; hp = gethostbyname (str); if (!hp) errx (1, "unknown host %s", str); memcpy (addr, hp->h_addr, sizeof (struct in_addr)); } static int StrToPortRange (const char* str, const char* proto, port_range *portRange) { char* sep; struct servent* sp; char* end; u_short loPort; u_short hiPort; /* First see if this is a service, return corresponding port if so. */ sp = getservbyname (str,proto); if (sp) { SETLOPORT(*portRange, ntohs(sp->s_port)); SETNUMPORTS(*portRange, 1); return 0; } /* Not a service, see if it's a single port or port range. */ sep = strchr (str, '-'); if (sep == NULL) { SETLOPORT(*portRange, strtol(str, &end, 10)); if (end != str) { /* Single port. */ SETNUMPORTS(*portRange, 1); return 0; } /* Error in port range field. */ errx (EX_DATAERR, "%s/%s: unknown service", str, proto); } /* Port range, get the values and sanity check. */ sscanf (str, "%hu-%hu", &loPort, &hiPort); SETLOPORT(*portRange, loPort); SETNUMPORTS(*portRange, 0); /* Error by default */ if (loPort <= hiPort) SETNUMPORTS(*portRange, hiPort - loPort + 1); if (GETNUMPORTS(*portRange) == 0) errx (EX_DATAERR, "invalid port range %s", str); return 0; } static int StrToProto (const char* str) { if (!strcmp (str, "tcp")) return IPPROTO_TCP; if (!strcmp (str, "udp")) return IPPROTO_UDP; if (!strcmp (str, "sctp")) return IPPROTO_SCTP; errx (EX_DATAERR, "unknown protocol %s. Expected sctp, tcp or udp", str); } static int StrToAddrAndPortRange (const char* str, struct in_addr* addr, char* proto, port_range *portRange) { char* ptr; ptr = strchr (str, ':'); if (!ptr) errx (EX_DATAERR, "%s is missing port number", str); *ptr = '\0'; ++ptr; StrToAddr (str, addr); return StrToPortRange (ptr, proto, portRange); } /* End of stuff taken from natd.c. 
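The port_range encoding defined above keeps the first port in the high 16 bits and the port count in the low 16 bits, in host byte order. The standalone program below repeats those macro definitions so the arithmetic can be checked in isolation; the driver is purely illustrative.

#include <stdio.h>

typedef unsigned long port_range;       /* same layout as the u_long above */

#define GETLOPORT(x)     ((x) >> 0x10)
#define GETNUMPORTS(x)   ((x) & 0x0000ffff)
#define GETHIPORT(x)     (GETLOPORT((x)) + GETNUMPORTS((x)))
#define SETLOPORT(x,y)   ((x) = ((x) & 0x0000ffff) | ((y) << 0x10))
#define SETNUMPORTS(x,y) ((x) = ((x) & 0xffff0000) | (y))

int
main(void)
{
    port_range pr = 0;

    /* Encode the range 8000-8099: first port 8000, 100 ports. */
    SETLOPORT(pr, 8000);
    SETNUMPORTS(pr, 100);
    printf("lo=%lu count=%lu hi=%lu\n",
        GETLOPORT(pr), GETNUMPORTS(pr), GETHIPORT(pr));
    /* Prints: lo=8000 count=100 hi=8100 (hi is one past the last port). */
    return (0);
}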
*/ /* * The next 3 functions add support for the addr, port and proto redirect and * their logic is loosely based on SetupAddressRedirect(), SetupPortRedirect() * and SetupProtoRedirect() from natd.c. * * Every setup_* function fills at least one redirect entry * (struct nat44_cfg_redir) and zero or more server pool entry * (struct nat44_cfg_spool) in buf. * * The format of data in buf is: * * nat44_cfg_nat nat44_cfg_redir nat44_cfg_spool ...... nat44_cfg_spool * * ------------------------------------- ------------ * | | .....X ..... | | | | ..... * ------------------------------------- ...... ------------ * ^ * spool_cnt n=0 ...... n=(X-1) * * len points to the amount of available space in buf * space counts the memory consumed by every function * * XXX - Every function get all the argv params so it * has to check, in optional parameters, that the next * args is a valid option for the redir entry and not * another token. Only redir_port and redir_proto are * affected by this. */ static int estimate_redir_addr(int *ac, char ***av) { size_t space = sizeof(struct nat44_cfg_redir); char *sep = **av; u_int c = 0; (void)ac; /* UNUSED */ while ((sep = strchr(sep, ',')) != NULL) { c++; sep++; } if (c > 0) c++; space += c * sizeof(struct nat44_cfg_spool); return (space); } static int setup_redir_addr(char *buf, int *ac, char ***av) { struct nat44_cfg_redir *r; char *sep; size_t space; r = (struct nat44_cfg_redir *)buf; r->mode = REDIR_ADDR; /* Skip nat44_cfg_redir at beginning of buf. */ buf = &buf[sizeof(struct nat44_cfg_redir)]; space = sizeof(struct nat44_cfg_redir); /* Extract local address. */ if (strchr(**av, ',') != NULL) { struct nat44_cfg_spool *spool; /* Setup LSNAT server pool. */ r->laddr.s_addr = INADDR_NONE; sep = strtok(**av, ","); while (sep != NULL) { spool = (struct nat44_cfg_spool *)buf; space += sizeof(struct nat44_cfg_spool); StrToAddr(sep, &spool->addr); spool->port = ~0; r->spool_cnt++; /* Point to the next possible nat44_cfg_spool. */ buf = &buf[sizeof(struct nat44_cfg_spool)]; sep = strtok(NULL, ","); } } else StrToAddr(**av, &r->laddr); (*av)++; (*ac)--; /* Extract public address. */ StrToAddr(**av, &r->paddr); (*av)++; (*ac)--; return (space); } static int estimate_redir_port(int *ac, char ***av) { size_t space = sizeof(struct nat44_cfg_redir); char *sep = **av; u_int c = 0; (void)ac; /* UNUSED */ while ((sep = strchr(sep, ',')) != NULL) { c++; sep++; } if (c > 0) c++; space += c * sizeof(struct nat44_cfg_spool); return (space); } static int setup_redir_port(char *buf, int *ac, char ***av) { struct nat44_cfg_redir *r; char *sep, *protoName, *lsnat = NULL; size_t space; u_short numLocalPorts; port_range portRange; numLocalPorts = 0; r = (struct nat44_cfg_redir *)buf; r->mode = REDIR_PORT; /* Skip nat44_cfg_redir at beginning of buf. */ buf = &buf[sizeof(struct nat44_cfg_redir)]; space = sizeof(struct nat44_cfg_redir); /* * Extract protocol. */ r->proto = StrToProto(**av); protoName = **av; (*av)++; (*ac)--; /* * Extract local address. */ if (strchr(**av, ',') != NULL) { r->laddr.s_addr = INADDR_NONE; r->lport = ~0; numLocalPorts = 1; lsnat = **av; } else { /* * The sctp nat does not allow the port numbers to be mapped to * new port numbers. Therefore, no ports are to be specified * in the target port field. 
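The sizing rule described above (one nat44_cfg_redir, plus one nat44_cfg_spool per server in an LSNAT list such as "a,b,c") can be exercised on its own. The sketch below mirrors estimate_redir_addr(); the struct sizes are placeholders, not the kernel's real sizes.

#include <stdio.h>
#include <string.h>
#include <stddef.h>

#define SIZEOF_REDIR    48      /* placeholder for sizeof(struct nat44_cfg_redir) */
#define SIZEOF_SPOOL    8       /* placeholder for sizeof(struct nat44_cfg_spool) */

static size_t
estimate_space(const char *laddr_arg)
{
    size_t space = SIZEOF_REDIR;
    const char *sep = laddr_arg;
    unsigned c = 0;

    while ((sep = strchr(sep, ',')) != NULL) {
        c++;
        sep++;
    }
    if (c > 0)
        c++;                    /* N commas => N+1 server pool entries */
    return (space + c * SIZEOF_SPOOL);
}

int
main(void)
{
    printf("%zu\n", estimate_space("10.0.0.1"));            /* 48: no pool      */
    printf("%zu\n", estimate_space("10.0.0.1,10.0.0.2"));   /* 48 + 2*8 = 64    */
    return (0);
}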
*/ if (r->proto == IPPROTO_SCTP) { if (strchr(**av, ':')) errx(EX_DATAERR, "redirect_port:" "port numbers do not change in sctp, so do " "not specify them as part of the target"); else StrToAddr(**av, &r->laddr); } else { if (StrToAddrAndPortRange(**av, &r->laddr, protoName, &portRange) != 0) errx(EX_DATAERR, "redirect_port: " "invalid local port range"); r->lport = GETLOPORT(portRange); numLocalPorts = GETNUMPORTS(portRange); } } (*av)++; (*ac)--; /* * Extract public port and optionally address. */ if (strchr(**av, ':') != NULL) { if (StrToAddrAndPortRange(**av, &r->paddr, protoName, &portRange) != 0) errx(EX_DATAERR, "redirect_port: " "invalid public port range"); } else { r->paddr.s_addr = INADDR_ANY; if (StrToPortRange(**av, protoName, &portRange) != 0) errx(EX_DATAERR, "redirect_port: " "invalid public port range"); } r->pport = GETLOPORT(portRange); if (r->proto == IPPROTO_SCTP) { /* so the logic below still works */ numLocalPorts = GETNUMPORTS(portRange); r->lport = r->pport; } r->pport_cnt = GETNUMPORTS(portRange); (*av)++; (*ac)--; /* * Extract remote address and optionally port. */ /* * NB: isdigit(**av) => we've to check that next parameter is really an * option for this redirect entry, else stop here processing arg[cv]. */ if (*ac != 0 && isdigit(***av)) { if (strchr(**av, ':') != NULL) { if (StrToAddrAndPortRange(**av, &r->raddr, protoName, &portRange) != 0) errx(EX_DATAERR, "redirect_port: " "invalid remote port range"); } else { SETLOPORT(portRange, 0); SETNUMPORTS(portRange, 1); StrToAddr(**av, &r->raddr); } (*av)++; (*ac)--; } else { SETLOPORT(portRange, 0); SETNUMPORTS(portRange, 1); r->raddr.s_addr = INADDR_ANY; } r->rport = GETLOPORT(portRange); r->rport_cnt = GETNUMPORTS(portRange); /* * Make sure port ranges match up, then add the redirect ports. */ if (numLocalPorts != r->pport_cnt) errx(EX_DATAERR, "redirect_port: " "port ranges must be equal in size"); /* Remote port range is allowed to be '0' which means all ports. */ if (r->rport_cnt != numLocalPorts && (r->rport_cnt != 1 || r->rport != 0)) errx(EX_DATAERR, "redirect_port: remote port must" "be 0 or equal to local port range in size"); /* Setup LSNAT server pool. */ if (lsnat != NULL) { struct nat44_cfg_spool *spool; sep = strtok(lsnat, ","); while (sep != NULL) { spool = (struct nat44_cfg_spool *)buf; space += sizeof(struct nat44_cfg_spool); /* * The sctp nat does not allow the port numbers to * be mapped to new port numbers. Therefore, no ports * are to be specified in the target port field. */ if (r->proto == IPPROTO_SCTP) { if (strchr (sep, ':')) { errx(EX_DATAERR, "redirect_port:" "port numbers do not change in " "sctp, so do not specify them as " "part of the target"); } else { StrToAddr(sep, &spool->addr); spool->port = r->pport; } } else { if (StrToAddrAndPortRange(sep, &spool->addr, protoName, &portRange) != 0) errx(EX_DATAERR, "redirect_port:" "invalid local port range"); if (GETNUMPORTS(portRange) != 1) errx(EX_DATAERR, "redirect_port: " "local port must be single in " "this context"); spool->port = GETLOPORT(portRange); } r->spool_cnt++; /* Point to the next possible nat44_cfg_spool. */ buf = &buf[sizeof(struct nat44_cfg_spool)]; sep = strtok(NULL, ","); } } return (space); } static int setup_redir_proto(char *buf, int *ac, char ***av) { struct nat44_cfg_redir *r; struct protoent *protoent; size_t space; r = (struct nat44_cfg_redir *)buf; r->mode = REDIR_PROTO; /* Skip nat44_cfg_redir at beginning of buf. 
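The range-size constraints that setup_redir_port() enforces further down (local and public ranges equal in size, remote range either matching or the single wildcard port 0) reduce to a small predicate; a sketch, with made-up counts:

#include <stdio.h>

static int
ranges_ok(unsigned lcount, unsigned pcount, unsigned rcount, unsigned rport)
{
    if (lcount != pcount)
        return (0);     /* "port ranges must be equal in size" */
    if (rcount != lcount && !(rcount == 1 && rport == 0))
        return (0);     /* remote must be 0 or match the local range */
    return (1);
}

int
main(void)
{
    printf("%d\n", ranges_ok(100, 100, 1, 0));      /* 1: remote wildcard   */
    printf("%d\n", ranges_ok(100, 100, 100, 5000)); /* 1: matching remote   */
    printf("%d\n", ranges_ok(100, 50, 1, 0));       /* 0: local != public   */
    return (0);
}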
*/ buf = &buf[sizeof(struct nat44_cfg_redir)]; space = sizeof(struct nat44_cfg_redir); /* * Extract protocol. */ protoent = getprotobyname(**av); if (protoent == NULL) errx(EX_DATAERR, "redirect_proto: unknown protocol %s", **av); else r->proto = protoent->p_proto; (*av)++; (*ac)--; /* * Extract local address. */ StrToAddr(**av, &r->laddr); (*av)++; (*ac)--; /* * Extract optional public address. */ if (*ac == 0) { r->paddr.s_addr = INADDR_ANY; r->raddr.s_addr = INADDR_ANY; } else { /* see above in setup_redir_port() */ if (isdigit(***av)) { StrToAddr(**av, &r->paddr); (*av)++; (*ac)--; /* * Extract optional remote address. */ /* see above in setup_redir_port() */ if (*ac != 0 && isdigit(***av)) { StrToAddr(**av, &r->raddr); (*av)++; (*ac)--; } } } return (space); } static void nat_show_log(struct nat44_cfg_nat *n, void *arg) { char *buf; buf = (char *)(n + 1); if (buf[0] != '\0') printf("nat %s: %s\n", n->name, buf); } static void nat_show_cfg(struct nat44_cfg_nat *n, void *arg) { int i, cnt, flag, off; struct nat44_cfg_redir *t; struct nat44_cfg_spool *s; caddr_t buf; struct protoent *p; buf = (caddr_t)n; flag = 1; off = sizeof(*n); printf("ipfw nat %s config", n->name); if (strlen(n->if_name) != 0) printf(" if %s", n->if_name); else if (n->ip.s_addr != 0) printf(" ip %s", inet_ntoa(n->ip)); while (n->mode != 0) { if (n->mode & PKT_ALIAS_LOG) { printf(" log"); n->mode &= ~PKT_ALIAS_LOG; } else if (n->mode & PKT_ALIAS_DENY_INCOMING) { printf(" deny_in"); n->mode &= ~PKT_ALIAS_DENY_INCOMING; } else if (n->mode & PKT_ALIAS_SAME_PORTS) { printf(" same_ports"); n->mode &= ~PKT_ALIAS_SAME_PORTS; } else if (n->mode & PKT_ALIAS_SKIP_GLOBAL) { printf(" skip_global"); n->mode &= ~PKT_ALIAS_SKIP_GLOBAL; } else if (n->mode & PKT_ALIAS_UNREGISTERED_ONLY) { printf(" unreg_only"); n->mode &= ~PKT_ALIAS_UNREGISTERED_ONLY; } else if (n->mode & PKT_ALIAS_RESET_ON_ADDR_CHANGE) { printf(" reset"); n->mode &= ~PKT_ALIAS_RESET_ON_ADDR_CHANGE; } else if (n->mode & PKT_ALIAS_REVERSE) { printf(" reverse"); n->mode &= ~PKT_ALIAS_REVERSE; } else if (n->mode & PKT_ALIAS_PROXY_ONLY) { printf(" proxy_only"); n->mode &= ~PKT_ALIAS_PROXY_ONLY; } } /* Print all the redirect's data configuration. 
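nat_show_cfg() below prints one keyword per mode bit by testing and clearing flags one at a time. A compact table-driven illustration of the same idea, using placeholder flag values rather than libalias's real PKT_ALIAS_* constants:

#include <stdio.h>

#define FLAG_LOG        0x01    /* placeholder values */
#define FLAG_DENY_IN    0x02
#define FLAG_SAME_PORTS 0x04

static const struct {
    unsigned    flag;
    const char  *name;
} flagnames[] = {
    { FLAG_LOG,        "log" },
    { FLAG_DENY_IN,    "deny_in" },
    { FLAG_SAME_PORTS, "same_ports" },
};

int
main(void)
{
    unsigned mode = FLAG_LOG | FLAG_SAME_PORTS;
    size_t i;

    printf("ipfw nat 123 config");
    for (i = 0; i < sizeof(flagnames) / sizeof(flagnames[0]); i++)
        if (mode & flagnames[i].flag)
            printf(" %s", flagnames[i].name);
    printf("\n");       /* -> "ipfw nat 123 config log same_ports" */
    return (0);
}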
*/ for (cnt = 0; cnt < n->redir_cnt; cnt++) { t = (struct nat44_cfg_redir *)&buf[off]; off += sizeof(struct nat44_cfg_redir); switch (t->mode) { case REDIR_ADDR: printf(" redirect_addr"); if (t->spool_cnt == 0) printf(" %s", inet_ntoa(t->laddr)); else for (i = 0; i < t->spool_cnt; i++) { s = (struct nat44_cfg_spool *)&buf[off]; if (i) printf(","); else printf(" "); printf("%s", inet_ntoa(s->addr)); off += sizeof(struct nat44_cfg_spool); } printf(" %s", inet_ntoa(t->paddr)); break; case REDIR_PORT: p = getprotobynumber(t->proto); printf(" redirect_port %s ", p->p_name); if (!t->spool_cnt) { printf("%s:%u", inet_ntoa(t->laddr), t->lport); if (t->pport_cnt > 1) printf("-%u", t->lport + t->pport_cnt - 1); } else for (i=0; i < t->spool_cnt; i++) { s = (struct nat44_cfg_spool *)&buf[off]; if (i) printf(","); printf("%s:%u", inet_ntoa(s->addr), s->port); off += sizeof(struct nat44_cfg_spool); } printf(" "); if (t->paddr.s_addr) printf("%s:", inet_ntoa(t->paddr)); printf("%u", t->pport); if (!t->spool_cnt && t->pport_cnt > 1) printf("-%u", t->pport + t->pport_cnt - 1); if (t->raddr.s_addr) { printf(" %s", inet_ntoa(t->raddr)); if (t->rport) { printf(":%u", t->rport); if (!t->spool_cnt && t->rport_cnt > 1) printf("-%u", t->rport + t->rport_cnt - 1); } } break; case REDIR_PROTO: p = getprotobynumber(t->proto); printf(" redirect_proto %s %s", p->p_name, inet_ntoa(t->laddr)); if (t->paddr.s_addr != 0) { printf(" %s", inet_ntoa(t->paddr)); if (t->raddr.s_addr) printf(" %s", inet_ntoa(t->raddr)); } break; default: errx(EX_DATAERR, "unknown redir mode"); break; } } printf("\n"); } void ipfw_config_nat(int ac, char **av) { ipfw_obj_header *oh; struct nat44_cfg_nat *n; /* Nat instance configuration. */ int i, off, tok, ac1; char *id, *buf, **av1, *end; size_t len; av++; ac--; /* Nat id. */ if (ac == 0) errx(EX_DATAERR, "missing nat id"); id = *av; i = (int)strtol(id, &end, 0); if (i <= 0 || *end != '\0') errx(EX_DATAERR, "illegal nat id: %s", id); av++; ac--; if (ac == 0) errx(EX_DATAERR, "missing option"); len = sizeof(*oh) + sizeof(*n); ac1 = ac; av1 = av; while (ac1 > 0) { tok = match_token(nat_params, *av1); ac1--; av1++; switch (tok) { case TOK_IP: case TOK_IF: ac1--; av1++; break; case TOK_ALOG: case TOK_DENY_INC: case TOK_SAME_PORTS: case TOK_SKIP_GLOBAL: case TOK_UNREG_ONLY: case TOK_RESET_ADDR: case TOK_ALIAS_REV: case TOK_PROXY_ONLY: break; case TOK_REDIR_ADDR: if (ac1 < 2) errx(EX_DATAERR, "redirect_addr: " "not enough arguments"); len += estimate_redir_addr(&ac1, &av1); av1 += 2; ac1 -= 2; break; case TOK_REDIR_PORT: if (ac1 < 3) errx(EX_DATAERR, "redirect_port: " "not enough arguments"); av1++; ac1--; len += estimate_redir_port(&ac1, &av1); av1 += 2; ac1 -= 2; /* Skip optional remoteIP/port */ if (ac1 != 0 && isdigit(**av1)) { av1++; ac1--; } break; case TOK_REDIR_PROTO: if (ac1 < 2) errx(EX_DATAERR, "redirect_proto: " "not enough arguments"); len += sizeof(struct nat44_cfg_redir); av1 += 2; ac1 -= 2; /* Skip optional remoteIP/port */ if (ac1 != 0 && isdigit(**av1)) { av1++; ac1--; } if (ac1 != 0 && isdigit(**av1)) { av1++; ac1--; } break; default: errx(EX_DATAERR, "unrecognised option ``%s''", av1[-1]); } } if ((buf = malloc(len)) == NULL) errx(EX_OSERR, "malloc failed"); /* Offset in buf: save space for header at the beginning. 
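ipfw_config_nat() above uses a two-pass layout: the first argument walk only estimates the total buffer length, then each setup_* function writes its records at &buf[off] and returns the space it consumed so the offset can advance. A minimal sketch of that pattern with a made-up record type:

#include <stdio.h>
#include <stdlib.h>

struct rec {                    /* stand-in for nat44_cfg_redir */
    int value;
};

static size_t
setup_one(char *buf, int value) /* analogous to setup_redir_addr() etc. */
{
    struct rec *r = (struct rec *)buf;

    r->value = value;
    return (sizeof(*r));        /* caller adds this to its offset */
}

int
main(void)
{
    const int input[] = { 3, 1, 4 };
    const size_t n = sizeof(input) / sizeof(input[0]);
    size_t len, off, i;
    char *buf;

    len = n * sizeof(struct rec);           /* pass 1: size only */
    if ((buf = calloc(1, len)) == NULL)
        return (1);
    for (off = 0, i = 0; i < n; i++)        /* pass 2: fill      */
        off += setup_one(&buf[off], input[i]);
    printf("used %zu of %zu bytes\n", off, len);
    free(buf);
    return (0);
}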
*/ off = sizeof(*oh) + sizeof(*n); memset(buf, 0, len); oh = (ipfw_obj_header *)buf; n = (struct nat44_cfg_nat *)(oh + 1); oh->ntlv.head.length = sizeof(oh->ntlv); snprintf(oh->ntlv.name, sizeof(oh->ntlv.name), "%d", i); snprintf(n->name, sizeof(n->name), "%d", i); while (ac > 0) { tok = match_token(nat_params, *av); ac--; av++; switch (tok) { case TOK_IP: if (ac == 0) errx(EX_DATAERR, "missing option"); if (!inet_aton(av[0], &(n->ip))) errx(EX_DATAERR, "bad ip address ``%s''", av[0]); ac--; av++; break; case TOK_IF: if (ac == 0) errx(EX_DATAERR, "missing option"); set_addr_dynamic(av[0], n); ac--; av++; break; case TOK_ALOG: n->mode |= PKT_ALIAS_LOG; break; case TOK_DENY_INC: n->mode |= PKT_ALIAS_DENY_INCOMING; break; case TOK_SAME_PORTS: n->mode |= PKT_ALIAS_SAME_PORTS; break; case TOK_UNREG_ONLY: n->mode |= PKT_ALIAS_UNREGISTERED_ONLY; break; case TOK_SKIP_GLOBAL: n->mode |= PKT_ALIAS_SKIP_GLOBAL; break; case TOK_RESET_ADDR: n->mode |= PKT_ALIAS_RESET_ON_ADDR_CHANGE; break; case TOK_ALIAS_REV: n->mode |= PKT_ALIAS_REVERSE; break; case TOK_PROXY_ONLY: n->mode |= PKT_ALIAS_PROXY_ONLY; break; /* * All the setup_redir_* functions work directly in * the final buffer, see above for details. */ case TOK_REDIR_ADDR: case TOK_REDIR_PORT: case TOK_REDIR_PROTO: switch (tok) { case TOK_REDIR_ADDR: i = setup_redir_addr(&buf[off], &ac, &av); break; case TOK_REDIR_PORT: i = setup_redir_port(&buf[off], &ac, &av); break; case TOK_REDIR_PROTO: i = setup_redir_proto(&buf[off], &ac, &av); break; } n->redir_cnt++; off += i; break; } } i = do_set3(IP_FW_NAT44_XCONFIG, &oh->opheader, len); if (i != 0) err(1, "setsockopt(%s)", "IP_FW_NAT44_XCONFIG"); if (!co.do_quiet) { /* After every modification, we show the resultant rule. */ int _ac = 3; const char *_av[] = {"show", "config", id}; ipfw_show_nat(_ac, (char **)(void *)_av); } } struct nat_list_arg { uint16_t cmd; int is_all; }; static int nat_show_data(struct nat44_cfg_nat *cfg, void *arg) { struct nat_list_arg *nla; ipfw_obj_header *oh; nla = (struct nat_list_arg *)arg; switch (nla->cmd) { case IP_FW_NAT44_XGETCONFIG: if (nat_get_cmd(cfg->name, nla->cmd, &oh) != 0) { warnx("Error getting nat instance %s info", cfg->name); break; } nat_show_cfg((struct nat44_cfg_nat *)(oh + 1), NULL); free(oh); break; case IP_FW_NAT44_XGETLOG: if (nat_get_cmd(cfg->name, nla->cmd, &oh) == 0) { nat_show_log((struct nat44_cfg_nat *)(oh + 1), NULL); free(oh); break; } /* Handle error */ if (nla->is_all != 0 && errno == ENOENT) break; warn("Error getting nat instance %s info", cfg->name); break; } return (0); } /* * Compare nat names. * Honor number comparison. */ static int natname_cmp(const void *a, const void *b) { struct nat44_cfg_nat *ia, *ib; ia = (struct nat44_cfg_nat *)a; ib = (struct nat44_cfg_nat *)b; return (stringnum_cmp(ia->name, ib->name)); } /* * Retrieves nat list from kernel, * optionally sorts it and calls requested function for each table. * Returns 0 on success. 
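natname_cmp() below delegates to stringnum_cmp(), which lives elsewhere in ipfw2 and honors numeric ordering of names. An approximate standalone version of that comparison, for illustration only:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int
numname_cmp(const char *a, const char *b)
{
    char *ea, *eb;
    long na = strtol(a, &ea, 10);
    long nb = strtol(b, &eb, 10);

    /* If both names are fully numeric, compare their values. */
    if (*a != '\0' && *ea == '\0' && *b != '\0' && *eb == '\0')
        return ((na > nb) - (na < nb));
    return (strcmp(a, b));
}

int
main(void)
{
    /* Plain strcmp() would order "10" before "2"; numeric order does not. */
    printf("%d\n", numname_cmp("2", "10") < 0);     /* 1 */
    printf("%d\n", numname_cmp("abc", "abd") < 0);  /* 1 */
    return (0);
}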
*/ static int nat_foreach(nat_cb_t *f, void *arg, int sort) { ipfw_obj_lheader *olh; struct nat44_cfg_nat *cfg; size_t sz; int i, error; /* Start with reasonable default */ sz = sizeof(*olh) + 16 * sizeof(struct nat44_cfg_nat); for (;;) { if ((olh = calloc(1, sz)) == NULL) return (ENOMEM); olh->size = sz; if (do_get3(IP_FW_NAT44_LIST_NAT, &olh->opheader, &sz) != 0) { free(olh); if (errno == ENOMEM) { sz = olh->size; continue; } return (errno); } if (sort != 0) qsort(olh + 1, olh->count, olh->objsize, natname_cmp); cfg = (struct nat44_cfg_nat*)(olh + 1); for (i = 0; i < olh->count; i++) { error = f(cfg, arg); /* Ignore errors for now */ cfg = (struct nat44_cfg_nat *)((caddr_t)cfg + olh->objsize); } free(olh); break; } return (0); } static int nat_get_cmd(char *name, uint16_t cmd, ipfw_obj_header **ooh) { ipfw_obj_header *oh; struct nat44_cfg_nat *cfg; size_t sz; /* Start with reasonable default */ sz = sizeof(*oh) + sizeof(*cfg) + 128; for (;;) { if ((oh = calloc(1, sz)) == NULL) return (ENOMEM); cfg = (struct nat44_cfg_nat *)(oh + 1); oh->ntlv.head.length = sizeof(oh->ntlv); strlcpy(oh->ntlv.name, name, sizeof(oh->ntlv.name)); strlcpy(cfg->name, name, sizeof(cfg->name)); if (do_get3(cmd, &oh->opheader, &sz) != 0) { sz = cfg->size; free(oh); if (errno == ENOMEM) continue; return (errno); } *ooh = oh; break; } return (0); } void ipfw_show_nat(int ac, char **av) { ipfw_obj_header *oh; char *name; int cmd; struct nat_list_arg nla; ac--; av++; if (co.test_only) return; /* Parse parameters. */ cmd = 0; /* XXX: Change to IP_FW_NAT44_XGETLOG @ MFC */ name = NULL; for ( ; ac != 0; ac--, av++) { if (!strncmp(av[0], "config", strlen(av[0]))) { cmd = IP_FW_NAT44_XGETCONFIG; continue; } if (strcmp(av[0], "log") == 0) { cmd = IP_FW_NAT44_XGETLOG; continue; } if (name != NULL) err(EX_USAGE,"only one instance name may be specified"); name = av[0]; } if (cmd == 0) errx(EX_USAGE, "Please specify action. Available: config,log"); if (name == NULL) { memset(&nla, 0, sizeof(nla)); nla.cmd = cmd; nla.is_all = 1; nat_foreach(nat_show_data, &nla, 1); } else { if (nat_get_cmd(name, cmd, &oh) != 0) err(EX_OSERR, "Error getting nat %s instance info", name); nat_show_cfg((struct nat44_cfg_nat *)(oh + 1), NULL); free(oh); } } Index: projects/building-blocks/sbin/ipfw =================================================================== --- projects/building-blocks/sbin/ipfw (revision 277723) +++ projects/building-blocks/sbin/ipfw (revision 277724) Property changes on: projects/building-blocks/sbin/ipfw ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sbin/ipfw:r276595-277723 Index: projects/building-blocks/sbin =================================================================== --- projects/building-blocks/sbin (revision 277723) +++ projects/building-blocks/sbin (revision 277724) Property changes on: projects/building-blocks/sbin ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sbin:r277689-277723 Index: projects/building-blocks/sys/amd64/amd64/machdep.c =================================================================== --- projects/building-blocks/sys/amd64/amd64/machdep.c (revision 277723) +++ projects/building-blocks/sys/amd64/amd64/machdep.c (revision 277724) @@ -1,2808 +1,2824 @@ /*- * Copyright (c) 2003 Peter Wemm. * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. 
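nat_foreach() and nat_get_cmd() above both use the same retry pattern: allocate a guessed buffer, issue the request, and if the kernel reports ENOMEM, retry with the size it wrote back. A self-contained sketch of that loop; query() is a hypothetical stand-in for do_get3().

#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

static int
query(void *buf, size_t *sz)    /* pretends 512 bytes are needed */
{
    const size_t need = 512;

    if (*sz < need) {
        *sz = need;             /* report the required size back */
        errno = ENOMEM;
        return (-1);
    }
    memset(buf, 0, need);
    return (0);
}

int
main(void)
{
    size_t sz = 128;            /* reasonable first guess */
    void *buf;

    for (;;) {
        if ((buf = calloc(1, sz)) == NULL)
            return (1);
        if (query(buf, &sz) != 0) {
            free(buf);
            if (errno == ENOMEM)
                continue;       /* sz now holds the required size */
            return (1);
        }
        break;
    }
    printf("succeeded with %zu bytes\n", sz);
    free(buf);
    return (0);
}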
* * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_atpic.h" #include "opt_compat.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_isa.h" #include "opt_kstack_pages.h" #include "opt_maxmem.h" #include "opt_mp_watchdog.h" #include "opt_perfmon.h" #include "opt_platform.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #ifndef KDB #error KDB must be enabled in order for DDB to work! 
#endif #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef PERFMON #include #endif #include #ifdef SMP #include #endif #ifdef FDT #include #endif #ifdef DEV_ATPIC #include #else #include #endif #include #include #include /* Sanity check for __curthread() */ CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); extern u_int64_t hammer_time(u_int64_t, u_int64_t); #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) static void cpu_startup(void *); static void get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, size_t xfpusave_len); static int set_fpcontext(struct thread *td, const mcontext_t *mcp, char *xfpustate, size_t xfpustate_len); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); /* Preload data parse function */ static caddr_t native_parse_preload_data(u_int64_t); /* Native function to fetch and parse the e820 map */ static void native_parse_memmap(caddr_t, vm_paddr_t *, int *); /* Default init_ops implementation. */ struct init_ops init_ops = { .parse_preload_data = native_parse_preload_data, .early_clock_source_init = i8254_init, .early_delay = i8254_delay, .parse_memmap = native_parse_memmap, #ifdef SMP .mp_bootaddress = mp_bootaddress, .start_all_aps = native_start_all_aps, #endif .msi_init = msi_init, }; /* * The file "conf/ldscript.amd64" defines the symbol "kernphys". Its value is * the physical address at which the kernel is loaded. */ extern char kernphys[]; struct msgbuf *msgbufp; /* Intel ICH registers */ #define ICH_PMBASE 0x400 #define ICH_SMI_EN ICH_PMBASE + 0x30 int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel; int cold = 1; long Maxmem = 0; long realmem = 0; /* * The number of PHYSMAP entries must be one less than the number of * PHYSSEG entries because the PHYSMAP entry that spans the largest * physical address that is accessible by ISA DMA is split into two * PHYSSEG entries. */ #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; vm_paddr_t dump_avail[PHYSMAP_SIZE + 2]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2) #define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2) struct kva_md_info kmi; static struct trapframe proc0_tf; struct region_descriptor r_gdt, r_idt; struct pcpu __pcpu[MAXCPU]; struct mtx icu_lock; struct mem_range_softc mem_range_softc; struct mtx dt_lock; /* lock for GDT and LDT */ void (*vmm_resume_p)(void); static void cpu_startup(dummy) void *dummy; { uintmax_t memsize; char *sysenv; /* * On MacBooks, we need to disallow the legacy USB circuit to * generate an SMI# because this can cause several problems, * namely: incorrect CPU frequency detection and failure to * start the APs. * We do this by disabling a bit in the SMI_EN (SMI Control and * Enable register) of the Intel ICH LPC Interface Bridge. 
*/ sysenv = kern_getenv("smbios.system.product"); if (sysenv != NULL) { if (strncmp(sysenv, "MacBook1,1", 10) == 0 || strncmp(sysenv, "MacBook3,1", 10) == 0 || strncmp(sysenv, "MacBook4,1", 10) == 0 || strncmp(sysenv, "MacBookPro1,1", 13) == 0 || strncmp(sysenv, "MacBookPro1,2", 13) == 0 || strncmp(sysenv, "MacBookPro3,1", 13) == 0 || strncmp(sysenv, "MacBookPro4,1", 13) == 0 || strncmp(sysenv, "Macmini1,1", 10) == 0) { if (bootverbose) printf("Disabling LEGACY_USB_EN bit on " "Intel ICH.\n"); outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); } freeenv(sysenv); } /* * Good {morning,afternoon,evening,night}. */ startrtclock(); printcpuinfo(); panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif /* * Display physical memory if SMBIOS reports reasonable amount. */ memsize = 0; sysenv = kern_getenv("smbios.memory.enabled"); if (sysenv != NULL) { memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; freeenv(sysenv); } if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count)) memsize = ptoa((uintmax_t)Maxmem); printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); realmem = atop(memsize); /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { vm_paddr_t size; size = phys_avail[indx + 1] - phys_avail[indx]; printf( "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", (uintmax_t)phys_avail[indx], (uintmax_t)phys_avail[indx + 1] - 1, (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); } } vm_ksubmap_init(&kmi); printf("avail memory = %ju (%ju MB)\n", ptoa((uintmax_t)vm_cnt.v_free_count), ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); cpu_setregs(); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by call * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe sf, *sfp; struct pcb *pcb; struct proc *p; struct thread *td; struct sigacts *psp; char *sp; struct trapframe *regs; char *xfpusave; size_t xfpusave_len; int sig; int oonstack; td = curthread; pcb = td->td_pcb; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) { xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu); xfpusave = __builtin_alloca(xfpusave_len); } else { xfpusave_len = 0; xfpusave = NULL; } /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs)); sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len); fpstate_drop(td); sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase; sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase; bzero(sf.sf_uc.uc_mcontext.mc_spare, sizeof(sf.sf_uc.uc_mcontext.mc_spare)); bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); /* Allocate space for the signal handler context. 
*/ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sp = td->td_sigstk.ss_sp + td->td_sigstk.ss_size; #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sp = (char *)regs->tf_rsp - 128; if (xfpusave != NULL) { sp -= xfpusave_len; sp = (char *)((unsigned long)sp & ~0x3Ful); sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp; } sp -= sizeof(struct sigframe); /* Align to 16 bytes. */ sfp = (struct sigframe *)((unsigned long)sp & ~0xFul); /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ regs->tf_rdi = sig; /* arg 1 in %rdi */ regs->tf_rdx = (register_t)&sfp->sf_uc; /* arg 3 in %rdx */ bzero(&sf.sf_si, sizeof(sf.sf_si)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */ sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill in POSIX parts */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; /* maybe a translated signal */ regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */ } else { /* Old FreeBSD-style arguments. */ regs->tf_rsi = ksi->ksi_code; /* arg 2 in %rsi */ regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */ sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0 || (xfpusave != NULL && copyout(xfpusave, (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len) != 0)) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_rsp = (long)sfp; regs->tf_rip = p->p_sysent->sv_sigcode_base; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; set_pcb_flags(pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. * * MPSAFE */ int sys_sigreturn(td, uap) struct thread *td; struct sigreturn_args /* { const struct __ucontext *sigcntxp; } */ *uap; { ucontext_t uc; struct pcb *pcb; struct proc *p; struct trapframe *regs; ucontext_t *ucp; char *xfpustate; size_t xfpustate_len; long rflags; int cs, error, ret; ksiginfo_t ksi; pcb = td->td_pcb; p = td->td_proc; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) { uprintf("pid %d (%s): sigreturn copyin failed\n", p->p_pid, td->td_name); return (error); } ucp = &uc; if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) { uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid, td->td_name, ucp->uc_mcontext.mc_flags); return (EINVAL); } regs = td->td_frame; rflags = ucp->uc_mcontext.mc_rflags; /* * Don't allow users to change privileged or reserved flags. */ if (!EFL_SECURE(rflags, regs->tf_rflags)) { uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid, td->td_name, rflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. 
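sendsig() above aligns the signal stack by clearing low address bits: ~0x3F rounds the XSAVE area down to a 64-byte boundary and ~0xF rounds the sigframe down to 16 bytes. The mask arithmetic, with an arbitrary example address:

#include <stdio.h>

int
main(void)
{
    unsigned long sp = 0x7fffffffe123UL;    /* example stack pointer */

    printf("%#lx\n", sp & ~0x3FUL); /* 0x7fffffffe100: 64-byte aligned */
    printf("%#lx\n", sp & ~0xFUL);  /* 0x7fffffffe120: 16-byte aligned */
    return (0);
}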
Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid, td->td_name, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return (EINVAL); } if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) { xfpustate_len = uc.uc_mcontext.mc_xfpustate_len; if (xfpustate_len > cpu_max_ext_state_size - sizeof(struct savefpu)) { uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n", p->p_pid, td->td_name, xfpustate_len); return (EINVAL); } xfpustate = __builtin_alloca(xfpustate_len); error = copyin((const void *)uc.uc_mcontext.mc_xfpustate, xfpustate, xfpustate_len); if (error != 0) { uprintf( "pid %d (%s): sigreturn copying xfpustate failed\n", p->p_pid, td->td_name); return (error); } } else { xfpustate = NULL; xfpustate_len = 0; } ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len); if (ret != 0) { uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n", p->p_pid, td->td_name, ret); return (ret); } bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs)); pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase; pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase; #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); set_pcb_flags(pcb, PCB_FULL_IRET); return (EJUSTRETURN); } #ifdef COMPAT_FREEBSD4 int freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap) { return sys_sigreturn(td, (struct sigreturn_args *)uap); } #endif /* * Machine dependent boot() routine * * I haven't seen anything to put here yet * Possibly some stuff might be grafted back here from boot() */ void cpu_boot(int howto) { } /* * Flush the D-cache for non-DMA I/O so that the I-cache can * be made coherent later. */ void cpu_flush_dcache(void *ptr, size_t len) { /* Not applicable */ } /* Get current clock frequency for the given cpu id. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { uint64_t tsc1, tsc2; uint64_t acnt, mcnt, perf; register_t reg; if (pcpu_find(cpu_id) == NULL || rate == NULL) return (EINVAL); /* * If TSC is P-state invariant and APERF/MPERF MSRs do not exist, * DELAY(9) based logic fails. */ if (tsc_is_invariant && !tsc_perf_stat) return (EOPNOTSUPP); #ifdef SMP if (smp_cpus > 1) { /* Schedule ourselves on the indicated cpu. */ thread_lock(curthread); sched_bind(curthread, cpu_id); thread_unlock(curthread); } #endif /* Calibrate by measuring a short delay. */ reg = intr_disable(); if (tsc_is_invariant) { wrmsr(MSR_MPERF, 0); wrmsr(MSR_APERF, 0); tsc1 = rdtsc(); DELAY(1000); mcnt = rdmsr(MSR_MPERF); acnt = rdmsr(MSR_APERF); tsc2 = rdtsc(); intr_restore(reg); perf = 1000 * acnt / mcnt; *rate = (tsc2 - tsc1) * perf; } else { tsc1 = rdtsc(); DELAY(1000); tsc2 = rdtsc(); intr_restore(reg); *rate = (tsc2 - tsc1) * 1000; } #ifdef SMP if (smp_cpus > 1) { thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); } #endif return (0); } /* * Shutdown the CPU as much as possible */ void cpu_halt(void) { for (;;) halt(); } void (*cpu_idle_hook)(sbintime_t) = NULL; /* ACPI idle hook. */ static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */ static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. 
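cpu_est_clockrate() above measures the TSC delta across a 1 ms DELAY() and multiplies by 1000 to get cycles per second; on invariant-TSC CPUs it folds in the APERF/MPERF ratio so the estimate reflects the actual (possibly throttled) core frequency. The arithmetic, with fabricated counter values:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
    uint64_t tsc1 = 100000, tsc2 = 2700000; /* 2,600,000 TSC ticks in 1 ms: 2.6 GHz TSC */
    uint64_t acnt = 1200, mcnt = 2400;      /* core ran at half the TSC rate */
    uint64_t perf, rate;

    perf = 1000 * acnt / mcnt;              /* 500: the 1000x scale and the APERF/MPERF ratio */
    rate = (tsc2 - tsc1) * perf;            /* 1,300,000,000 Hz = 1.3 GHz */
    printf("estimated rate: %ju Hz\n", (uintmax_t)rate);
    return (0);
}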
*/ SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait, 0, "Use MONITOR/MWAIT for short idle"); #define STATE_RUNNING 0x0 #define STATE_MWAIT 0x1 #define STATE_SLEEPING 0x2 static void cpu_idle_acpi(sbintime_t sbt) { int *state; state = (int *)PCPU_PTR(monitorbuf); *state = STATE_SLEEPING; /* See comments in cpu_idle_hlt(). */ disable_intr(); if (sched_runnable()) enable_intr(); else if (cpu_idle_hook) cpu_idle_hook(sbt); else __asm __volatile("sti; hlt"); *state = STATE_RUNNING; } static void cpu_idle_hlt(sbintime_t sbt) { int *state; state = (int *)PCPU_PTR(monitorbuf); *state = STATE_SLEEPING; /* * Since we may be in a critical section from cpu_idle(), if * an interrupt fires during that critical section we may have * a pending preemption. If the CPU halts, then that thread * may not execute until a later interrupt awakens the CPU. * To handle this race, check for a runnable thread after * disabling interrupts and immediately return if one is * found. Also, we must absolutely guarentee that hlt is * the next instruction after sti. This ensures that any * interrupt that fires after the call to disable_intr() will * immediately awaken the CPU from hlt. Finally, please note * that on x86 this works fine because of interrupts enabled only * after the instruction following sti takes place, while IF is set * to 1 immediately, allowing hlt instruction to acknowledge the * interrupt. */ disable_intr(); if (sched_runnable()) enable_intr(); else __asm __volatile("sti; hlt"); *state = STATE_RUNNING; } /* * MWAIT cpu power states. Lower 4 bits are sub-states. */ #define MWAIT_C0 0xf0 #define MWAIT_C1 0x00 #define MWAIT_C2 0x10 #define MWAIT_C3 0x20 #define MWAIT_C4 0x30 static void cpu_idle_mwait(sbintime_t sbt) { int *state; state = (int *)PCPU_PTR(monitorbuf); *state = STATE_MWAIT; /* See comments in cpu_idle_hlt(). */ disable_intr(); if (sched_runnable()) { enable_intr(); *state = STATE_RUNNING; return; } cpu_monitor(state, 0, 0); if (*state == STATE_MWAIT) __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0)); else enable_intr(); *state = STATE_RUNNING; } static void cpu_idle_spin(sbintime_t sbt) { int *state; int i; state = (int *)PCPU_PTR(monitorbuf); *state = STATE_RUNNING; /* * The sched_runnable() call is racy but as long as there is * a loop missing it one time will have just a little impact if any * (and it is much better than missing the check at all). */ for (i = 0; i < 1000; i++) { if (sched_runnable()) return; cpu_spinwait(); } } /* * C1E renders the local APIC timer dead, so we disable it by * reading the Interrupt Pending Message register and clearing * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27). * * Reference: * "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors" * #32559 revision 3.00+ */ #define MSR_AMDK8_IPM 0xc0010055 #define AMDK8_SMIONCMPHALT (1ULL << 27) #define AMDK8_C1EONCMPHALT (1ULL << 28) #define AMDK8_CMPHALT (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT) static void cpu_probe_amdc1e(void) { /* * Detect the presence of C1E capability mostly on latest * dual-cores (or future) k8 family. */ if (cpu_vendor_id == CPU_VENDOR_AMD && (cpu_id & 0x00000f00) == 0x00000f00 && (cpu_id & 0x0fff0000) >= 0x00040000) { cpu_ident_amdc1e = 1; } } void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi; void cpu_idle(int busy) { uint64_t msr; sbintime_t sbt = -1; CTR2(KTR_SPARE2, "cpu_idle(%d) at %d", busy, curcpu); #ifdef MP_WATCHDOG ap_watchdog(PCPU_GET(cpuid)); #endif /* If we are busy - try to use fast methods. 
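The AMD C1E workaround applied a few lines below in cpu_idle() clears bits 27 (SmiOnCmpHalt) and 28 (C1eOnCmpHalt) of MSR 0xc0010055 whenever either is set, using the AMDK8_* masks defined above. The bit arithmetic, with a fabricated MSR value:

#include <stdio.h>
#include <stdint.h>

#define AMDK8_SMIONCMPHALT  (1ULL << 27)
#define AMDK8_C1EONCMPHALT  (1ULL << 28)
#define AMDK8_CMPHALT       (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)

int
main(void)
{
    uint64_t msr = 0x18001234ULL;           /* both bits happen to be set */

    if (msr & AMDK8_CMPHALT)
        msr &= ~AMDK8_CMPHALT;
    printf("%#jx\n", (uintmax_t)msr);       /* 0x1234 */
    return (0);
}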
*/ if (busy) { if ((cpu_feature2 & CPUID2_MON) && idle_mwait) { cpu_idle_mwait(busy); goto out; } } /* If we have time - switch timers into idle mode. */ if (!busy) { critical_enter(); sbt = cpu_idleclock(); } /* Apply AMD APIC timer C1E workaround. */ if (cpu_ident_amdc1e && cpu_disable_c3_sleep) { msr = rdmsr(MSR_AMDK8_IPM); if (msr & AMDK8_CMPHALT) wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT); } /* Call main idle method. */ cpu_idle_fn(sbt); /* Switch timers back into active mode. */ if (!busy) { cpu_activeclock(); critical_exit(); } out: CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done", busy, curcpu); } int cpu_idle_wakeup(int cpu) { struct pcpu *pcpu; int *state; pcpu = pcpu_find(cpu); state = (int *)pcpu->pc_monitorbuf; /* * This doesn't need to be atomic since missing the race will * simply result in unnecessary IPIs. */ if (*state == STATE_SLEEPING) return (0); if (*state == STATE_MWAIT) *state = STATE_RUNNING; return (1); } /* * Ordered by speed/power consumption. */ struct { void *id_fn; char *id_name; } idle_tbl[] = { { cpu_idle_spin, "spin" }, { cpu_idle_mwait, "mwait" }, { cpu_idle_hlt, "hlt" }, { cpu_idle_acpi, "acpi" }, { NULL, NULL } }; static int idle_sysctl_available(SYSCTL_HANDLER_ARGS) { char *avail, *p; int error; int i; avail = malloc(256, M_TEMP, M_WAITOK); p = avail; for (i = 0; idle_tbl[i].id_name != NULL; i++) { if (strstr(idle_tbl[i].id_name, "mwait") && (cpu_feature2 & CPUID2_MON) == 0) continue; if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && cpu_idle_hook == NULL) continue; p += sprintf(p, "%s%s", p != avail ? ", " : "", idle_tbl[i].id_name); } error = sysctl_handle_string(oidp, avail, 0, req); free(avail, M_TEMP); return (error); } SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD, 0, 0, idle_sysctl_available, "A", "list of available idle functions"); static int idle_sysctl(SYSCTL_HANDLER_ARGS) { char buf[16]; int error; char *p; int i; p = "unknown"; for (i = 0; idle_tbl[i].id_name != NULL; i++) { if (idle_tbl[i].id_fn == cpu_idle_fn) { p = idle_tbl[i].id_name; break; } } strncpy(buf, p, sizeof(buf)); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); for (i = 0; idle_tbl[i].id_name != NULL; i++) { if (strstr(idle_tbl[i].id_name, "mwait") && (cpu_feature2 & CPUID2_MON) == 0) continue; if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && cpu_idle_hook == NULL) continue; if (strcmp(idle_tbl[i].id_name, buf)) continue; cpu_idle_fn = idle_tbl[i].id_fn; return (0); } return (EINVAL); } SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, idle_sysctl, "A", "currently selected idle function"); /* * Reset registers to default values on exec. 
*/ void exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) { struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; mtx_lock(&dt_lock); if (td->td_proc->p_md.md_ldt != NULL) user_ldt_free(td); else mtx_unlock(&dt_lock); pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; clear_pcb_flags(pcb, PCB_32BIT); pcb->pcb_initial_fpucw = __INITIAL_FPUCW__; set_pcb_flags(pcb, PCB_FULL_IRET); bzero((char *)regs, sizeof(struct trapframe)); regs->tf_rip = imgp->entry_addr; regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; regs->tf_rdi = stack; /* argv */ regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; td->td_retval[1] = 0; /* * Reset the hardware debug registers if they were in use. * They won't have any meaning for the newly exec'd process. */ if (pcb->pcb_flags & PCB_DBREGS) { pcb->pcb_dr0 = 0; pcb->pcb_dr1 = 0; pcb->pcb_dr2 = 0; pcb->pcb_dr3 = 0; pcb->pcb_dr6 = 0; pcb->pcb_dr7 = 0; if (pcb == curpcb) { /* * Clear the debug registers on the running * CPU, otherwise they will end up affecting * the next process we switch to. */ reset_dbregs(); } clear_pcb_flags(pcb, PCB_DBREGS); } /* * Drop the FP state if we hold it, so that the process gets a * clean FP state if it uses the FPU again. */ fpstate_drop(td); } void cpu_setregs(void) { register_t cr0; cr0 = rcr0(); /* * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the * BSP. See the comments there about why we set them. */ cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; load_cr0(cr0); } /* * Initialize amd64 and configure to run kernel */ /* * Initialize segments & interrupt table */ struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */ static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ static char dblfault_stack[PAGE_SIZE] __aligned(16); static char nmi0_stack[PAGE_SIZE] __aligned(16); CTASSERT(sizeof(struct nmi_pcpu) == 16); struct amd64tss common_tss[MAXCPU]; /* * Software prototypes -- in more palatable form. * * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same * slots as corresponding segments for i386 kernel. 
*/ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GNULL2_SEL 1 Null Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUFS32_SEL 2 32 bit %gs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUGS32_SEL 3 32 bit %fs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GCODE_SEL 4 Code Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_long = 1, .ssd_def32 = 0, .ssd_gran = 1 }, /* GDATA_SEL 5 Data Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_long = 1, .ssd_def32 = 0, .ssd_gran = 1 }, /* GUCODE32_SEL 6 32 bit Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUDATA_SEL 7 32/64 bit Data Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUCODE_SEL 8 64 bit Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 1, .ssd_def32 = 0, .ssd_gran = 1 }, /* GPROC0_SEL 9 Proc 0 Tss Descriptor */ { .ssd_base = 0x0, .ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1, .ssd_type = SDT_SYSTSS, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Actually, the TSS is a system descriptor which is double size */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUSERLDT_SEL 11 LDT Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUSERLDT_SEL 12 LDT Descriptor, double size */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, }; void setidt(idx, func, typ, dpl, ist) int idx; inthand_t *func; int typ; int dpl; int ist; { struct gate_descriptor *ip; ip = idt + idx; ip->gd_looffset = (uintptr_t)func; ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL); ip->gd_ist = ist; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((uintptr_t)func)>>16 ; } extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(xmm), IDTVEC(dblfault), #ifdef KDTRACE_HOOKS IDTVEC(dtrace_ret), #endif #ifdef XENHVM IDTVEC(xen_intr_upcall), #endif IDTVEC(fast_syscall), IDTVEC(fast_syscall32); #ifdef DDB /* * Display the index and function name of any IDT entries that don't use * the default 'rsvd' entry point. 
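setidt() above stores the low 16 bits of a handler address in gd_looffset and the rest in gd_hioffset, and db_show_idt() later reassembles them. A quick check of that split and rebuild with an arbitrary address:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
    uint64_t func = 0xffffffff80a1b2c3ULL;  /* example handler address */
    uint16_t looffset = (uint16_t)func;     /* low 16 bits             */
    uint64_t hioffset = func >> 16;         /* everything above them   */
    uint64_t rebuilt = (hioffset << 16) | looffset;

    printf("match: %d\n", rebuilt == func); /* 1 */
    return (0);
}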
*/ DB_SHOW_COMMAND(idt, db_show_idt) { struct gate_descriptor *ip; int idx; uintptr_t func; ip = idt; for (idx = 0; idx < NIDT && !db_pager_quit; idx++) { func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset); if (func != (uintptr_t)&IDTVEC(rsvd)) { db_printf("%3d\t", idx); db_printsym(func, DB_STGY_PROC); db_printf("\n"); } ip++; } } /* Show privileged registers. */ DB_SHOW_COMMAND(sysregs, db_show_sysregs) { struct { uint16_t limit; uint64_t base; } __packed idtr, gdtr; uint16_t ldt, tr; __asm __volatile("sidt %0" : "=m" (idtr)); db_printf("idtr\t0x%016lx/%04x\n", (u_long)idtr.base, (u_int)idtr.limit); __asm __volatile("sgdt %0" : "=m" (gdtr)); db_printf("gdtr\t0x%016lx/%04x\n", (u_long)gdtr.base, (u_int)gdtr.limit); __asm __volatile("sldt %0" : "=r" (ldt)); db_printf("ldtr\t0x%04x\n", ldt); __asm __volatile("str %0" : "=r" (tr)); db_printf("tr\t0x%04x\n", tr); db_printf("cr0\t0x%016lx\n", rcr0()); db_printf("cr2\t0x%016lx\n", rcr2()); db_printf("cr3\t0x%016lx\n", rcr3()); db_printf("cr4\t0x%016lx\n", rcr4()); db_printf("EFER\t%016lx\n", rdmsr(MSR_EFER)); db_printf("FEATURES_CTL\t%016lx\n", rdmsr(MSR_IA32_FEATURE_CONTROL)); db_printf("DEBUG_CTL\t%016lx\n", rdmsr(MSR_DEBUGCTLMSR)); db_printf("PAT\t%016lx\n", rdmsr(MSR_PAT)); db_printf("GSBASE\t%016lx\n", rdmsr(MSR_GSBASE)); } #endif void sdtossd(sd, ssd) struct user_segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_long = sd->sd_long; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } void ssdtosd(ssd, sd) struct soft_segment_descriptor *ssd; struct user_segment_descriptor *sd; { sd->sd_lobase = (ssd->ssd_base) & 0xffffff; sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff; sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; sd->sd_type = ssd->ssd_type; sd->sd_dpl = ssd->ssd_dpl; sd->sd_p = ssd->ssd_p; sd->sd_long = ssd->ssd_long; sd->sd_def32 = ssd->ssd_def32; sd->sd_gran = ssd->ssd_gran; } void ssdtosyssd(ssd, sd) struct soft_segment_descriptor *ssd; struct system_segment_descriptor *sd; { sd->sd_lobase = (ssd->ssd_base) & 0xffffff; sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful; sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; sd->sd_type = ssd->ssd_type; sd->sd_dpl = ssd->ssd_dpl; sd->sd_p = ssd->ssd_p; sd->sd_gran = ssd->ssd_gran; } #if !defined(DEV_ATPIC) && defined(DEV_ISA) #include #include /* * Return a bitmap of the current interrupt requests. This is 8259-specific * and is only suitable for use at probe time. * This is only here to pacify sio. It is NOT FATAL if this doesn't work. * It shouldn't be here. There should probably be an APIC centric * implementation in the apic driver code, if at all. */ intrmask_t isa_irq_pending(void) { u_char irr1; u_char irr2; irr1 = inb(IO_ICU1); irr2 = inb(IO_ICU2); return ((irr2 << 8) | irr1); } #endif u_int basemem; static int add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap, int *physmap_idxp) { int i, insert_idx, physmap_idx; physmap_idx = *physmap_idxp; if (length == 0) return (1); /* * Find insertion point while checking for overlap. Start off by * assuming the new entry will be added to the end. 
*/ insert_idx = physmap_idx + 2; for (i = 0; i <= physmap_idx; i += 2) { if (base < physmap[i + 1]) { if (base + length <= physmap[i]) { insert_idx = i; break; } if (boothowto & RB_VERBOSE) printf( "Overlapping memory regions, ignoring second region\n"); return (1); } } /* See if we can prepend to the next entry. */ if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) { physmap[insert_idx] = base; return (1); } /* See if we can append to the previous entry. */ if (insert_idx > 0 && base == physmap[insert_idx - 1]) { physmap[insert_idx - 1] += length; return (1); } physmap_idx += 2; *physmap_idxp = physmap_idx; if (physmap_idx == PHYSMAP_SIZE) { printf( "Too many segments in the physical address map, giving up\n"); return (0); } /* * Move the last 'N' entries down to make room for the new * entry if needed. */ for (i = physmap_idx; i > insert_idx; i -= 2) { physmap[i] = physmap[i - 2]; physmap[i + 1] = physmap[i - 1]; } /* Insert the new entry. */ physmap[insert_idx] = base; physmap[insert_idx + 1] = base + length; return (1); } void bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize, vm_paddr_t *physmap, int *physmap_idx) { struct bios_smap *smap, *smapend; smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); for (smap = smapbase; smap < smapend; smap++) { if (boothowto & RB_VERBOSE) printf("SMAP type=%02x base=%016lx len=%016lx\n", smap->type, smap->base, smap->length); if (smap->type != SMAP_TYPE_MEMORY) continue; if (!add_physmap_entry(smap->base, smap->length, physmap, physmap_idx)) break; } } #define efi_next_descriptor(ptr, size) \ ((struct efi_md *)(((uint8_t *) ptr) + size)) static void add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap, int *physmap_idx) { struct efi_md *map, *p; const char *type; size_t efisz; int ndesc, i; static const char *types[] = { "Reserved", "LoaderCode", "LoaderData", "BootServicesCode", "BootServicesData", "RuntimeServicesCode", "RuntimeServicesData", "ConventionalMemory", "UnusableMemory", "ACPIReclaimMemory", "ACPIMemoryNVS", "MemoryMappedIO", "MemoryMappedIOPortSpace", "PalCode" }; /* * Memory map data provided by UEFI via the GetMemoryMap * Boot Services API. */ efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf; map = (struct efi_md *)((uint8_t *)efihdr + efisz); if (efihdr->descriptor_size == 0) return; ndesc = efihdr->memory_size / efihdr->descriptor_size; if (boothowto & RB_VERBOSE) printf("%23s %12s %12s %8s %4s\n", "Type", "Physical", "Virtual", "#Pages", "Attr"); for (i = 0, p = map; i < ndesc; i++, p = efi_next_descriptor(p, efihdr->descriptor_size)) { if (boothowto & RB_VERBOSE) { if (p->md_type <= EFI_MD_TYPE_PALCODE) type = types[p->md_type]; else type = ""; printf("%23s %012lx %12p %08lx ", type, p->md_phys, p->md_virt, p->md_pages); if (p->md_attr & EFI_MD_ATTR_UC) printf("UC "); if (p->md_attr & EFI_MD_ATTR_WC) printf("WC "); if (p->md_attr & EFI_MD_ATTR_WT) printf("WT "); if (p->md_attr & EFI_MD_ATTR_WB) printf("WB "); if (p->md_attr & EFI_MD_ATTR_UCE) printf("UCE "); if (p->md_attr & EFI_MD_ATTR_WP) printf("WP "); if (p->md_attr & EFI_MD_ATTR_RP) printf("RP "); if (p->md_attr & EFI_MD_ATTR_XP) printf("XP "); if (p->md_attr & EFI_MD_ATTR_RT) printf("RUNTIME"); printf("\n"); } switch (p->md_type) { case EFI_MD_TYPE_CODE: case EFI_MD_TYPE_DATA: case EFI_MD_TYPE_BS_CODE: case EFI_MD_TYPE_BS_DATA: case EFI_MD_TYPE_FREE: /* * We're allowed to use any entry with these types. 
*/ break; default: continue; } if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE), physmap, physmap_idx)) break; } } static char bootmethod[16] = ""; SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0, "System firmware boot method"); static void native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx) { struct bios_smap *smap; struct efi_map_header *efihdr; u_int32_t size; /* * Memory map from INT 15:E820. * * subr_module.c says: * "Consumer may safely assume that size value precedes data." * ie: an int32_t immediately precedes smap. */ efihdr = (struct efi_map_header *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP); smap = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (efihdr == NULL && smap == NULL) panic("No BIOS smap or EFI map info from loader!"); if (efihdr != NULL) { add_efi_map_entries(efihdr, physmap, physmap_idx); strlcpy(bootmethod, "UEFI", sizeof(bootmethod)); } else { size = *((u_int32_t *)smap - 1); bios_add_smap_entries(smap, size, physmap, physmap_idx); strlcpy(bootmethod, "BIOS", sizeof(bootmethod)); } } +#define PAGES_PER_GB (1024 * 1024 * 1024 / PAGE_SIZE) + /* * Populate the (physmap) array with base/bound pairs describing the * available physical memory in the system, then test this memory and * build the phys_avail array describing the actually-available memory. * * Total memory size may be set by the kernel environment variable * hw.physmem or the compile-time define MAXMEM. * * XXX first should be vm_paddr_t. */ static void getmemsize(caddr_t kmdp, u_int64_t first) { int i, physmap_idx, pa_indx, da_indx; vm_paddr_t pa, physmap[PHYSMAP_SIZE]; u_long physmem_start, physmem_tunable, memtest; pt_entry_t *pte; quad_t dcons_addr, dcons_size; + int page_counter; bzero(physmap, sizeof(physmap)); basemem = 0; physmap_idx = 0; init_ops.parse_memmap(kmdp, physmap, &physmap_idx); /* * Find the 'base memory' segment for SMP */ basemem = 0; for (i = 0; i <= physmap_idx; i += 2) { if (physmap[i] == 0x00000000) { basemem = physmap[i + 1] / 1024; break; } } if (basemem == 0) panic("BIOS smap did not include a basemem segment!"); /* * Make hole for "AP -> long mode" bootstrap code. The * mp_bootaddress vector is only available when the kernel * is configured to support APs and APs for the system start * in 32bit mode (e.g. SMP bare metal). */ if (init_ops.mp_bootaddress) physmap[1] = init_ops.mp_bootaddress(physmap[1] / 1024); /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". We may adjust this * based on ``hw.physmem'' and the results of the memory test. */ Maxmem = atop(physmap[physmap_idx + 1]); #ifdef MAXMEM Maxmem = MAXMEM / 4; #endif if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) Maxmem = atop(physmem_tunable); /* * The boot memory test is disabled by default, as it takes a * significant amount of time on large-memory systems, and is * unfriendly to virtual machines as it unnecessarily touches all * pages. * * A general name is used as the code may be extended to support * additional tests beyond the current "page present" test. */ memtest = 0; TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest); /* * Don't allow MAXMEM or hw.physmem to extend the amount of memory * in the system. 
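 * Both the hw.physmem cap and the hw.memtest.tests knob fetched above are
 * loader tunables; purely as an illustration, /boot/loader.conf could contain
 *
 *	hw.physmem="4G"
 *	hw.memtest.tests="1"
 *
 * to cap usable RAM at 4GB and enable the boot-time memory test
 * (size suffixes such as G are accepted by the tunable parser).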
*/ if (Maxmem > atop(physmap[physmap_idx + 1])) Maxmem = atop(physmap[physmap_idx + 1]); if (atop(physmap[physmap_idx + 1]) != Maxmem && (boothowto & RB_VERBOSE)) printf("Physical memory use set to %ldK\n", Maxmem * 4); /* call pmap initialization to make new kernel address space */ pmap_bootstrap(&first); /* * Size up each available chunk of physical memory. * * XXX Some BIOSes corrupt low 64KB between suspend and resume. * By default, mask off the first 16 pages unless we appear to be * running in a VM. */ physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT; TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start); if (physmem_start < PAGE_SIZE) physmap[0] = PAGE_SIZE; else if (physmem_start >= physmap[1]) physmap[0] = round_page(physmap[1] - PAGE_SIZE); else physmap[0] = round_page(physmem_start); pa_indx = 0; da_indx = 1; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; dump_avail[da_indx] = physmap[0]; pte = CMAP1; /* * Get dcons buffer address */ if (getenv_quad("dcons.addr", &dcons_addr) == 0 || getenv_quad("dcons.size", &dcons_size) == 0) dcons_addr = 0; /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ + page_counter = 0; + if (memtest != 0) + printf("Testing system memory"); for (i = 0; i <= physmap_idx; i += 2) { vm_paddr_t end; end = ptoa((vm_paddr_t)Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad, full; int *ptr = (int *)CADDR1; full = FALSE; /* * block out kernel memory as not available. */ if (pa >= (vm_paddr_t)kernphys && pa < first) goto do_dump_avail; /* * block out dcons buffer */ if (dcons_addr > 0 && pa >= trunc_page(dcons_addr) && pa < dcons_addr + dcons_size) goto do_dump_avail; page_bad = FALSE; if (memtest == 0) goto skip_memtest; /* + * Print a "." every GB to show we're making + * progress. + */ + page_counter++; + if ((page_counter % PAGES_PER_GB) == 0) + printf("."); + + /* * map page into kernel: valid, read/write,non-cacheable */ *pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD; invltlb(); tmp = *(int *)ptr; /* * Test for alternating 1's and 0's */ *(volatile int *)ptr = 0xaaaaaaaa; if (*(volatile int *)ptr != 0xaaaaaaaa) page_bad = TRUE; /* * Test for alternating 0's and 1's */ *(volatile int *)ptr = 0x55555555; if (*(volatile int *)ptr != 0x55555555) page_bad = TRUE; /* * Test for all 1's */ *(volatile int *)ptr = 0xffffffff; if (*(volatile int *)ptr != 0xffffffff) page_bad = TRUE; /* * Test for all 0's */ *(volatile int *)ptr = 0x0; if (*(volatile int *)ptr != 0x0) page_bad = TRUE; /* * Restore original value. */ *(int *)ptr = tmp; skip_memtest: /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) continue; /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. 
*/ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf( "Too many holes in the physical address space, giving up\n"); pa_indx--; full = TRUE; goto do_dump_avail; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; do_dump_avail: if (dump_avail[da_indx] == pa) { dump_avail[da_indx] += PAGE_SIZE; } else { da_indx++; if (da_indx == DUMP_AVAIL_ARRAY_END) { da_indx--; goto do_next; } dump_avail[da_indx++] = pa; /* start */ dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ } do_next: if (full) break; } } *pte = 0; invltlb(); + if (memtest != 0) + printf("\n"); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). */ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(msgbufsize) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(msgbufsize); /* Map the message buffer. */ msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]); } static caddr_t native_parse_preload_data(u_int64_t modulep) { caddr_t kmdp; #ifdef DDB vm_offset_t ksym_start; vm_offset_t ksym_end; #endif preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE); preload_bootstrap_relocate(KERNBASE); kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + KERNBASE; #ifdef DDB ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t); ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t); db_fetch_ksymtab(ksym_start, ksym_end); #endif return (kmdp); } u_int64_t hammer_time(u_int64_t modulep, u_int64_t physfree) { caddr_t kmdp; int gsel_tss, x; struct pcpu *pc; struct nmi_pcpu *np; struct xstate_hdr *xhdr; u_int64_t msr; char *env; size_t kstack0_sz; thread0.td_kstack = physfree + KERNBASE; thread0.td_kstack_pages = KSTACK_PAGES; kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE; bzero((void *)thread0.td_kstack, kstack0_sz); physfree += kstack0_sz; /* * This may be done better later if it gets more high level * components in it. If so just link td->td_proc here. 
*/ proc_linkup0(&proc0, &thread0); kmdp = init_ops.parse_preload_data(modulep); /* Init basic tunables, hz etc */ init_param1(); /* * make gdt memory segments */ for (x = 0; x < NGDT; x++) { if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) && x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1) ssdtosd(&gdt_segs[x], &gdt[x]); } gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0]; ssdtosyssd(&gdt_segs[GPROC0_SEL], (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (long) gdt; lgdt(&r_gdt); pc = &__pcpu[0]; wrmsr(MSR_FSBASE, 0); /* User value */ wrmsr(MSR_GSBASE, (u_int64_t)pc); wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */ pcpu_init(pc, 0, sizeof(struct pcpu)); dpcpu_init((void *)(physfree + KERNBASE), 0); physfree += DPCPU_SIZE; PCPU_SET(prvspace, pc); PCPU_SET(curthread, &thread0); PCPU_SET(tssp, &common_tss[0]); PCPU_SET(commontssp, &common_tss[0]); PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]); PCPU_SET(fs32p, &gdt[GUFS32_SEL]); PCPU_SET(gs32p, &gdt[GUGS32_SEL]); /* * Initialize mutexes. * * icu_lock: in order to allow an interrupt to occur in a critical * section, to set pcpu->ipending (etc...) properly, we * must be able to get the icu lock, so it can't be * under witness. */ mutex_init(); mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS); mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF); /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2); setidt(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0); setidt(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1); setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0); #ifdef KDTRACE_HOOKS setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0); #endif #ifdef XENHVM setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_UPL, 0); #endif r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (long) idt; lidt(&r_idt); /* * Initialize the clock before the console so that console * initialization can use DELAY(). */ clock_init(); /* * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4) * transition). */ if (kmdp != NULL && preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP) != NULL) vty_set_preferred(VTY_VT); /* * Initialize the console before we print anything out. */ cninit(); #ifdef DEV_ISA #ifdef DEV_ATPIC elcr_probe(); atpic_startup(); #else /* Reset and mask the atpics and leave them shut down. */ atpic_reset(); /* * Point the ICU spurious interrupt vectors at the APIC spurious * interrupt handler. 
*/ setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0); #endif #else #error "have you forgotten the isa device?"; #endif kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); #endif identify_cpu(); /* Final stage of CPU initialization */ initializecpu(); /* Initialize CPU registers */ initializecpucache(); /* doublefault stack space, runs on ist1 */ common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)]; /* * NMI stack, runs on ist2. The pcpu pointer is stored just * above the start of the ist2 stack. */ np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1; np->np_pcpu = (register_t) pc; common_tss[0].tss_ist2 = (long) np; /* Set the IO permission bitmap (empty due to tss seg limit) */ common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE; gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); ltr(gsel_tss); /* Set up the fast syscall stuff */ msr = rdmsr(MSR_EFER) | EFER_SCE; wrmsr(MSR_EFER, msr); wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall)); wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32)); msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48); wrmsr(MSR_STAR, msr); wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D); getmemsize(kmdp, physfree); init_param2(physmem); /* now running on new page tables, configured,and u/iom is accessible */ msgbufinit(msgbufp, msgbufsize); fpuinit(); /* * Set up thread0 pcb after fpuinit calculated pcb + fpu save * area size. Zero out the extended state header in fpu save * area. */ thread0.td_pcb = get_pcb_td(&thread0); bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size); if (use_xsave) { xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) + 1); xhdr->xstate_bv = xsave_mask; } /* make an initial tss so cpu can get interrupt stack on syscall! */ common_tss[0].tss_rsp0 = (vm_offset_t)thread0.td_pcb; /* Ensure the stack is aligned to 16 bytes */ common_tss[0].tss_rsp0 &= ~0xFul; PCPU_SET(rsp0, common_tss[0].tss_rsp0); PCPU_SET(curpcb, thread0.td_pcb); /* transfer to user mode */ _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); _udatasel = GSEL(GUDATA_SEL, SEL_UPL); _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL); _ufssel = GSEL(GUFS32_SEL, SEL_UPL); _ugssel = GSEL(GUGS32_SEL, SEL_UPL); load_ds(_udatasel); load_es(_udatasel); load_fs(_ufssel); /* setup proc 0's pcb */ thread0.td_pcb->pcb_flags = 0; thread0.td_pcb->pcb_cr3 = KPML4phys; /* PCID 0 is reserved for kernel */ thread0.td_frame = &proc0_tf; env = kern_getenv("kernelname"); if (env != NULL) strlcpy(kernelname, env, sizeof(kernelname)); cpu_probe_amdc1e(); #ifdef FDT x86_init_fdt(); #endif /* Location of kernel stack for locore */ return ((u_int64_t)thread0.td_pcb); } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { pcpu->pc_acpi_id = 0xffffffff; } static int smap_sysctl_handler(SYSCTL_HANDLER_ARGS) { struct bios_smap *smapbase; struct bios_smap_xattr smap; caddr_t kmdp; uint32_t *smapattr; int count, error, i; /* Retrieve the system memory map from the loader. 
*/ kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); smapbase = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (smapbase == NULL) return (0); smapattr = (uint32_t *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP_XATTR); count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase); error = 0; for (i = 0; i < count; i++) { smap.base = smapbase[i].base; smap.length = smapbase[i].length; smap.type = smapbase[i].type; if (smapattr != NULL) smap.xattr = smapattr[i]; else smap.xattr = 0; error = SYSCTL_OUT(req, &smap, sizeof(smap)); } return (error); } SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0, smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data"); static int efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS) { struct efi_map_header *efihdr; caddr_t kmdp; uint32_t efisize; kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); efihdr = (struct efi_map_header *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP); if (efihdr == NULL) return (0); efisize = *((uint32_t *)efihdr - 1); return (SYSCTL_OUT(req, efihdr, efisize)); } SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0, efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map"); void spinlock_enter(void) { struct thread *td; register_t flags; td = curthread; if (td->td_md.md_spinlock_count == 0) { flags = intr_disable(); td->td_md.md_spinlock_count = 1; td->td_md.md_saved_flags = flags; } else td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; register_t flags; td = curthread; critical_exit(); flags = td->td_md.md_saved_flags; td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) intr_restore(flags); } /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. 
*/ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_r12 = tf->tf_r12; pcb->pcb_r13 = tf->tf_r13; pcb->pcb_r14 = tf->tf_r14; pcb->pcb_r15 = tf->tf_r15; pcb->pcb_rbp = tf->tf_rbp; pcb->pcb_rbx = tf->tf_rbx; pcb->pcb_rip = tf->tf_rip; pcb->pcb_rsp = tf->tf_rsp; } int ptrace_set_pc(struct thread *td, unsigned long addr) { td->td_frame->tf_rip = addr; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (0); } int ptrace_single_step(struct thread *td) { td->td_frame->tf_rflags |= PSL_T; return (0); } int ptrace_clear_single_step(struct thread *td) { td->td_frame->tf_rflags &= ~PSL_T; return (0); } int fill_regs(struct thread *td, struct reg *regs) { struct trapframe *tp; tp = td->td_frame; return (fill_frame_regs(tp, regs)); } int fill_frame_regs(struct trapframe *tp, struct reg *regs) { regs->r_r15 = tp->tf_r15; regs->r_r14 = tp->tf_r14; regs->r_r13 = tp->tf_r13; regs->r_r12 = tp->tf_r12; regs->r_r11 = tp->tf_r11; regs->r_r10 = tp->tf_r10; regs->r_r9 = tp->tf_r9; regs->r_r8 = tp->tf_r8; regs->r_rdi = tp->tf_rdi; regs->r_rsi = tp->tf_rsi; regs->r_rbp = tp->tf_rbp; regs->r_rbx = tp->tf_rbx; regs->r_rdx = tp->tf_rdx; regs->r_rcx = tp->tf_rcx; regs->r_rax = tp->tf_rax; regs->r_rip = tp->tf_rip; regs->r_cs = tp->tf_cs; regs->r_rflags = tp->tf_rflags; regs->r_rsp = tp->tf_rsp; regs->r_ss = tp->tf_ss; if (tp->tf_flags & TF_HASSEGS) { regs->r_ds = tp->tf_ds; regs->r_es = tp->tf_es; regs->r_fs = tp->tf_fs; regs->r_gs = tp->tf_gs; } else { regs->r_ds = 0; regs->r_es = 0; regs->r_fs = 0; regs->r_gs = 0; } return (0); } int set_regs(struct thread *td, struct reg *regs) { struct trapframe *tp; register_t rflags; tp = td->td_frame; rflags = regs->r_rflags & 0xffffffff; if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); tp->tf_r15 = regs->r_r15; tp->tf_r14 = regs->r_r14; tp->tf_r13 = regs->r_r13; tp->tf_r12 = regs->r_r12; tp->tf_r11 = regs->r_r11; tp->tf_r10 = regs->r_r10; tp->tf_r9 = regs->r_r9; tp->tf_r8 = regs->r_r8; tp->tf_rdi = regs->r_rdi; tp->tf_rsi = regs->r_rsi; tp->tf_rbp = regs->r_rbp; tp->tf_rbx = regs->r_rbx; tp->tf_rdx = regs->r_rdx; tp->tf_rcx = regs->r_rcx; tp->tf_rax = regs->r_rax; tp->tf_rip = regs->r_rip; tp->tf_cs = regs->r_cs; tp->tf_rflags = rflags; tp->tf_rsp = regs->r_rsp; tp->tf_ss = regs->r_ss; if (0) { /* XXXKIB */ tp->tf_ds = regs->r_ds; tp->tf_es = regs->r_es; tp->tf_fs = regs->r_fs; tp->tf_gs = regs->r_gs; tp->tf_flags = TF_HASSEGS; } set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (0); } /* XXX check all this stuff! 
*/ /* externalize from sv_xmm */ static void fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs) { struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env; struct envxmm *penv_xmm = &sv_xmm->sv_env; int i; /* pcb -> fpregs */ bzero(fpregs, sizeof(*fpregs)); /* FPU control/status */ penv_fpreg->en_cw = penv_xmm->en_cw; penv_fpreg->en_sw = penv_xmm->en_sw; penv_fpreg->en_tw = penv_xmm->en_tw; penv_fpreg->en_opcode = penv_xmm->en_opcode; penv_fpreg->en_rip = penv_xmm->en_rip; penv_fpreg->en_rdp = penv_xmm->en_rdp; penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr; penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask; /* FPU registers */ for (i = 0; i < 8; ++i) bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10); /* SSE registers */ for (i = 0; i < 16; ++i) bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16); } /* internalize from fpregs into sv_xmm */ static void set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm) { struct envxmm *penv_xmm = &sv_xmm->sv_env; struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env; int i; /* fpregs -> pcb */ /* FPU control/status */ penv_xmm->en_cw = penv_fpreg->en_cw; penv_xmm->en_sw = penv_fpreg->en_sw; penv_xmm->en_tw = penv_fpreg->en_tw; penv_xmm->en_opcode = penv_fpreg->en_opcode; penv_xmm->en_rip = penv_fpreg->en_rip; penv_xmm->en_rdp = penv_fpreg->en_rdp; penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr; penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask; /* FPU registers */ for (i = 0; i < 8; ++i) bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10); /* SSE registers */ for (i = 0; i < 16; ++i) bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16); } /* externalize from td->pcb */ int fill_fpregs(struct thread *td, struct fpreg *fpregs) { KASSERT(td == curthread || TD_IS_SUSPENDED(td) || P_SHOULDSTOP(td->td_proc), ("not suspended thread %p", td)); fpugetregs(td); fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs); return (0); } /* internalize to td->pcb */ int set_fpregs(struct thread *td, struct fpreg *fpregs) { set_fpregs_xmm(fpregs, get_pcb_user_save_td(td)); fpuuserinited(td); return (0); } /* * Get machine context. */ int get_mcontext(struct thread *td, mcontext_t *mcp, int flags) { struct pcb *pcb; struct trapframe *tp; pcb = td->td_pcb; tp = td->td_frame; PROC_LOCK(curthread->td_proc); mcp->mc_onstack = sigonstack(tp->tf_rsp); PROC_UNLOCK(curthread->td_proc); mcp->mc_r15 = tp->tf_r15; mcp->mc_r14 = tp->tf_r14; mcp->mc_r13 = tp->tf_r13; mcp->mc_r12 = tp->tf_r12; mcp->mc_r11 = tp->tf_r11; mcp->mc_r10 = tp->tf_r10; mcp->mc_r9 = tp->tf_r9; mcp->mc_r8 = tp->tf_r8; mcp->mc_rdi = tp->tf_rdi; mcp->mc_rsi = tp->tf_rsi; mcp->mc_rbp = tp->tf_rbp; mcp->mc_rbx = tp->tf_rbx; mcp->mc_rcx = tp->tf_rcx; mcp->mc_rflags = tp->tf_rflags; if (flags & GET_MC_CLEAR_RET) { mcp->mc_rax = 0; mcp->mc_rdx = 0; mcp->mc_rflags &= ~PSL_C; } else { mcp->mc_rax = tp->tf_rax; mcp->mc_rdx = tp->tf_rdx; } mcp->mc_rip = tp->tf_rip; mcp->mc_cs = tp->tf_cs; mcp->mc_rsp = tp->tf_rsp; mcp->mc_ss = tp->tf_ss; mcp->mc_ds = tp->tf_ds; mcp->mc_es = tp->tf_es; mcp->mc_fs = tp->tf_fs; mcp->mc_gs = tp->tf_gs; mcp->mc_flags = tp->tf_flags; mcp->mc_len = sizeof(*mcp); get_fpcontext(td, mcp, NULL, 0); mcp->mc_fsbase = pcb->pcb_fsbase; mcp->mc_gsbase = pcb->pcb_gsbase; mcp->mc_xfpustate = 0; mcp->mc_xfpustate_len = 0; bzero(mcp->mc_spare, sizeof(mcp->mc_spare)); return (0); } /* * Set machine context. * * However, we don't set any but the user modifiable flags, and we won't * touch the cs selector. 
*/ int set_mcontext(struct thread *td, const mcontext_t *mcp) { struct pcb *pcb; struct trapframe *tp; char *xfpustate; long rflags; int ret; pcb = td->td_pcb; tp = td->td_frame; if (mcp->mc_len != sizeof(*mcp) || (mcp->mc_flags & ~_MC_FLAG_MASK) != 0) return (EINVAL); rflags = (mcp->mc_rflags & PSL_USERCHANGE) | (tp->tf_rflags & ~PSL_USERCHANGE); if (mcp->mc_flags & _MC_HASFPXSTATE) { if (mcp->mc_xfpustate_len > cpu_max_ext_state_size - sizeof(struct savefpu)) return (EINVAL); xfpustate = __builtin_alloca(mcp->mc_xfpustate_len); ret = copyin((void *)mcp->mc_xfpustate, xfpustate, mcp->mc_xfpustate_len); if (ret != 0) return (ret); } else xfpustate = NULL; ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len); if (ret != 0) return (ret); tp->tf_r15 = mcp->mc_r15; tp->tf_r14 = mcp->mc_r14; tp->tf_r13 = mcp->mc_r13; tp->tf_r12 = mcp->mc_r12; tp->tf_r11 = mcp->mc_r11; tp->tf_r10 = mcp->mc_r10; tp->tf_r9 = mcp->mc_r9; tp->tf_r8 = mcp->mc_r8; tp->tf_rdi = mcp->mc_rdi; tp->tf_rsi = mcp->mc_rsi; tp->tf_rbp = mcp->mc_rbp; tp->tf_rbx = mcp->mc_rbx; tp->tf_rdx = mcp->mc_rdx; tp->tf_rcx = mcp->mc_rcx; tp->tf_rax = mcp->mc_rax; tp->tf_rip = mcp->mc_rip; tp->tf_rflags = rflags; tp->tf_rsp = mcp->mc_rsp; tp->tf_ss = mcp->mc_ss; tp->tf_flags = mcp->mc_flags; if (tp->tf_flags & TF_HASSEGS) { tp->tf_ds = mcp->mc_ds; tp->tf_es = mcp->mc_es; tp->tf_fs = mcp->mc_fs; tp->tf_gs = mcp->mc_gs; } if (mcp->mc_flags & _MC_HASBASES) { pcb->pcb_fsbase = mcp->mc_fsbase; pcb->pcb_gsbase = mcp->mc_gsbase; } set_pcb_flags(pcb, PCB_FULL_IRET); return (0); } static void get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, size_t xfpusave_len) { size_t max_len, len; mcp->mc_ownedfp = fpugetregs(td); bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0], sizeof(mcp->mc_fpstate)); mcp->mc_fpformat = fpuformat(); if (!use_xsave || xfpusave_len == 0) return; max_len = cpu_max_ext_state_size - sizeof(struct savefpu); len = xfpusave_len; if (len > max_len) { len = max_len; bzero(xfpusave + max_len, len - max_len); } mcp->mc_flags |= _MC_HASFPXSTATE; mcp->mc_xfpustate_len = len; bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len); } static int set_fpcontext(struct thread *td, const mcontext_t *mcp, char *xfpustate, size_t xfpustate_len) { struct savefpu *fpstate; int error; if (mcp->mc_fpformat == _MC_FPFMT_NODEV) return (0); else if (mcp->mc_fpformat != _MC_FPFMT_XMM) return (EINVAL); else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) { /* We don't care what state is left in the FPU or PCB. */ fpstate_drop(td); error = 0; } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || mcp->mc_ownedfp == _MC_FPOWNED_PCB) { fpstate = (struct savefpu *)&mcp->mc_fpstate; fpstate->sv_env.en_mxcsr &= cpu_mxcsr_mask; error = fpusetregs(td, fpstate, xfpustate, xfpustate_len); } else return (EINVAL); return (error); } void fpstate_drop(struct thread *td) { KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu")); critical_enter(); if (PCPU_GET(fpcurthread) == td) fpudrop(); /* * XXX force a full drop of the fpu. The above only drops it if we * owned it. * * XXX I don't much like fpugetuserregs()'s semantics of doing a full * drop. Dropping only to the pcb matches fnsave's behaviour. * We only need to drop to !PCB_INITDONE in sendsig(). But * sendsig() is the only caller of fpugetuserregs()... perhaps we just * have too many layers. 
*/ clear_pcb_flags(curthread->td_pcb, PCB_FPUINITDONE | PCB_USERFPUINITDONE); critical_exit(); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; if (td == NULL) { dbregs->dr[0] = rdr0(); dbregs->dr[1] = rdr1(); dbregs->dr[2] = rdr2(); dbregs->dr[3] = rdr3(); dbregs->dr[6] = rdr6(); dbregs->dr[7] = rdr7(); } else { pcb = td->td_pcb; dbregs->dr[0] = pcb->pcb_dr0; dbregs->dr[1] = pcb->pcb_dr1; dbregs->dr[2] = pcb->pcb_dr2; dbregs->dr[3] = pcb->pcb_dr3; dbregs->dr[6] = pcb->pcb_dr6; dbregs->dr[7] = pcb->pcb_dr7; } dbregs->dr[4] = 0; dbregs->dr[5] = 0; dbregs->dr[8] = 0; dbregs->dr[9] = 0; dbregs->dr[10] = 0; dbregs->dr[11] = 0; dbregs->dr[12] = 0; dbregs->dr[13] = 0; dbregs->dr[14] = 0; dbregs->dr[15] = 0; return (0); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; int i; if (td == NULL) { load_dr0(dbregs->dr[0]); load_dr1(dbregs->dr[1]); load_dr2(dbregs->dr[2]); load_dr3(dbregs->dr[3]); load_dr6(dbregs->dr[6]); load_dr7(dbregs->dr[7]); } else { /* * Don't let an illegal value for dr7 get set. Specifically, * check for undefined settings. Setting these bit patterns * result in undefined behaviour and can lead to an unexpected * TRCTRAP or a general protection fault right here. * Upper bits of dr6 and dr7 must not be set */ for (i = 0; i < 4; i++) { if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) return (EINVAL); if (td->td_frame->tf_cs == _ucode32sel && DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8) return (EINVAL); } if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 || (dbregs->dr[7] & 0xffffffff00000000ul) != 0) return (EINVAL); pcb = td->td_pcb; /* * Don't let a process set a breakpoint that is not within the * process's address space. If a process could do this, it * could halt the system by setting a breakpoint in the kernel * (if ddb was enabled). Thus, we need to check to make sure * that no breakpoints are being enabled for addresses outside * process's address space. * * XXX - what about when the watched area of the user's * address space is written into from within the kernel * ... wouldn't that still cause a breakpoint to be generated * from within kernel mode? */ if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { /* dr0 is enabled */ if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { /* dr1 is enabled */ if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { /* dr2 is enabled */ if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { /* dr3 is enabled */ if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) return (EINVAL); } pcb->pcb_dr0 = dbregs->dr[0]; pcb->pcb_dr1 = dbregs->dr[1]; pcb->pcb_dr2 = dbregs->dr[2]; pcb->pcb_dr3 = dbregs->dr[3]; pcb->pcb_dr6 = dbregs->dr[6]; pcb->pcb_dr7 = dbregs->dr[7]; set_pcb_flags(pcb, PCB_DBREGS); } return (0); } void reset_dbregs(void) { load_dr7(0); /* Turn off the control bits first */ load_dr0(0); load_dr1(0); load_dr2(0); load_dr3(0); load_dr6(0); } /* * Return > 0 if a hardware breakpoint has been hit, and the * breakpoint was in user space. Return 0, otherwise. 
*/ int user_dbreg_trap(void) { u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */ u_int64_t bp; /* breakpoint bits extracted from dr6 */ int nbp; /* number of breakpoints that triggered */ caddr_t addr[4]; /* breakpoint addresses */ int i; dr7 = rdr7(); if ((dr7 & 0x000000ff) == 0) { /* * all GE and LE bits in the dr7 register are zero, * thus the trap couldn't have been caused by the * hardware debug registers */ return 0; } nbp = 0; dr6 = rdr6(); bp = dr6 & 0x0000000f; if (!bp) { /* * None of the breakpoint bits are set meaning this * trap was not caused by any of the debug registers */ return 0; } /* * at least one of the breakpoints were hit, check to see * which ones and if any of them are user space addresses */ if (bp & 0x01) { addr[nbp++] = (caddr_t)rdr0(); } if (bp & 0x02) { addr[nbp++] = (caddr_t)rdr1(); } if (bp & 0x04) { addr[nbp++] = (caddr_t)rdr2(); } if (bp & 0x08) { addr[nbp++] = (caddr_t)rdr3(); } for (i = 0; i < nbp; i++) { if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { /* * addr[i] is in user space */ return nbp; } } /* * None of the breakpoints are in user space. */ return 0; } #ifdef KDB /* * Provide inb() and outb() as functions. They are normally only available as * inline functions, thus cannot be called from the debugger. */ /* silence compiler warnings */ u_char inb_(u_short); void outb_(u_short, u_char); u_char inb_(u_short port) { return inb(port); } void outb_(u_short port, u_char data) { outb(port, data); } #endif /* KDB */ Index: projects/building-blocks/sys/arm/ti/am335x/am335x_lcd.c =================================================================== --- projects/building-blocks/sys/arm/ti/am335x/am335x_lcd.c (revision 277723) +++ projects/building-blocks/sys/arm/ti/am335x/am335x_lcd.c (revision 277724) @@ -1,714 +1,759 @@ /*- * Copyright 2013 Oleksandr Tymoshenko * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); +#include "opt_syscons.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include - -/* syscons bits */ #include #include #include #include #include #include #include #include +#ifdef DEV_SC #include +#else /* VT */ +#include +#endif #include #include #include "am335x_lcd.h" #include "am335x_pwm.h" +#include "fb_if.h" + #define LCD_PID 0x00 #define LCD_CTRL 0x04 #define CTRL_DIV_MASK 0xff #define CTRL_DIV_SHIFT 8 #define CTRL_AUTO_UFLOW_RESTART (1 << 1) #define CTRL_RASTER_MODE 1 #define CTRL_LIDD_MODE 0 #define LCD_LIDD_CTRL 0x0C #define LCD_LIDD_CS0_CONF 0x10 #define LCD_LIDD_CS0_ADDR 0x14 #define LCD_LIDD_CS0_DATA 0x18 #define LCD_LIDD_CS1_CONF 0x1C #define LCD_LIDD_CS1_ADDR 0x20 #define LCD_LIDD_CS1_DATA 0x24 #define LCD_RASTER_CTRL 0x28 #define RASTER_CTRL_TFT24_UNPACKED (1 << 26) #define RASTER_CTRL_TFT24 (1 << 25) #define RASTER_CTRL_STN565 (1 << 24) #define RASTER_CTRL_TFTPMAP (1 << 23) #define RASTER_CTRL_NIBMODE (1 << 22) #define RASTER_CTRL_PALMODE_SHIFT 20 #define PALETTE_PALETTE_AND_DATA 0x00 #define PALETTE_PALETTE_ONLY 0x01 #define PALETTE_DATA_ONLY 0x02 #define RASTER_CTRL_REQDLY_SHIFT 12 #define RASTER_CTRL_MONO8B (1 << 9) #define RASTER_CTRL_RBORDER (1 << 8) #define RASTER_CTRL_LCDTFT (1 << 7) #define RASTER_CTRL_LCDBW (1 << 1) #define RASTER_CTRL_LCDEN (1 << 0) #define LCD_RASTER_TIMING_0 0x2C #define RASTER_TIMING_0_HBP_SHIFT 24 #define RASTER_TIMING_0_HFP_SHIFT 16 #define RASTER_TIMING_0_HSW_SHIFT 10 #define RASTER_TIMING_0_PPLLSB_SHIFT 4 #define RASTER_TIMING_0_PPLMSB_SHIFT 3 #define LCD_RASTER_TIMING_1 0x30 #define RASTER_TIMING_1_VBP_SHIFT 24 #define RASTER_TIMING_1_VFP_SHIFT 16 #define RASTER_TIMING_1_VSW_SHIFT 10 #define RASTER_TIMING_1_LPP_SHIFT 0 #define LCD_RASTER_TIMING_2 0x34 #define RASTER_TIMING_2_HSWHI_SHIFT 27 #define RASTER_TIMING_2_LPP_B10_SHIFT 26 #define RASTER_TIMING_2_PHSVS (1 << 25) #define RASTER_TIMING_2_PHSVS_RISE (1 << 24) #define RASTER_TIMING_2_PHSVS_FALL (0 << 24) #define RASTER_TIMING_2_IOE (1 << 23) #define RASTER_TIMING_2_IPC (1 << 22) #define RASTER_TIMING_2_IHS (1 << 21) #define RASTER_TIMING_2_IVS (1 << 20) #define RASTER_TIMING_2_ACBI_SHIFT 16 #define RASTER_TIMING_2_ACB_SHIFT 8 #define RASTER_TIMING_2_HBPHI_SHIFT 4 #define RASTER_TIMING_2_HFPHI_SHIFT 0 #define LCD_RASTER_SUBPANEL 0x38 #define LCD_RASTER_SUBPANEL2 0x3C #define LCD_LCDDMA_CTRL 0x40 #define LCDDMA_CTRL_DMA_MASTER_PRIO_SHIFT 16 #define LCDDMA_CTRL_TH_FIFO_RDY_SHIFT 8 #define LCDDMA_CTRL_BURST_SIZE_SHIFT 4 #define LCDDMA_CTRL_BYTES_SWAP (1 << 3) #define LCDDMA_CTRL_BE (1 << 1) #define LCDDMA_CTRL_FB0_ONLY 0 #define LCDDMA_CTRL_FB0_FB1 (1 << 0) #define LCD_LCDDMA_FB0_BASE 0x44 #define LCD_LCDDMA_FB0_CEILING 0x48 #define LCD_LCDDMA_FB1_BASE 0x4C #define LCD_LCDDMA_FB1_CEILING 0x50 #define LCD_SYSCONFIG 0x54 #define SYSCONFIG_STANDBY_FORCE (0 << 4) #define SYSCONFIG_STANDBY_NONE (1 << 4) #define SYSCONFIG_STANDBY_SMART (2 << 4) #define SYSCONFIG_IDLE_FORCE (0 << 2) #define SYSCONFIG_IDLE_NONE (1 << 2) #define SYSCONFIG_IDLE_SMART (2 << 2) #define LCD_IRQSTATUS_RAW 0x58 #define LCD_IRQSTATUS 0x5C #define LCD_IRQENABLE_SET 0x60 #define LCD_IRQENABLE_CLEAR 0x64 #define IRQ_EOF1 (1 << 9) #define IRQ_EOF0 (1 << 8) #define IRQ_PL (1 << 6) #define IRQ_FUF (1 << 5) #define IRQ_ACB (1 << 3) #define IRQ_SYNC_LOST (1 << 2) #define IRQ_RASTER_DONE (1 << 1) #define IRQ_FRAME_DONE (1 << 0) #define LCD_END_OF_INT_IND 0x68 #define LCD_CLKC_ENABLE 0x6C #define CLKC_ENABLE_DMA (1 << 2) 
#define CLKC_ENABLE_LDID (1 << 1) #define CLKC_ENABLE_CORE (1 << 0) #define LCD_CLKC_RESET 0x70 #define CLKC_RESET_MAIN (1 << 3) #define CLKC_RESET_DMA (1 << 2) #define CLKC_RESET_LDID (1 << 1) #define CLKC_RESET_CORE (1 << 0) #define LCD_LOCK(_sc) mtx_lock(&(_sc)->sc_mtx) #define LCD_UNLOCK(_sc) mtx_unlock(&(_sc)->sc_mtx) #define LCD_LOCK_INIT(_sc) mtx_init(&(_sc)->sc_mtx, \ device_get_nameunit(_sc->sc_dev), "am335x_lcd", MTX_DEF) #define LCD_LOCK_DESTROY(_sc) mtx_destroy(&(_sc)->sc_mtx); #define LCD_READ4(_sc, reg) bus_read_4((_sc)->sc_mem_res, reg); #define LCD_WRITE4(_sc, reg, value) \ bus_write_4((_sc)->sc_mem_res, reg, value); /* Backlight is controlled by eCAS interface on PWM unit 0 */ #define PWM_UNIT 0 #define PWM_PERIOD 100 struct am335x_lcd_softc { device_t sc_dev; + struct fb_info sc_fb_info; struct resource *sc_mem_res; struct resource *sc_irq_res; void *sc_intr_hl; struct mtx sc_mtx; int sc_backlight; struct sysctl_oid *sc_oid; /* Framebuffer */ bus_dma_tag_t sc_dma_tag; bus_dmamap_t sc_dma_map; size_t sc_fb_size; bus_addr_t sc_fb_phys; uint8_t *sc_fb_base; }; static void am335x_fb_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int err) { bus_addr_t *addr; if (err) return; addr = (bus_addr_t*)arg; *addr = segs[0].ds_addr; } static uint32_t am335x_lcd_calc_divisor(uint32_t reference, uint32_t freq) { uint32_t div; /* Raster mode case: divisors are in range from 2 to 255 */ for (div = 2; div < 255; div++) if (reference/div <= freq) return (div); return (255); } static int am335x_lcd_sysctl_backlight(SYSCTL_HANDLER_ARGS) { struct am335x_lcd_softc *sc = (struct am335x_lcd_softc*)arg1; int error; int backlight; backlight = sc->sc_backlight; error = sysctl_handle_int(oidp, &backlight, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (backlight < 0) backlight = 0; if (backlight > 100) backlight = 100; LCD_LOCK(sc); error = am335x_pwm_config_ecas(PWM_UNIT, PWM_PERIOD, backlight*PWM_PERIOD/100); if (error == 0) sc->sc_backlight = backlight; LCD_UNLOCK(sc); return (error); } static int am335x_read_panel_property(device_t dev, const char *name, uint32_t *val) { phandle_t node; pcell_t cell; node = ofw_bus_get_node(dev); if ((OF_getprop(node, name, &cell, sizeof(cell))) <= 0) { device_printf(dev, "missing '%s' attribute in LCD panel info\n", name); return (ENXIO); } *val = fdt32_to_cpu(cell); return (0); } static int am335x_read_panel_info(device_t dev, struct panel_info *panel) { int error; error = 0; if ((error = am335x_read_panel_property(dev, "panel_width", &panel->panel_width))) goto out; if ((error = am335x_read_panel_property(dev, "panel_height", &panel->panel_height))) goto out; if ((error = am335x_read_panel_property(dev, "panel_hfp", &panel->panel_hfp))) goto out; if ((error = am335x_read_panel_property(dev, "panel_hbp", &panel->panel_hbp))) goto out; if ((error = am335x_read_panel_property(dev, "panel_hsw", &panel->panel_hsw))) goto out; if ((error = am335x_read_panel_property(dev, "panel_vfp", &panel->panel_vfp))) goto out; if ((error = am335x_read_panel_property(dev, "panel_vbp", &panel->panel_vbp))) goto out; if ((error = am335x_read_panel_property(dev, "panel_vsw", &panel->panel_vsw))) goto out; if ((error = am335x_read_panel_property(dev, "panel_pxl_clk", &panel->panel_pxl_clk))) goto out; if ((error = am335x_read_panel_property(dev, "panel_invert_pxl_clk", &panel->panel_invert_pxl_clk))) goto out; if ((error = am335x_read_panel_property(dev, "ac_bias", &panel->ac_bias))) goto out; if ((error = am335x_read_panel_property(dev, "ac_bias_intrpt", 
&panel->ac_bias_intrpt))) goto out; if ((error = am335x_read_panel_property(dev, "dma_burst_sz", &panel->dma_burst_sz))) goto out; if ((error = am335x_read_panel_property(dev, "bpp", &panel->bpp))) goto out; if ((error = am335x_read_panel_property(dev, "fdd", &panel->fdd))) goto out; if ((error = am335x_read_panel_property(dev, "invert_line_clock", &panel->invert_line_clock))) goto out; if ((error = am335x_read_panel_property(dev, "invert_frm_clock", &panel->invert_frm_clock))) goto out; if ((error = am335x_read_panel_property(dev, "sync_edge", &panel->sync_edge))) goto out; error = am335x_read_panel_property(dev, "sync_ctrl", &panel->sync_ctrl); out: return (error); } static void am335x_lcd_intr(void *arg) { struct am335x_lcd_softc *sc = arg; uint32_t reg; reg = LCD_READ4(sc, LCD_IRQSTATUS); LCD_WRITE4(sc, LCD_IRQSTATUS, reg); /* Read value back to make sure it reached the hardware */ reg = LCD_READ4(sc, LCD_IRQSTATUS); if (reg & IRQ_SYNC_LOST) { reg = LCD_READ4(sc, LCD_RASTER_CTRL); reg &= ~RASTER_CTRL_LCDEN; LCD_WRITE4(sc, LCD_RASTER_CTRL, reg); reg = LCD_READ4(sc, LCD_RASTER_CTRL); reg |= RASTER_CTRL_LCDEN; LCD_WRITE4(sc, LCD_RASTER_CTRL, reg); goto done; } if (reg & IRQ_PL) { reg = LCD_READ4(sc, LCD_RASTER_CTRL); reg &= ~RASTER_CTRL_LCDEN; LCD_WRITE4(sc, LCD_RASTER_CTRL, reg); reg = LCD_READ4(sc, LCD_RASTER_CTRL); reg |= RASTER_CTRL_LCDEN; LCD_WRITE4(sc, LCD_RASTER_CTRL, reg); goto done; } if (reg & IRQ_EOF0) { LCD_WRITE4(sc, LCD_LCDDMA_FB0_BASE, sc->sc_fb_phys); LCD_WRITE4(sc, LCD_LCDDMA_FB0_CEILING, sc->sc_fb_phys + sc->sc_fb_size - 1); reg &= ~IRQ_EOF0; } if (reg & IRQ_EOF1) { LCD_WRITE4(sc, LCD_LCDDMA_FB1_BASE, sc->sc_fb_phys); LCD_WRITE4(sc, LCD_LCDDMA_FB1_CEILING, sc->sc_fb_phys + sc->sc_fb_size - 1); reg &= ~IRQ_EOF1; } if (reg & IRQ_FUF) { /* TODO: Handle FUF */ } if (reg & IRQ_ACB) { /* TODO: Handle ACB */ } done: LCD_WRITE4(sc, LCD_END_OF_INT_IND, 0); /* Read value back to make sure it reached the hardware */ reg = LCD_READ4(sc, LCD_END_OF_INT_IND); } static int am335x_lcd_probe(device_t dev) { +#ifdef DEV_SC int err; +#endif if (!ofw_bus_status_okay(dev)) return (ENXIO); if (!ofw_bus_is_compatible(dev, "ti,am335x-lcd")) return (ENXIO); device_set_desc(dev, "AM335x LCD controller"); +#ifdef DEV_SC err = sc_probe_unit(device_get_unit(dev), device_get_flags(dev) | SC_AUTODETECT_KBD); if (err != 0) return (err); +#endif return (BUS_PROBE_DEFAULT); } static int am335x_lcd_attach(device_t dev) { struct am335x_lcd_softc *sc; int rid; int div; struct panel_info panel; uint32_t reg, timing0, timing1, timing2; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree; uint32_t burst_log; int err; size_t dma_size; uint32_t hbp, hfp, hsw; uint32_t vbp, vfp, vsw; uint32_t width, height; sc = device_get_softc(dev); sc->sc_dev = dev; if (am335x_read_panel_info(dev, &panel)) return (ENXIO); int ref_freq = 0; ti_prcm_clk_enable(LCDC_CLK); if (ti_prcm_clk_get_source_freq(LCDC_CLK, &ref_freq)) { device_printf(dev, "Can't get reference frequency\n"); return (ENXIO); } rid = 0; sc->sc_mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (!sc->sc_mem_res) { device_printf(dev, "cannot allocate memory window\n"); return (ENXIO); } rid = 0; sc->sc_irq_res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_ACTIVE); if (!sc->sc_irq_res) { bus_release_resource(dev, SYS_RES_MEMORY, 0, sc->sc_mem_res); device_printf(dev, "cannot allocate interrupt\n"); return (ENXIO); } if (bus_setup_intr(dev, sc->sc_irq_res, INTR_TYPE_MISC | INTR_MPSAFE, NULL, am335x_lcd_intr, sc, &sc->sc_intr_hl) != 
0) { bus_release_resource(dev, SYS_RES_IRQ, rid, sc->sc_irq_res); bus_release_resource(dev, SYS_RES_MEMORY, rid, sc->sc_mem_res); device_printf(dev, "Unable to setup the irq handler.\n"); return (ENXIO); } LCD_LOCK_INIT(sc); /* Panle initialization */ dma_size = round_page(panel.panel_width*panel.panel_height*panel.bpp/8); /* * Now allocate framebuffer memory */ err = bus_dma_tag_create( bus_get_dma_tag(dev), 4, 0, /* alignment, boundary */ BUS_SPACE_MAXADDR_32BIT, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ dma_size, 1, /* maxsize, nsegments */ dma_size, 0, /* maxsegsize, flags */ NULL, NULL, /* lockfunc, lockarg */ &sc->sc_dma_tag); if (err) goto fail; err = bus_dmamem_alloc(sc->sc_dma_tag, (void **)&sc->sc_fb_base, BUS_DMA_COHERENT, &sc->sc_dma_map); if (err) { device_printf(dev, "cannot allocate framebuffer\n"); goto fail; } err = bus_dmamap_load(sc->sc_dma_tag, sc->sc_dma_map, sc->sc_fb_base, dma_size, am335x_fb_dmamap_cb, &sc->sc_fb_phys, BUS_DMA_NOWAIT); if (err) { device_printf(dev, "cannot load DMA map\n"); goto fail; } /* Make sure it's blank */ memset(sc->sc_fb_base, 0x00, dma_size); /* Calculate actual FB Size */ sc->sc_fb_size = panel.panel_width*panel.panel_height*panel.bpp/8; /* Only raster mode is supported */ reg = CTRL_RASTER_MODE; div = am335x_lcd_calc_divisor(ref_freq, panel.panel_pxl_clk); reg |= (div << CTRL_DIV_SHIFT); LCD_WRITE4(sc, LCD_CTRL, reg); /* Set timing */ timing0 = timing1 = timing2 = 0; hbp = panel.panel_hbp - 1; hfp = panel.panel_hfp - 1; hsw = panel.panel_hsw - 1; vbp = panel.panel_vbp; vfp = panel.panel_vfp; vsw = panel.panel_vsw - 1; height = panel.panel_height - 1; width = panel.panel_width - 1; /* Horizontal back porch */ timing0 |= (hbp & 0xff) << RASTER_TIMING_0_HBP_SHIFT; timing2 |= ((hbp >> 8) & 3) << RASTER_TIMING_2_HBPHI_SHIFT; /* Horizontal front porch */ timing0 |= (hfp & 0xff) << RASTER_TIMING_0_HFP_SHIFT; timing2 |= ((hfp >> 8) & 3) << RASTER_TIMING_2_HFPHI_SHIFT; /* Horizontal sync width */ timing0 |= (hsw & 0x3f) << RASTER_TIMING_0_HSW_SHIFT; timing2 |= ((hsw >> 6) & 0xf) << RASTER_TIMING_2_HSWHI_SHIFT; /* Vertical back porch, front porch, sync width */ timing1 |= (vbp & 0xff) << RASTER_TIMING_1_VBP_SHIFT; timing1 |= (vfp & 0xff) << RASTER_TIMING_1_VFP_SHIFT; timing1 |= (vsw & 0x3f) << RASTER_TIMING_1_VSW_SHIFT; /* Pixels per line */ timing0 |= ((width >> 10) & 1) << RASTER_TIMING_0_PPLMSB_SHIFT; timing0 |= ((width >> 4) & 0x3f) << RASTER_TIMING_0_PPLLSB_SHIFT; /* Lines per panel */ timing1 |= (height & 0x3ff) << RASTER_TIMING_1_LPP_SHIFT; timing2 |= ((height >> 10 ) & 1) << RASTER_TIMING_2_LPP_B10_SHIFT; /* clock signal settings */ if (panel.sync_ctrl) timing2 |= RASTER_TIMING_2_PHSVS; if (panel.sync_edge) timing2 |= RASTER_TIMING_2_PHSVS_RISE; else timing2 |= RASTER_TIMING_2_PHSVS_FALL; if (panel.invert_line_clock) timing2 |= RASTER_TIMING_2_IHS; if (panel.invert_frm_clock) timing2 |= RASTER_TIMING_2_IVS; if (panel.panel_invert_pxl_clk) timing2 |= RASTER_TIMING_2_IPC; /* AC bias */ timing2 |= (panel.ac_bias << RASTER_TIMING_2_ACB_SHIFT); timing2 |= (panel.ac_bias_intrpt << RASTER_TIMING_2_ACBI_SHIFT); LCD_WRITE4(sc, LCD_RASTER_TIMING_0, timing0); LCD_WRITE4(sc, LCD_RASTER_TIMING_1, timing1); LCD_WRITE4(sc, LCD_RASTER_TIMING_2, timing2); /* DMA settings */ reg = LCDDMA_CTRL_FB0_FB1; /* Find power of 2 for current burst size */ switch (panel.dma_burst_sz) { case 1: burst_log = 0; break; case 2: burst_log = 1; break; case 4: burst_log = 2; break; case 8: burst_log = 3; break; case 16: default: 
burst_log = 4; break; } reg |= (burst_log << LCDDMA_CTRL_BURST_SIZE_SHIFT); /* XXX: FIFO TH */ reg |= (0 << LCDDMA_CTRL_TH_FIFO_RDY_SHIFT); LCD_WRITE4(sc, LCD_LCDDMA_CTRL, reg); LCD_WRITE4(sc, LCD_LCDDMA_FB0_BASE, sc->sc_fb_phys); LCD_WRITE4(sc, LCD_LCDDMA_FB0_CEILING, sc->sc_fb_phys + sc->sc_fb_size - 1); LCD_WRITE4(sc, LCD_LCDDMA_FB1_BASE, sc->sc_fb_phys); LCD_WRITE4(sc, LCD_LCDDMA_FB1_CEILING, sc->sc_fb_phys + sc->sc_fb_size - 1); /* Enable LCD */ reg = RASTER_CTRL_LCDTFT; reg |= (panel.fdd << RASTER_CTRL_REQDLY_SHIFT); reg |= (PALETTE_DATA_ONLY << RASTER_CTRL_PALMODE_SHIFT); if (panel.bpp >= 24) reg |= RASTER_CTRL_TFT24; if (panel.bpp == 32) reg |= RASTER_CTRL_TFT24_UNPACKED; LCD_WRITE4(sc, LCD_RASTER_CTRL, reg); LCD_WRITE4(sc, LCD_CLKC_ENABLE, CLKC_ENABLE_DMA | CLKC_ENABLE_LDID | CLKC_ENABLE_CORE); LCD_WRITE4(sc, LCD_CLKC_RESET, CLKC_RESET_MAIN); DELAY(100); LCD_WRITE4(sc, LCD_CLKC_RESET, 0); reg = IRQ_EOF1 | IRQ_EOF0 | IRQ_FUF | IRQ_PL | IRQ_ACB | IRQ_SYNC_LOST | IRQ_RASTER_DONE | IRQ_FRAME_DONE; LCD_WRITE4(sc, LCD_IRQENABLE_SET, reg); reg = LCD_READ4(sc, LCD_RASTER_CTRL); reg |= RASTER_CTRL_LCDEN; LCD_WRITE4(sc, LCD_RASTER_CTRL, reg); LCD_WRITE4(sc, LCD_SYSCONFIG, SYSCONFIG_STANDBY_SMART | SYSCONFIG_IDLE_SMART); /* Init backlight interface */ ctx = device_get_sysctl_ctx(sc->sc_dev); tree = device_get_sysctl_tree(sc->sc_dev); sc->sc_oid = SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "backlight", CTLTYPE_INT | CTLFLAG_RW, sc, 0, am335x_lcd_sysctl_backlight, "I", "LCD backlight"); sc->sc_backlight = 0; /* Check if eCAS interface is available at this point */ if (am335x_pwm_config_ecas(PWM_UNIT, PWM_PERIOD, PWM_PERIOD) == 0) sc->sc_backlight = 100; + sc->sc_fb_info.fb_name = device_get_nameunit(sc->sc_dev); + sc->sc_fb_info.fb_vbase = (intptr_t)sc->sc_fb_base; + sc->sc_fb_info.fb_pbase = sc->sc_fb_phys; + sc->sc_fb_info.fb_size = sc->sc_fb_size; + sc->sc_fb_info.fb_bpp = sc->sc_fb_info.fb_depth = panel.bpp; + sc->sc_fb_info.fb_stride = panel.panel_width*panel.bpp / 8; + sc->sc_fb_info.fb_width = panel.panel_width; + sc->sc_fb_info.fb_height = panel.panel_height; + +#ifdef DEV_SC err = (sc_attach_unit(device_get_unit(dev), device_get_flags(dev) | SC_AUTODETECT_KBD)); if (err) { device_printf(dev, "failed to attach syscons\n"); goto fail; } am335x_lcd_syscons_setup((vm_offset_t)sc->sc_fb_base, sc->sc_fb_phys, &panel); +#else /* VT */ + device_t fbd = device_add_child(dev, "fbd", + device_get_unit(dev)); + if (fbd == NULL) { + device_printf(dev, "Failed to add fbd child\n"); + goto fail; + } + if (device_probe_and_attach(fbd) != 0) { + device_printf(dev, "Failed to attach fbd device\n"); + goto fail; + } +#endif return (0); fail: return (err); } static int am335x_lcd_detach(device_t dev) { /* Do not let unload driver */ return (EBUSY); } +static struct fb_info * +am335x_lcd_fb_getinfo(device_t dev) +{ + struct am335x_lcd_softc *sc; + + sc = device_get_softc(dev); + + return (&sc->sc_fb_info); +} + static device_method_t am335x_lcd_methods[] = { DEVMETHOD(device_probe, am335x_lcd_probe), DEVMETHOD(device_attach, am335x_lcd_attach), DEVMETHOD(device_detach, am335x_lcd_detach), + /* Framebuffer service methods */ + DEVMETHOD(fb_getinfo, am335x_lcd_fb_getinfo), + DEVMETHOD_END }; static driver_t am335x_lcd_driver = { - "am335x_lcd", + "fb", am335x_lcd_methods, sizeof(struct am335x_lcd_softc), }; static devclass_t am335x_lcd_devclass; DRIVER_MODULE(am335x_lcd, simplebus, am335x_lcd_driver, am335x_lcd_devclass, 0, 0); MODULE_VERSION(am335x_lcd, 1); MODULE_DEPEND(am335x_lcd, simplebus, 
1, 1, 1); Index: projects/building-blocks/sys/arm/ti/am335x/files.am335x =================================================================== --- projects/building-blocks/sys/arm/ti/am335x/files.am335x (revision 277723) +++ projects/building-blocks/sys/arm/ti/am335x/files.am335x (revision 277724) @@ -1,17 +1,17 @@ #$FreeBSD$ arm/ti/aintc.c standard arm/ti/am335x/am335x_dmtimer.c standard arm/ti/am335x/am335x_gpio.c optional gpio -arm/ti/am335x/am335x_lcd.c optional sc +arm/ti/am335x/am335x_lcd.c optional sc | vt arm/ti/am335x/am335x_lcd_syscons.c optional sc arm/ti/am335x/am335x_pmic.c optional am335x_pmic arm/ti/am335x/am335x_prcm.c standard arm/ti/am335x/am335x_pwm.c standard arm/ti/am335x/am335x_rtc.c optional am335x_rtc arm/ti/am335x/am335x_scm_padconf.c standard arm/ti/am335x/am335x_usbss.c optional musb fdt arm/ti/ti_edma3.c standard arm/ti/cpsw/if_cpsw.c optional cpsw Index: projects/building-blocks/sys/arm/ti/ti_i2c.c =================================================================== --- projects/building-blocks/sys/arm/ti/ti_i2c.c (revision 277723) +++ projects/building-blocks/sys/arm/ti/ti_i2c.c (revision 277724) @@ -1,945 +1,979 @@ /*- * Copyright (c) 2011 Ben Gray . * Copyright (c) 2014 Luiz Otavio O Souza . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /** * Driver for the I2C module on the TI SoC. * * This driver is heavily based on the TWI driver for the AT91 (at91_twi.c). * * CAUTION: The I2Ci registers are limited to 16 bit and 8 bit data accesses, * 32 bit data access is not allowed and can corrupt register content. * * This driver currently doesn't use DMA for the transfer, although I hope to * incorporate that sometime in the future. The idea being that for transaction * larger than a certain size the DMA engine is used, for anything less the * normal interrupt/fifo driven option is used. * * * WARNING: This driver uses mtx_sleep and interrupts to perform transactions, * which means you can't do a transaction during startup before the interrupts * have been enabled. Hint - the freebsd function config_intrhook_establish(). 
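 *
 * As a minimal sketch of that hint (all names below are hypothetical and
 * assume the client's softc embeds a struct intr_config_hook), a consumer
 * of this bus would defer its first transfer until interrupts are live:
 *
 *	static void
 *	foo_delayed_start(void *arg)
 *	{
 *		struct foo_softc *sc = arg;
 *
 *		foo_first_i2c_transfer(sc);
 *		config_intrhook_disestablish(&sc->sc_init_hook);
 *	}
 *
 *	sc->sc_init_hook.ich_func = foo_delayed_start;
 *	sc->sc_init_hook.ich_arg = sc;
 *	if (config_intrhook_establish(&sc->sc_init_hook) != 0)
 *		return (ENOMEM);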
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "iicbus_if.h" /** * I2C device driver context, a pointer to this is stored in the device * driver structure. */ struct ti_i2c_softc { device_t sc_dev; uint32_t device_id; struct resource* sc_irq_res; struct resource* sc_mem_res; device_t sc_iicbus; void* sc_irq_h; struct mtx sc_mtx; struct iic_msg* sc_buffer; int sc_bus_inuse; int sc_buffer_pos; int sc_error; int sc_fifo_trsh; + int sc_timeout; uint16_t sc_con_reg; uint16_t sc_rev; }; struct ti_i2c_clock_config { u_int frequency; /* Bus frequency in Hz */ uint8_t psc; /* Fast/Standard mode prescale divider */ uint8_t scll; /* Fast/Standard mode SCL low time */ uint8_t sclh; /* Fast/Standard mode SCL high time */ uint8_t hsscll; /* High Speed mode SCL low time */ uint8_t hssclh; /* High Speed mode SCL high time */ }; #if defined(SOC_OMAP4) /* * OMAP4 i2c bus clock is 96MHz / ((psc + 1) * (scll + 7 + sclh + 5)). * The prescaler values for 100KHz and 400KHz modes come from the table in the * OMAP4 TRM. The table doesn't list 1MHz; these values should give that speed. */ static struct ti_i2c_clock_config ti_omap4_i2c_clock_configs[] = { { 100000, 23, 13, 15, 0, 0}, { 400000, 9, 5, 7, 0, 0}, { 1000000, 3, 5, 7, 0, 0}, /* { 3200000, 1, 113, 115, 7, 10}, - HS mode */ { 0 /* Table terminator */ } }; #endif #if defined(SOC_TI_AM335X) /* * AM335x i2c bus clock is 48MHZ / ((psc + 1) * (scll + 7 + sclh + 5)) * In all cases we prescale the clock to 24MHz as recommended in the manual. */ static struct ti_i2c_clock_config ti_am335x_i2c_clock_configs[] = { { 100000, 1, 111, 117, 0, 0}, { 400000, 1, 23, 25, 0, 0}, { 1000000, 1, 5, 7, 0, 0}, { 0 /* Table terminator */ } }; #endif /** * Locking macros used throughout the driver */ #define TI_I2C_LOCK(_sc) mtx_lock(&(_sc)->sc_mtx) #define TI_I2C_UNLOCK(_sc) mtx_unlock(&(_sc)->sc_mtx) #define TI_I2C_LOCK_INIT(_sc) \ mtx_init(&_sc->sc_mtx, device_get_nameunit(_sc->sc_dev), \ "ti_i2c", MTX_DEF) #define TI_I2C_LOCK_DESTROY(_sc) mtx_destroy(&_sc->sc_mtx) #define TI_I2C_ASSERT_LOCKED(_sc) mtx_assert(&_sc->sc_mtx, MA_OWNED) #define TI_I2C_ASSERT_UNLOCKED(_sc) mtx_assert(&_sc->sc_mtx, MA_NOTOWNED) #ifdef DEBUG #define ti_i2c_dbg(_sc, fmt, args...) \ device_printf((_sc)->sc_dev, fmt, ##args) #else #define ti_i2c_dbg(_sc, fmt, args...) #endif /** * ti_i2c_read_2 - reads a 16-bit value from one of the I2C registers * @sc: I2C device context * @off: the byte offset within the register bank to read from. * * * LOCKING: * No locking required * * RETURNS: * 16-bit value read from the register. */ static inline uint16_t ti_i2c_read_2(struct ti_i2c_softc *sc, bus_size_t off) { return (bus_read_2(sc->sc_mem_res, off)); } /** * ti_i2c_write_2 - writes a 16-bit value to one of the I2C registers * @sc: I2C device context * @off: the byte offset within the register bank to read from. * @val: the value to write into the register * * LOCKING: * No locking required * * RETURNS: * 16-bit value read from the register. */ static inline void ti_i2c_write_2(struct ti_i2c_softc *sc, bus_size_t off, uint16_t val) { bus_write_2(sc->sc_mem_res, off, val); } static int ti_i2c_transfer_intr(struct ti_i2c_softc* sc, uint16_t status) { int amount, done, i; done = 0; amount = 0; /* Check for the error conditions. */ if (status & I2C_STAT_NACK) { /* No ACK from slave. 
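 * Both this case and the arbitration-lost case just below are surfaced to
 * the iicbus caller as ENXIO.  Purely as a sketch, a client doing a
 * one-byte register read ("parent", "slave_addr", "reg" and "val" are
 * hypothetical names) would see that error from iicbus_transfer():
 *
 *	struct iic_msg msgs[2] = {
 *		{ slave_addr, IIC_M_WR | IIC_M_NOSTOP, 1, &reg },
 *		{ slave_addr, IIC_M_RD, 1, &val },
 *	};
 *	error = iicbus_transfer(parent, msgs, nitems(msgs));
 *	// error == ENXIO when the slave never ACKed
 *
 * where slave_addr is the left-shifted 7-bit address, matching the
 * "slave >> 1" written to I2C_REG_SA in ti_i2c_transfer().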
*/ ti_i2c_dbg(sc, "NACK\n"); ti_i2c_write_2(sc, I2C_REG_STATUS, I2C_STAT_NACK); sc->sc_error = ENXIO; } else if (status & I2C_STAT_AL) { /* Arbitration lost. */ ti_i2c_dbg(sc, "Arbitration lost\n"); ti_i2c_write_2(sc, I2C_REG_STATUS, I2C_STAT_AL); sc->sc_error = ENXIO; } /* Check if we have finished. */ if (status & I2C_STAT_ARDY) { /* Register access ready - transaction complete basically. */ ti_i2c_dbg(sc, "ARDY transaction complete\n"); if (sc->sc_error != 0 && sc->sc_buffer->flags & IIC_M_NOSTOP) { ti_i2c_write_2(sc, I2C_REG_CON, sc->sc_con_reg | I2C_CON_STP); } ti_i2c_write_2(sc, I2C_REG_STATUS, I2C_STAT_ARDY | I2C_STAT_RDR | I2C_STAT_RRDY | I2C_STAT_XDR | I2C_STAT_XRDY); return (1); } if (sc->sc_buffer->flags & IIC_M_RD) { /* Read some data. */ if (status & I2C_STAT_RDR) { /* * Receive draining interrupt - last data received. * The set FIFO threshold wont be reached to trigger * RRDY. */ ti_i2c_dbg(sc, "Receive draining interrupt\n"); /* * Drain the FIFO. Read the pending data in the FIFO. */ amount = sc->sc_buffer->len - sc->sc_buffer_pos; } else if (status & I2C_STAT_RRDY) { /* * Receive data ready interrupt - FIFO has reached the * set threshold. */ ti_i2c_dbg(sc, "Receive data ready interrupt\n"); amount = min(sc->sc_fifo_trsh, sc->sc_buffer->len - sc->sc_buffer_pos); } /* Read the bytes from the fifo. */ for (i = 0; i < amount; i++) sc->sc_buffer->buf[sc->sc_buffer_pos++] = (uint8_t)(ti_i2c_read_2(sc, I2C_REG_DATA) & 0xff); if (status & I2C_STAT_RDR) ti_i2c_write_2(sc, I2C_REG_STATUS, I2C_STAT_RDR); if (status & I2C_STAT_RRDY) ti_i2c_write_2(sc, I2C_REG_STATUS, I2C_STAT_RRDY); } else { /* Write some data. */ if (status & I2C_STAT_XDR) { /* * Transmit draining interrupt - FIFO level is below * the set threshold and the amount of data still to * be transferred wont reach the set FIFO threshold. */ ti_i2c_dbg(sc, "Transmit draining interrupt\n"); /* * Drain the TX data. Write the pending data in the * FIFO. */ amount = sc->sc_buffer->len - sc->sc_buffer_pos; } else if (status & I2C_STAT_XRDY) { /* * Transmit data ready interrupt - the FIFO level * is below the set threshold. */ ti_i2c_dbg(sc, "Transmit data ready interrupt\n"); amount = min(sc->sc_fifo_trsh, sc->sc_buffer->len - sc->sc_buffer_pos); } /* Write the bytes from the fifo. */ for (i = 0; i < amount; i++) ti_i2c_write_2(sc, I2C_REG_DATA, sc->sc_buffer->buf[sc->sc_buffer_pos++]); if (status & I2C_STAT_XDR) ti_i2c_write_2(sc, I2C_REG_STATUS, I2C_STAT_XDR); if (status & I2C_STAT_XRDY) ti_i2c_write_2(sc, I2C_REG_STATUS, I2C_STAT_XRDY); } return (done); } /** * ti_i2c_intr - interrupt handler for the I2C module * @dev: i2c device handle * * * * LOCKING: * Called from timer context * * RETURNS: * EH_HANDLED or EH_NOT_HANDLED */ static void ti_i2c_intr(void *arg) { int done; struct ti_i2c_softc *sc; uint16_t events, status; sc = (struct ti_i2c_softc *)arg; TI_I2C_LOCK(sc); status = ti_i2c_read_2(sc, I2C_REG_STATUS); if (status == 0) { TI_I2C_UNLOCK(sc); return; } /* Save enabled interrupts. */ events = ti_i2c_read_2(sc, I2C_REG_IRQENABLE_SET); /* We only care about enabled interrupts. */ status &= events; done = 0; if (sc->sc_buffer != NULL) done = ti_i2c_transfer_intr(sc, status); else { ti_i2c_dbg(sc, "Transfer interrupt without buffer\n"); sc->sc_error = EINVAL; done = 1; } if (done) /* Wakeup the process that started the transaction. 
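 * The waiter in ti_i2c_transfer() and this wakeup share the softc pointer
 * as the wait channel; as a sketch of the handshake (both sides hold
 * sc_mtx):
 *
 *	// transfer side:
 *	err = mtx_sleep(sc, &sc->sc_mtx, 0, "i2ciowait", sc->sc_timeout);
 *	// err == EWOULDBLOCK if no interrupt fired within sc_timeout ticks
 *
 *	// interrupt side, once the transfer is done or has failed:
 *	wakeup(sc);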
*/ wakeup(sc); TI_I2C_UNLOCK(sc); } /** * ti_i2c_transfer - called to perform the transfer * @dev: i2c device handle * @msgs: the messages to send/receive * @nmsgs: the number of messages in the msgs array * * * LOCKING: * Internally locked * * RETURNS: * 0 on function succeeded * EINVAL if invalid message is passed as an arg */ static int ti_i2c_transfer(device_t dev, struct iic_msg *msgs, uint32_t nmsgs) { int err, i, repstart, timeout; struct ti_i2c_softc *sc; uint16_t reg; sc = device_get_softc(dev); TI_I2C_LOCK(sc); /* If the controller is busy wait until it is available. */ while (sc->sc_bus_inuse == 1) mtx_sleep(sc, &sc->sc_mtx, 0, "i2cbuswait", 0); /* Now we have control over the I2C controller. */ sc->sc_bus_inuse = 1; err = 0; repstart = 0; for (i = 0; i < nmsgs; i++) { sc->sc_buffer = &msgs[i]; sc->sc_buffer_pos = 0; sc->sc_error = 0; /* Zero byte transfers aren't allowed. */ if (sc->sc_buffer == NULL || sc->sc_buffer->buf == NULL || sc->sc_buffer->len == 0) { err = EINVAL; break; } /* Check if the i2c bus is free. */ if (repstart == 0) { /* * On repeated start we send the START condition while * the bus _is_ busy. */ timeout = 0; while (ti_i2c_read_2(sc, I2C_REG_STATUS_RAW) & I2C_STAT_BB) { if (timeout++ > 100) { err = EBUSY; goto out; } DELAY(1000); } timeout = 0; } else repstart = 0; if (sc->sc_buffer->flags & IIC_M_NOSTOP) repstart = 1; /* Set the slave address. */ ti_i2c_write_2(sc, I2C_REG_SA, msgs[i].slave >> 1); /* Write the data length. */ ti_i2c_write_2(sc, I2C_REG_CNT, sc->sc_buffer->len); /* Clear the RX and the TX FIFO. */ reg = ti_i2c_read_2(sc, I2C_REG_BUF); reg |= I2C_BUF_RXFIFO_CLR | I2C_BUF_TXFIFO_CLR; ti_i2c_write_2(sc, I2C_REG_BUF, reg); reg = sc->sc_con_reg | I2C_CON_STT; if (repstart == 0) reg |= I2C_CON_STP; if ((sc->sc_buffer->flags & IIC_M_RD) == 0) reg |= I2C_CON_TRX; ti_i2c_write_2(sc, I2C_REG_CON, reg); /* Wait for an event. */ - err = mtx_sleep(sc, &sc->sc_mtx, 0, "i2ciowait", hz); + err = mtx_sleep(sc, &sc->sc_mtx, 0, "i2ciowait", sc->sc_timeout); if (err == 0) err = sc->sc_error; if (err) break; } out: if (timeout == 0) { while (ti_i2c_read_2(sc, I2C_REG_STATUS_RAW) & I2C_STAT_BB) { if (timeout++ > 100) break; DELAY(1000); } } /* Put the controller in master mode again. */ if ((ti_i2c_read_2(sc, I2C_REG_CON) & I2C_CON_MST) == 0) ti_i2c_write_2(sc, I2C_REG_CON, sc->sc_con_reg); sc->sc_buffer = NULL; sc->sc_bus_inuse = 0; /* Wake up the processes that are waiting for the bus. */ wakeup(sc); TI_I2C_UNLOCK(sc); return (err); } static int ti_i2c_reset(struct ti_i2c_softc *sc, u_char speed) { int timeout; struct ti_i2c_clock_config *clkcfg; u_int busfreq; uint16_t fifo_trsh, reg, scll, sclh; switch (ti_chip()) { #ifdef SOC_OMAP4 case CHIP_OMAP_4: clkcfg = ti_omap4_i2c_clock_configs; break; #endif #ifdef SOC_TI_AM335X case CHIP_AM335X: clkcfg = ti_am335x_i2c_clock_configs; break; #endif default: panic("Unknown Ti SoC, unable to reset the i2c"); } /* * If we haven't attached the bus yet, just init at the default slow * speed. This lets us get the hardware initialized enough to attach * the bus which is where the real speed configuration is handled. After * the bus is attached, get the configured speed from it. Search the * configuration table for the best speed we can do that doesn't exceed * the requested speed. 
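 *
 * As a worked example of the divider formula given with the tables above,
 * the AM335x 100KHz entry {psc = 1, scll = 111, sclh = 117} yields
 * 48MHz / ((1 + 1) * (111 + 7 + 117 + 5)) = 48MHz / 480 = 100KHz, and the
 * 400KHz entry {1, 23, 25} yields 48MHz / (2 * 60) = 400KHz; the loop
 * below simply stops at the last table entry that does not exceed the
 * requested bus frequency.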
*/ if (sc->sc_iicbus == NULL) busfreq = 100000; else busfreq = IICBUS_GET_FREQUENCY(sc->sc_iicbus, speed); for (;;) { if (clkcfg[1].frequency == 0 || clkcfg[1].frequency > busfreq) break; clkcfg++; } /* * 23.1.4.3 - HS I2C Software Reset * From OMAP4 TRM at page 4068. * * 1. Ensure that the module is disabled. */ sc->sc_con_reg = 0; ti_i2c_write_2(sc, I2C_REG_CON, sc->sc_con_reg); /* 2. Issue a softreset to the controller. */ bus_write_2(sc->sc_mem_res, I2C_REG_SYSC, I2C_REG_SYSC_SRST); /* * 3. Enable the module. * The I2Ci.I2C_SYSS[0] RDONE bit is asserted only after the module * is enabled by setting the I2Ci.I2C_CON[15] I2C_EN bit to 1. */ ti_i2c_write_2(sc, I2C_REG_CON, I2C_CON_I2C_EN); /* 4. Wait for the software reset to complete. */ timeout = 0; while ((ti_i2c_read_2(sc, I2C_REG_SYSS) & I2C_SYSS_RDONE) == 0) { if (timeout++ > 100) return (EBUSY); DELAY(100); } /* * Disable the I2C controller once again, now that the reset has * finished. */ ti_i2c_write_2(sc, I2C_REG_CON, sc->sc_con_reg); /* * The following sequence is taken from the OMAP4 TRM at page 4077. * * 1. Enable the functional and interface clocks (see Section * 23.1.5.1.1.1.1). Done at ti_i2c_activate(). * * 2. Program the prescaler to obtain an approximately 12MHz internal * sampling clock (I2Ci_INTERNAL_CLK) by programming the * corresponding value in the I2Ci.I2C_PSC[3:0] PSC field. * This value depends on the frequency of the functional clock * (I2Ci_FCLK). Because this frequency is 96MHz, the * I2Ci.I2C_PSC[7:0] PSC field value is 0x7. */ ti_i2c_write_2(sc, I2C_REG_PSC, clkcfg->psc); /* * 3. Program the I2Ci.I2C_SCLL[7:0] SCLL and I2Ci.I2C_SCLH[7:0] SCLH * bit fields to obtain a bit rate of 100 Kbps, 400 Kbps or 1Mbps. * These values depend on the internal sampling clock frequency * (see Table 23-8). */ scll = clkcfg->scll & I2C_SCLL_MASK; sclh = clkcfg->sclh & I2C_SCLH_MASK; /* * 4. (Optional) Program the I2Ci.I2C_SCLL[15:8] HSSCLL and * I2Ci.I2C_SCLH[15:8] HSSCLH fields to obtain a bit rate of * 400K bps or 3.4M bps (for the second phase of HS mode). These * values depend on the internal sampling clock frequency (see * Table 23-8). * * 5. (Optional) If a bit rate of 3.4M bps is used and the bus line * capacitance exceeds 45 pF, (see Section 18.4.8, PAD Functional * Multiplexing and Configuration). */ switch (ti_chip()) { #ifdef SOC_OMAP4 case CHIP_OMAP_4: if ((clkcfg->hsscll + clkcfg->hssclh) > 0) { scll |= clkcfg->hsscll << I2C_HSSCLL_SHIFT; sclh |= clkcfg->hssclh << I2C_HSSCLH_SHIFT; sc->sc_con_reg |= I2C_CON_OPMODE_HS; } break; #endif } /* Write the selected bit rate. */ ti_i2c_write_2(sc, I2C_REG_SCLL, scll); ti_i2c_write_2(sc, I2C_REG_SCLH, sclh); /* * 6. Configure the Own Address of the I2C controller by storing it in * the I2Ci.I2C_OA0 register. Up to four Own Addresses can be * programmed in the I2Ci.I2C_OAi registers (where i = 0, 1, 2, 3) * for each I2C controller. * * Note: For a 10-bit address, set the corresponding expand Own Address * bit in the I2Ci.I2C_CON register. * * Driver currently always in single master mode so ignore this step. */ /* * 7. Set the TX threshold (in transmitter mode) and the RX threshold * (in receiver mode) by setting the I2Ci.I2C_BUF[5:0]XTRSH field to * (TX threshold - 1) and the I2Ci.I2C_BUF[13:8]RTRSH field to (RX * threshold - 1), where the TX and RX thresholds are greater than * or equal to 1. * * The threshold is set to 5 for now. 
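 *
 * With sc_fifo_trsh = 5 the write below programs 4 into both fields
 * (reg = 0x0404, assuming RTRSH really sits at bits 13:8 as described in
 * step 7), so XRDY fires while the TX FIFO level is below 5 and RRDY
 * fires once at least 5 received bytes are pending.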
*/ fifo_trsh = (sc->sc_fifo_trsh - 1) & I2C_BUF_TRSH_MASK; reg = fifo_trsh | (fifo_trsh << I2C_BUF_RXTRSH_SHIFT); ti_i2c_write_2(sc, I2C_REG_BUF, reg); /* * 8. Take the I2C controller out of reset by setting the * I2Ci.I2C_CON[15] I2C_EN bit to 1. * * 23.1.5.1.1.1.2 - Initialize the I2C Controller * * To initialize the I2C controller, perform the following steps: * * 1. Configure the I2Ci.I2C_CON register: * . For master or slave mode, set the I2Ci.I2C_CON[10] MST bit * (0: slave, 1: master). * . For transmitter or receiver mode, set the I2Ci.I2C_CON[9] TRX * bit (0: receiver, 1: transmitter). */ /* Enable the I2C controller in master mode. */ sc->sc_con_reg |= I2C_CON_I2C_EN | I2C_CON_MST; ti_i2c_write_2(sc, I2C_REG_CON, sc->sc_con_reg); /* * 2. If using an interrupt to transmit/receive data, set the * corresponding bit in the I2Ci.I2C_IE register (the I2Ci.I2C_IE[4] * XRDY_IE bit for the transmit interrupt, the I2Ci.I2C_IE[3] RRDY * bit for the receive interrupt). */ /* Set the interrupts we want to be notified. */ reg = I2C_IE_XDR | /* Transmit draining interrupt. */ I2C_IE_XRDY | /* Transmit Data Ready interrupt. */ I2C_IE_RDR | /* Receive draining interrupt. */ I2C_IE_RRDY | /* Receive Data Ready interrupt. */ I2C_IE_ARDY | /* Register Access Ready interrupt. */ I2C_IE_NACK | /* No Acknowledgment interrupt. */ I2C_IE_AL; /* Arbitration lost interrupt. */ /* Enable the interrupts. */ ti_i2c_write_2(sc, I2C_REG_IRQENABLE_SET, reg); /* * 3. If using DMA to receive/transmit data, set to 1 the corresponding * bit in the I2Ci.I2C_BUF register (the I2Ci.I2C_BUF[15] RDMA_EN * bit for the receive DMA channel, the I2Ci.I2C_BUF[7] XDMA_EN bit * for the transmit DMA channel). * * Not using DMA for now, so ignore this. */ return (0); } static int ti_i2c_iicbus_reset(device_t dev, u_char speed, u_char addr, u_char *oldaddr) { struct ti_i2c_softc *sc; int err; sc = device_get_softc(dev); TI_I2C_LOCK(sc); err = ti_i2c_reset(sc, speed); TI_I2C_UNLOCK(sc); if (err) return (err); return (IIC_ENOADDR); } static int ti_i2c_activate(device_t dev) { clk_ident_t clk; int err; struct ti_i2c_softc *sc; sc = (struct ti_i2c_softc*)device_get_softc(dev); /* * 1. Enable the functional and interface clocks (see Section * 23.1.5.1.1.1.1). */ clk = I2C0_CLK + sc->device_id; err = ti_prcm_clk_enable(clk); if (err) return (err); return (ti_i2c_reset(sc, IIC_UNKNOWN)); } /** * ti_i2c_deactivate - deactivates the controller and releases resources * @dev: i2c device handle * * * * LOCKING: * Assumed called in an atomic context. * * RETURNS: * nothing */ static void ti_i2c_deactivate(device_t dev) { struct ti_i2c_softc *sc = device_get_softc(dev); clk_ident_t clk; /* Disable the controller - cancel all transactions. */ ti_i2c_write_2(sc, I2C_REG_IRQENABLE_CLR, 0xffff); ti_i2c_write_2(sc, I2C_REG_STATUS, 0xffff); ti_i2c_write_2(sc, I2C_REG_CON, 0); /* Release the interrupt handler. */ if (sc->sc_irq_h != NULL) { bus_teardown_intr(dev, sc->sc_irq_res, sc->sc_irq_h); sc->sc_irq_h = NULL; } bus_generic_detach(sc->sc_dev); /* Unmap the I2C controller registers. */ if (sc->sc_mem_res != NULL) { bus_release_resource(dev, SYS_RES_MEMORY, 0, sc->sc_mem_res); sc->sc_mem_res = NULL; } /* Release the IRQ resource. */ if (sc->sc_irq_res != NULL) { bus_release_resource(dev, SYS_RES_IRQ, 0, sc->sc_irq_res); sc->sc_irq_res = NULL; } /* Finally disable the functional and interface clocks. 
*/ clk = I2C0_CLK + sc->device_id; ti_prcm_clk_disable(clk); } static int ti_i2c_sysctl_clk(SYSCTL_HANDLER_ARGS) { - device_t dev; int clk, psc, sclh, scll; struct ti_i2c_softc *sc; - dev = (device_t)arg1; - sc = device_get_softc(dev); + sc = arg1; TI_I2C_LOCK(sc); /* Get the system prescaler value. */ psc = (int)ti_i2c_read_2(sc, I2C_REG_PSC) + 1; /* Get the bitrate. */ scll = (int)ti_i2c_read_2(sc, I2C_REG_SCLL) & I2C_SCLL_MASK; sclh = (int)ti_i2c_read_2(sc, I2C_REG_SCLH) & I2C_SCLH_MASK; clk = I2C_CLK / psc / (scll + 7 + sclh + 5); TI_I2C_UNLOCK(sc); return (sysctl_handle_int(oidp, &clk, 0, req)); } static int +ti_i2c_sysctl_timeout(SYSCTL_HANDLER_ARGS) +{ + struct ti_i2c_softc *sc; + unsigned int val; + int err; + + sc = arg1; + + /* + * MTX_DEF lock can't be held while doing uimove in + * sysctl_handle_int + */ + TI_I2C_LOCK(sc); + val = sc->sc_timeout; + TI_I2C_UNLOCK(sc); + + err = sysctl_handle_int(oidp, &val, 0, req); + /* Write request? */ + if ((err == 0) && (req->newptr != NULL)) { + TI_I2C_LOCK(sc); + sc->sc_timeout = val; + TI_I2C_UNLOCK(sc); + } + + return (err); +} + +static int ti_i2c_probe(device_t dev) { if (!ofw_bus_status_okay(dev)) return (ENXIO); if (!ofw_bus_is_compatible(dev, "ti,i2c")) return (ENXIO); device_set_desc(dev, "TI I2C Controller"); return (0); } static int ti_i2c_attach(device_t dev) { int err, rid; phandle_t node; struct ti_i2c_softc *sc; struct sysctl_ctx_list *ctx; struct sysctl_oid_list *tree; uint16_t fifosz; sc = device_get_softc(dev); sc->sc_dev = dev; /* Get the i2c device id from FDT. */ node = ofw_bus_get_node(dev); if ((OF_getencprop(node, "i2c-device-id", &sc->device_id, sizeof(sc->device_id))) <= 0) { device_printf(dev, "missing i2c-device-id attribute in FDT\n"); return (ENXIO); } /* Get the memory resource for the register mapping. */ rid = 0; sc->sc_mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (sc->sc_mem_res == NULL) { device_printf(dev, "Cannot map registers.\n"); return (ENXIO); } /* Allocate our IRQ resource. */ rid = 0; sc->sc_irq_res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_ACTIVE | RF_SHAREABLE); if (sc->sc_irq_res == NULL) { bus_release_resource(dev, SYS_RES_MEMORY, 0, sc->sc_mem_res); device_printf(dev, "Cannot allocate interrupt.\n"); return (ENXIO); } TI_I2C_LOCK_INIT(sc); /* First of all, we _must_ activate the H/W. */ err = ti_i2c_activate(dev); if (err) { device_printf(dev, "ti_i2c_activate failed\n"); goto out; } /* Read the version number of the I2C module */ sc->sc_rev = ti_i2c_read_2(sc, I2C_REG_REVNB_HI) & 0xff; /* Get the fifo size. */ fifosz = ti_i2c_read_2(sc, I2C_REG_BUFSTAT); fifosz >>= I2C_BUFSTAT_FIFODEPTH_SHIFT; fifosz &= I2C_BUFSTAT_FIFODEPTH_MASK; device_printf(dev, "I2C revision %d.%d FIFO size: %d bytes\n", sc->sc_rev >> 4, sc->sc_rev & 0xf, 8 << fifosz); /* Set the FIFO threshold to 5 for now. */ sc->sc_fifo_trsh = 5; + /* Set I2C bus timeout */ + sc->sc_timeout = 5*hz; + ctx = device_get_sysctl_ctx(dev); tree = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); SYSCTL_ADD_PROC(ctx, tree, OID_AUTO, "i2c_clock", - CTLFLAG_RD | CTLTYPE_UINT | CTLFLAG_MPSAFE, dev, 0, + CTLFLAG_RD | CTLTYPE_UINT | CTLFLAG_MPSAFE, sc, 0, ti_i2c_sysctl_clk, "IU", "I2C bus clock"); + + SYSCTL_ADD_PROC(ctx, tree, OID_AUTO, "i2c_timeout", + CTLFLAG_RW | CTLTYPE_UINT | CTLFLAG_MPSAFE, sc, 0, + ti_i2c_sysctl_timeout, "IU", "I2C bus timeout (in ticks)"); /* Activate the interrupt. 
*/ err = bus_setup_intr(dev, sc->sc_irq_res, INTR_TYPE_MISC | INTR_MPSAFE, NULL, ti_i2c_intr, sc, &sc->sc_irq_h); if (err) goto out; /* Attach the iicbus. */ if ((sc->sc_iicbus = device_add_child(dev, "iicbus", -1)) == NULL) { device_printf(dev, "could not allocate iicbus instance\n"); err = ENXIO; goto out; } /* Probe and attach the iicbus */ bus_generic_attach(dev); out: if (err) { ti_i2c_deactivate(dev); TI_I2C_LOCK_DESTROY(sc); } return (err); } static int ti_i2c_detach(device_t dev) { struct ti_i2c_softc *sc; int rv; sc = device_get_softc(dev); ti_i2c_deactivate(dev); TI_I2C_LOCK_DESTROY(sc); if (sc->sc_iicbus && (rv = device_delete_child(dev, sc->sc_iicbus)) != 0) return (rv); return (0); } static phandle_t ti_i2c_get_node(device_t bus, device_t dev) { /* Share controller node with iibus device. */ return (ofw_bus_get_node(bus)); } static device_method_t ti_i2c_methods[] = { /* Device interface */ DEVMETHOD(device_probe, ti_i2c_probe), DEVMETHOD(device_attach, ti_i2c_attach), DEVMETHOD(device_detach, ti_i2c_detach), /* OFW methods */ DEVMETHOD(ofw_bus_get_node, ti_i2c_get_node), /* iicbus interface */ DEVMETHOD(iicbus_callback, iicbus_null_callback), DEVMETHOD(iicbus_reset, ti_i2c_iicbus_reset), DEVMETHOD(iicbus_transfer, ti_i2c_transfer), DEVMETHOD_END }; static driver_t ti_i2c_driver = { "iichb", ti_i2c_methods, sizeof(struct ti_i2c_softc), }; static devclass_t ti_i2c_devclass; DRIVER_MODULE(ti_iic, simplebus, ti_i2c_driver, ti_i2c_devclass, 0, 0); DRIVER_MODULE(iicbus, ti_iic, iicbus_driver, iicbus_devclass, 0, 0); MODULE_DEPEND(ti_iic, ti_prcm, 1, 1, 1); MODULE_DEPEND(ti_iic, iicbus, 1, 1, 1); Index: projects/building-blocks/sys/dev/mps/mps.c =================================================================== --- projects/building-blocks/sys/dev/mps/mps.c (revision 277723) +++ projects/building-blocks/sys/dev/mps/mps.c (revision 277724) @@ -1,2699 +1,2705 @@ /*- * Copyright (c) 2009 Yahoo! Inc. * Copyright (c) 2012 LSI Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * LSI MPT-Fusion Host Adapter FreeBSD * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); /* Communications core for LSI MPT2 */ /* TODO Move headers to mpsvar */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int mps_diag_reset(struct mps_softc *sc, int sleep_flag); static int mps_init_queues(struct mps_softc *sc); static int mps_message_unit_reset(struct mps_softc *sc, int sleep_flag); static int mps_transition_operational(struct mps_softc *sc); static int mps_iocfacts_allocate(struct mps_softc *sc, uint8_t attaching); static void mps_iocfacts_free(struct mps_softc *sc); static void mps_startup(void *arg); static int mps_send_iocinit(struct mps_softc *sc); static int mps_alloc_queues(struct mps_softc *sc); static int mps_alloc_replies(struct mps_softc *sc); static int mps_alloc_requests(struct mps_softc *sc); static int mps_attach_log(struct mps_softc *sc); static __inline void mps_complete_command(struct mps_softc *sc, struct mps_command *cm); static void mps_dispatch_event(struct mps_softc *sc, uintptr_t data, MPI2_EVENT_NOTIFICATION_REPLY *reply); static void mps_config_complete(struct mps_softc *sc, struct mps_command *cm); static void mps_periodic(void *); static int mps_reregister_events(struct mps_softc *sc); static void mps_enqueue_request(struct mps_softc *sc, struct mps_command *cm); static int mps_get_iocfacts(struct mps_softc *sc, MPI2_IOC_FACTS_REPLY *facts); static int mps_wait_db_ack(struct mps_softc *sc, int timeout, int sleep_flag); SYSCTL_NODE(_hw, OID_AUTO, mps, CTLFLAG_RD, 0, "MPS Driver Parameters"); MALLOC_DEFINE(M_MPT2, "mps", "mpt2 driver memory"); /* * Do a "Diagnostic Reset" aka a hard reset. This should get the chip out of * any state and back to its initialization state machine. */ static char mpt2_reset_magic[] = { 0x00, 0x0f, 0x04, 0x0b, 0x02, 0x07, 0x0d }; /* Added this union to smoothly convert le64toh cm->cm_desc.Words. * Compiler only support unint64_t to be passed as argument. * Otherwise it will through below error * "aggregate value used where an integer was expected" */ typedef union _reply_descriptor { u64 word; struct { u32 low; u32 high; } u; }reply_descriptor,address_descriptor; /* Rate limit chain-fail messages to 1 per minute */ static struct timeval mps_chainfail_interval = { 60, 0 }; /* * sleep_flag can be either CAN_SLEEP or NO_SLEEP. * If this function is called from process context, it can sleep * and there is no harm to sleep, in case if this fuction is called * from Interrupt handler, we can not sleep and need NO_SLEEP flag set. * based on sleep flags driver will call either msleep, pause or DELAY. * msleep and pause are of same variant, but pause is used when mps_mtx * is not hold by driver. * */ static int mps_diag_reset(struct mps_softc *sc,int sleep_flag) { uint32_t reg; int i, error, tries = 0; uint8_t first_wait_done = FALSE; mps_dprint(sc, MPS_TRACE, "%s\n", __func__); /* Clear any pending interrupts */ mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0); /*Force NO_SLEEP for threads prohibited to sleep * e.a Thread from interrupt handler are prohibited to sleep. 
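 *
 * (The sleeping policy is re-checked below via curthread->td_no_sleeping
 * rather than trusting the caller.  The loop that follows pushes the
 * 7-byte mpt2_reset_magic sequence into the WriteSequence register to
 * unlock the Host Diagnostic register, and only proceeds to set the
 * RESET_ADAPTER bit once DIAG_WRITE_ENABLE reads back as set.)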
*/ if (curthread->td_no_sleeping != 0) sleep_flag = NO_SLEEP; /* Push the magic sequence */ error = ETIMEDOUT; while (tries++ < 20) { for (i = 0; i < sizeof(mpt2_reset_magic); i++) mps_regwrite(sc, MPI2_WRITE_SEQUENCE_OFFSET, mpt2_reset_magic[i]); /* wait 100 msec */ if (mtx_owned(&sc->mps_mtx) && sleep_flag == CAN_SLEEP) msleep(&sc->msleep_fake_chan, &sc->mps_mtx, 0, "mpsdiag", hz/10); else if (sleep_flag == CAN_SLEEP) pause("mpsdiag", hz/10); else DELAY(100 * 1000); reg = mps_regread(sc, MPI2_HOST_DIAGNOSTIC_OFFSET); if (reg & MPI2_DIAG_DIAG_WRITE_ENABLE) { error = 0; break; } } if (error) return (error); /* Send the actual reset. XXX need to refresh the reg? */ mps_regwrite(sc, MPI2_HOST_DIAGNOSTIC_OFFSET, reg | MPI2_DIAG_RESET_ADAPTER); /* Wait up to 300 seconds in 50ms intervals */ error = ETIMEDOUT; for (i = 0; i < 6000; i++) { /* * Wait 50 msec. If this is the first time through, wait 256 * msec to satisfy Diag Reset timing requirements. */ if (first_wait_done) { if (mtx_owned(&sc->mps_mtx) && sleep_flag == CAN_SLEEP) msleep(&sc->msleep_fake_chan, &sc->mps_mtx, 0, "mpsdiag", hz/20); else if (sleep_flag == CAN_SLEEP) pause("mpsdiag", hz/20); else DELAY(50 * 1000); } else { DELAY(256 * 1000); first_wait_done = TRUE; } /* * Check for the RESET_ADAPTER bit to be cleared first, then * wait for the RESET state to be cleared, which takes a little * longer. */ reg = mps_regread(sc, MPI2_HOST_DIAGNOSTIC_OFFSET); if (reg & MPI2_DIAG_RESET_ADAPTER) { continue; } reg = mps_regread(sc, MPI2_DOORBELL_OFFSET); if ((reg & MPI2_IOC_STATE_MASK) != MPI2_IOC_STATE_RESET) { error = 0; break; } } if (error) return (error); mps_regwrite(sc, MPI2_WRITE_SEQUENCE_OFFSET, 0x0); return (0); } static int mps_message_unit_reset(struct mps_softc *sc, int sleep_flag) { MPS_FUNCTRACE(sc); mps_regwrite(sc, MPI2_DOORBELL_OFFSET, MPI2_FUNCTION_IOC_MESSAGE_UNIT_RESET << MPI2_DOORBELL_FUNCTION_SHIFT); if (mps_wait_db_ack(sc, 5, sleep_flag) != 0) { mps_dprint(sc, MPS_FAULT, "Doorbell handshake failed : <%s>\n", __func__); return (ETIMEDOUT); } return (0); } static int mps_transition_ready(struct mps_softc *sc) { uint32_t reg, state; int error, tries = 0; int sleep_flags; MPS_FUNCTRACE(sc); /* If we are in attach call, do not sleep */ sleep_flags = (sc->mps_flags & MPS_FLAGS_ATTACH_DONE) ? CAN_SLEEP:NO_SLEEP; error = 0; while (tries++ < 1200) { reg = mps_regread(sc, MPI2_DOORBELL_OFFSET); mps_dprint(sc, MPS_INIT, "Doorbell= 0x%x\n", reg); /* * Ensure the IOC is ready to talk. If it's not, try * resetting it. */ if (reg & MPI2_DOORBELL_USED) { mps_diag_reset(sc, sleep_flags); DELAY(50000); continue; } /* Is the adapter owned by another peer? */ if ((reg & MPI2_DOORBELL_WHO_INIT_MASK) == (MPI2_WHOINIT_PCI_PEER << MPI2_DOORBELL_WHO_INIT_SHIFT)) { device_printf(sc->mps_dev, "IOC is under the control " "of another peer host, aborting initialization.\n"); return (ENXIO); } state = reg & MPI2_IOC_STATE_MASK; if (state == MPI2_IOC_STATE_READY) { /* Ready to go! 
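 * (To summarize the state handling below: READY means we are done; FAULT
 * triggers a diag reset; OPERATIONAL means the IOC is already running, so
 * a message unit reset is sent to take ownership; RESET just waits for
 * the transition to finish; any other state is treated as an error.)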
*/ error = 0; break; } else if (state == MPI2_IOC_STATE_FAULT) { mps_dprint(sc, MPS_FAULT, "IOC in fault state 0x%x, resetting\n", state & MPI2_DOORBELL_FAULT_CODE_MASK); mps_diag_reset(sc, sleep_flags); } else if (state == MPI2_IOC_STATE_OPERATIONAL) { /* Need to take ownership */ mps_message_unit_reset(sc, sleep_flags); } else if (state == MPI2_IOC_STATE_RESET) { /* Wait a bit, IOC might be in transition */ mps_dprint(sc, MPS_FAULT, "IOC in unexpected reset state\n"); } else { mps_dprint(sc, MPS_FAULT, "IOC in unknown state 0x%x\n", state); error = EINVAL; break; } /* Wait 50ms for things to settle down. */ DELAY(50000); } if (error) device_printf(sc->mps_dev, "Cannot transition IOC to ready\n"); return (error); } static int mps_transition_operational(struct mps_softc *sc) { uint32_t reg, state; int error; MPS_FUNCTRACE(sc); error = 0; reg = mps_regread(sc, MPI2_DOORBELL_OFFSET); mps_dprint(sc, MPS_INIT, "Doorbell= 0x%x\n", reg); state = reg & MPI2_IOC_STATE_MASK; if (state != MPI2_IOC_STATE_READY) { if ((error = mps_transition_ready(sc)) != 0) { mps_dprint(sc, MPS_FAULT, "%s failed to transition ready\n", __func__); return (error); } } error = mps_send_iocinit(sc); return (error); } /* * This is called during attach and when re-initializing due to a Diag Reset. * IOC Facts is used to allocate many of the structures needed by the driver. * If called from attach, de-allocation is not required because the driver has * not allocated any structures yet, but if called from a Diag Reset, previously * allocated structures based on IOC Facts will need to be freed and re- * allocated bases on the latest IOC Facts. */ static int mps_iocfacts_allocate(struct mps_softc *sc, uint8_t attaching) { int error; Mpi2IOCFactsReply_t saved_facts; uint8_t saved_mode, reallocating; mps_dprint(sc, MPS_TRACE, "%s\n", __func__); /* Save old IOC Facts and then only reallocate if Facts have changed */ if (!attaching) { bcopy(sc->facts, &saved_facts, sizeof(MPI2_IOC_FACTS_REPLY)); } /* * Get IOC Facts. In all cases throughout this function, panic if doing * a re-initialization and only return the error if attaching so the OS * can handle it. */ if ((error = mps_get_iocfacts(sc, sc->facts)) != 0) { if (attaching) { mps_dprint(sc, MPS_FAULT, "%s failed to get IOC Facts " "with error %d\n", __func__, error); return (error); } else { panic("%s failed to get IOC Facts with error %d\n", __func__, error); } } mps_print_iocfacts(sc, sc->facts); snprintf(sc->fw_version, sizeof(sc->fw_version), "%02d.%02d.%02d.%02d", sc->facts->FWVersion.Struct.Major, sc->facts->FWVersion.Struct.Minor, sc->facts->FWVersion.Struct.Unit, sc->facts->FWVersion.Struct.Dev); mps_printf(sc, "Firmware: %s, Driver: %s\n", sc->fw_version, MPS_DRIVER_VERSION); mps_printf(sc, "IOCCapabilities: %b\n", sc->facts->IOCCapabilities, "\20" "\3ScsiTaskFull" "\4DiagTrace" "\5SnapBuf" "\6ExtBuf" "\7EEDP" "\10BiDirTarg" "\11Multicast" "\14TransRetry" "\15IR" "\16EventReplay" "\17RaidAccel" "\20MSIXIndex" "\21HostDisc"); /* * If the chip doesn't support event replay then a hard reset will be * required to trigger a full discovery. Do the reset here then * retransition to Ready. A hard reset might have already been done, * but it doesn't hurt to do it again. Only do this if attaching, not * for a Diag Reset. 
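 * (The reasoning being that without the EVENT_REPLAY capability the
 * firmware cannot replay discovery events generated before the driver
 * attached, so the hard reset below is what forces a fresh, complete
 * discovery of the topology.)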
*/ if (attaching) { if ((sc->facts->IOCCapabilities & MPI2_IOCFACTS_CAPABILITY_EVENT_REPLAY) == 0) { mps_diag_reset(sc, NO_SLEEP); if ((error = mps_transition_ready(sc)) != 0) { mps_dprint(sc, MPS_FAULT, "%s failed to " "transition to ready with error %d\n", __func__, error); return (error); } } } /* * Set flag if IR Firmware is loaded. If the RAID Capability has * changed from the previous IOC Facts, log a warning, but only if * checking this after a Diag Reset and not during attach. */ saved_mode = sc->ir_firmware; if (sc->facts->IOCCapabilities & MPI2_IOCFACTS_CAPABILITY_INTEGRATED_RAID) sc->ir_firmware = 1; if (!attaching) { if (sc->ir_firmware != saved_mode) { mps_dprint(sc, MPS_FAULT, "%s new IR/IT mode in IOC " "Facts does not match previous mode\n", __func__); } } /* Only deallocate and reallocate if relevant IOC Facts have changed */ reallocating = FALSE; if ((!attaching) && ((saved_facts.MsgVersion != sc->facts->MsgVersion) || (saved_facts.HeaderVersion != sc->facts->HeaderVersion) || (saved_facts.MaxChainDepth != sc->facts->MaxChainDepth) || (saved_facts.RequestCredit != sc->facts->RequestCredit) || (saved_facts.ProductID != sc->facts->ProductID) || (saved_facts.IOCCapabilities != sc->facts->IOCCapabilities) || (saved_facts.IOCRequestFrameSize != sc->facts->IOCRequestFrameSize) || (saved_facts.MaxTargets != sc->facts->MaxTargets) || (saved_facts.MaxSasExpanders != sc->facts->MaxSasExpanders) || (saved_facts.MaxEnclosures != sc->facts->MaxEnclosures) || (saved_facts.HighPriorityCredit != sc->facts->HighPriorityCredit) || (saved_facts.MaxReplyDescriptorPostQueueDepth != sc->facts->MaxReplyDescriptorPostQueueDepth) || (saved_facts.ReplyFrameSize != sc->facts->ReplyFrameSize) || (saved_facts.MaxVolumes != sc->facts->MaxVolumes) || (saved_facts.MaxPersistentEntries != sc->facts->MaxPersistentEntries))) { reallocating = TRUE; } /* * Some things should be done if attaching or re-allocating after a Diag * Reset, but are not needed after a Diag Reset if the FW has not * changed. */ if (attaching || reallocating) { /* * Check if controller supports FW diag buffers and set flag to * enable each type. */ if (sc->facts->IOCCapabilities & MPI2_IOCFACTS_CAPABILITY_DIAG_TRACE_BUFFER) sc->fw_diag_buffer_list[MPI2_DIAG_BUF_TYPE_TRACE]. enabled = TRUE; if (sc->facts->IOCCapabilities & MPI2_IOCFACTS_CAPABILITY_SNAPSHOT_BUFFER) sc->fw_diag_buffer_list[MPI2_DIAG_BUF_TYPE_SNAPSHOT]. enabled = TRUE; if (sc->facts->IOCCapabilities & MPI2_IOCFACTS_CAPABILITY_EXTENDED_BUFFER) sc->fw_diag_buffer_list[MPI2_DIAG_BUF_TYPE_EXTENDED]. enabled = TRUE; /* * Set flag if EEDP is supported and if TLR is supported. */ if (sc->facts->IOCCapabilities & MPI2_IOCFACTS_CAPABILITY_EEDP) sc->eedp_enabled = TRUE; if (sc->facts->IOCCapabilities & MPI2_IOCFACTS_CAPABILITY_TLR) sc->control_TLR = TRUE; /* * Size the queues. Since the reply queues always need one free * entry, we'll just deduct one reply message here. */ sc->num_reqs = MIN(MPS_REQ_FRAMES, sc->facts->RequestCredit); sc->num_replies = MIN(MPS_REPLY_FRAMES + MPS_EVT_REPLY_FRAMES, sc->facts->MaxReplyDescriptorPostQueueDepth) - 1; /* * Initialize all Tail Queues */ TAILQ_INIT(&sc->req_list); TAILQ_INIT(&sc->high_priority_req_list); TAILQ_INIT(&sc->chain_list); TAILQ_INIT(&sc->tm_list); } /* * If doing a Diag Reset and the FW is significantly different * (reallocating will be set above in IOC Facts comparison), then all * buffers based on the IOC Facts will need to be freed before they are * reallocated. 
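 * (Only the fixed list of facts compared above - versions, credits,
 * frame sizes, queue depth, chain depth, capability bits, product ID and
 * the various maximum target/expander/enclosure/volume/persistent-entry
 * counts - can set "reallocating"; a change in, say, FWVersion alone does
 * not force the free/reallocate cycle.)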
*/ if (reallocating) { mps_iocfacts_free(sc); mpssas_realloc_targets(sc, saved_facts.MaxTargets); } /* * Any deallocation has been completed. Now start reallocating * if needed. Will only need to reallocate if attaching or if the new * IOC Facts are different from the previous IOC Facts after a Diag * Reset. Targets have already been allocated above if needed. */ if (attaching || reallocating) { if (((error = mps_alloc_queues(sc)) != 0) || ((error = mps_alloc_replies(sc)) != 0) || ((error = mps_alloc_requests(sc)) != 0)) { if (attaching ) { mps_dprint(sc, MPS_FAULT, "%s failed to alloc " "queues with error %d\n", __func__, error); mps_free(sc); return (error); } else { panic("%s failed to alloc queues with error " "%d\n", __func__, error); } } } /* Always initialize the queues */ bzero(sc->free_queue, sc->fqdepth * 4); mps_init_queues(sc); /* * Always get the chip out of the reset state, but only panic if not * attaching. If attaching and there is an error, that is handled by * the OS. */ error = mps_transition_operational(sc); if (error != 0) { if (attaching) { mps_printf(sc, "%s failed to transition to operational " "with error %d\n", __func__, error); mps_free(sc); return (error); } else { panic("%s failed to transition to operational with " "error %d\n", __func__, error); } } /* * Finish the queue initialization. * These are set here instead of in mps_init_queues() because the * IOC resets these values during the state transition in * mps_transition_operational(). The free index is set to 1 * because the corresponding index in the IOC is set to 0, and the * IOC treats the queues as full if both are set to the same value. * Hence the reason that the queue can't hold all of the possible * replies. */ sc->replypostindex = 0; mps_regwrite(sc, MPI2_REPLY_FREE_HOST_INDEX_OFFSET, sc->replyfreeindex); mps_regwrite(sc, MPI2_REPLY_POST_HOST_INDEX_OFFSET, 0); /* * Attach the subsystems so they can prepare their event masks. */ /* XXX Should be dynamic so that IM/IR and user modules can attach */ if (attaching) { if (((error = mps_attach_log(sc)) != 0) || ((error = mps_attach_sas(sc)) != 0) || ((error = mps_attach_user(sc)) != 0)) { mps_printf(sc, "%s failed to attach all subsystems: " "error %d\n", __func__, error); mps_free(sc); return (error); } if ((error = mps_pci_setup_interrupts(sc)) != 0) { mps_printf(sc, "%s failed to setup interrupts\n", __func__); mps_free(sc); return (error); } } /* * Set flag if this is a WD controller. This shouldn't ever change, but * reset it after a Diag Reset, just in case. */ sc->WD_available = FALSE; if (pci_get_device(sc->mps_dev) == MPI2_MFGPAGE_DEVID_SSS6200) sc->WD_available = TRUE; return (error); } /* * This is called if memory is being free (during detach for example) and when * buffers need to be reallocated due to a Diag Reset. 
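 *
 * Every DMA-backed buffer below is torn down in the usual busdma order;
 * as a sketch of the pattern repeated for each of them ("busaddr",
 * "vaddr", "tag" and "map" stand in for the per-buffer softc fields):
 *
 *	if (busaddr != 0)
 *		bus_dmamap_unload(tag, map);	  // undo bus_dmamap_load()
 *	if (vaddr != NULL)
 *		bus_dmamem_free(tag, vaddr, map); // undo bus_dmamem_alloc()
 *	if (tag != NULL)
 *		bus_dma_tag_destroy(tag);	  // undo bus_dma_tag_create()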
*/ static void mps_iocfacts_free(struct mps_softc *sc) { struct mps_command *cm; int i; mps_dprint(sc, MPS_TRACE, "%s\n", __func__); if (sc->free_busaddr != 0) bus_dmamap_unload(sc->queues_dmat, sc->queues_map); if (sc->free_queue != NULL) bus_dmamem_free(sc->queues_dmat, sc->free_queue, sc->queues_map); if (sc->queues_dmat != NULL) bus_dma_tag_destroy(sc->queues_dmat); if (sc->chain_busaddr != 0) bus_dmamap_unload(sc->chain_dmat, sc->chain_map); if (sc->chain_frames != NULL) bus_dmamem_free(sc->chain_dmat, sc->chain_frames, sc->chain_map); if (sc->chain_dmat != NULL) bus_dma_tag_destroy(sc->chain_dmat); if (sc->sense_busaddr != 0) bus_dmamap_unload(sc->sense_dmat, sc->sense_map); if (sc->sense_frames != NULL) bus_dmamem_free(sc->sense_dmat, sc->sense_frames, sc->sense_map); if (sc->sense_dmat != NULL) bus_dma_tag_destroy(sc->sense_dmat); if (sc->reply_busaddr != 0) bus_dmamap_unload(sc->reply_dmat, sc->reply_map); if (sc->reply_frames != NULL) bus_dmamem_free(sc->reply_dmat, sc->reply_frames, sc->reply_map); if (sc->reply_dmat != NULL) bus_dma_tag_destroy(sc->reply_dmat); if (sc->req_busaddr != 0) bus_dmamap_unload(sc->req_dmat, sc->req_map); if (sc->req_frames != NULL) bus_dmamem_free(sc->req_dmat, sc->req_frames, sc->req_map); if (sc->req_dmat != NULL) bus_dma_tag_destroy(sc->req_dmat); if (sc->chains != NULL) free(sc->chains, M_MPT2); if (sc->commands != NULL) { for (i = 1; i < sc->num_reqs; i++) { cm = &sc->commands[i]; bus_dmamap_destroy(sc->buffer_dmat, cm->cm_dmamap); } free(sc->commands, M_MPT2); } if (sc->buffer_dmat != NULL) bus_dma_tag_destroy(sc->buffer_dmat); } /* * The terms diag reset and hard reset are used interchangeably in the MPI * docs to mean resetting the controller chip. In this code diag reset * cleans everything up, and the hard reset function just sends the reset * sequence to the chip. This should probably be refactored so that every * subsystem gets a reset notification of some sort, and can clean up * appropriately. */ int mps_reinit(struct mps_softc *sc) { int error; struct mpssas_softc *sassc; sassc = sc->sassc; MPS_FUNCTRACE(sc); mtx_assert(&sc->mps_mtx, MA_OWNED); if (sc->mps_flags & MPS_FLAGS_DIAGRESET) { mps_dprint(sc, MPS_INIT, "%s reset already in progress\n", __func__); return 0; } mps_dprint(sc, MPS_INFO, "Reinitializing controller,\n"); /* make sure the completion callbacks can recognize they're getting * a NULL cm_reply due to a reset. */ sc->mps_flags |= MPS_FLAGS_DIAGRESET; /* * Mask interrupts here. */ mps_dprint(sc, MPS_INIT, "%s mask interrupts\n", __func__); mps_mask_intr(sc); error = mps_diag_reset(sc, CAN_SLEEP); if (error != 0) { /* XXXSL No need to panic here */ panic("%s hard reset failed with error %d\n", __func__, error); } /* Restore the PCI state, including the MSI-X registers */ mps_pci_restore(sc); /* Give the I/O subsystem special priority to get itself prepared */ mpssas_handle_reinit(sc); /* * Get IOC Facts and allocate all structures based on this information. * The attach function will also call mps_iocfacts_allocate at startup. * If relevant values have changed in IOC Facts, this function will free * all of the memory based on IOC Facts and reallocate that memory. */ if ((error = mps_iocfacts_allocate(sc, FALSE)) != 0) { panic("%s IOC Facts based allocation failed with error %d\n", __func__, error); } /* * Mapping structures will be re-allocated after getting IOC Page8, so * free these structures here. */ mps_mapping_exit(sc); /* * The static page function currently read is IOC Page8. Others can be * added in future. 
It's possible that the values in IOC Page8 have * changed after a Diag Reset due to user modification, so always read * these. Interrupts are masked, so unmask them before getting config * pages. */ mps_unmask_intr(sc); sc->mps_flags &= ~MPS_FLAGS_DIAGRESET; mps_base_static_config_pages(sc); /* * Some mapping info is based in IOC Page8 data, so re-initialize the * mapping tables. */ mps_mapping_initialize(sc); /* * Restart will reload the event masks clobbered by the reset, and * then enable the port. */ mps_reregister_events(sc); /* the end of discovery will release the simq, so we're done. */ mps_dprint(sc, MPS_INFO, "%s finished sc %p post %u free %u\n", __func__, sc, sc->replypostindex, sc->replyfreeindex); mpssas_release_simq_reinit(sassc); return 0; } /* Wait for the chip to ACK a word that we've put into its FIFO * Wait for seconds. In single loop wait for busy loop * for 500 microseconds. * Total is [ 0.5 * (2000 * ) ] in miliseconds. * */ static int mps_wait_db_ack(struct mps_softc *sc, int timeout, int sleep_flag) { u32 cntdn, count; u32 int_status; u32 doorbell; count = 0; cntdn = (sleep_flag == CAN_SLEEP) ? 1000*timeout : 2000*timeout; do { int_status = mps_regread(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET); if (!(int_status & MPI2_HIS_SYS2IOC_DB_STATUS)) { mps_dprint(sc, MPS_INIT, "%s: successfull count(%d), timeout(%d)\n", __func__, count, timeout); return 0; } else if (int_status & MPI2_HIS_IOC2SYS_DB_STATUS) { doorbell = mps_regread(sc, MPI2_DOORBELL_OFFSET); if ((doorbell & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_FAULT) { mps_dprint(sc, MPS_FAULT, "fault_state(0x%04x)!\n", doorbell); return (EFAULT); } } else if (int_status == 0xFFFFFFFF) goto out; /* If it can sleep, sleep for 1 milisecond, else busy loop for * 0.5 milisecond */ if (mtx_owned(&sc->mps_mtx) && sleep_flag == CAN_SLEEP) msleep(&sc->msleep_fake_chan, &sc->mps_mtx, 0, "mpsdba", hz/1000); else if (sleep_flag == CAN_SLEEP) pause("mpsdba", hz/1000); else DELAY(500); count++; } while (--cntdn); out: mps_dprint(sc, MPS_FAULT, "%s: failed due to timeout count(%d), " "int_status(%x)!\n", __func__, count, int_status); return (ETIMEDOUT); } /* Wait for the chip to signal that the next word in its FIFO can be fetched */ static int mps_wait_db_int(struct mps_softc *sc) { int retry; for (retry = 0; retry < MPS_DB_MAX_WAIT; retry++) { if ((mps_regread(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET) & MPI2_HIS_IOC2SYS_DB_STATUS) != 0) return (0); DELAY(2000); } return (ETIMEDOUT); } /* Step through the synchronous command state machine, i.e. "Doorbell mode" */ static int mps_request_sync(struct mps_softc *sc, void *req, MPI2_DEFAULT_REPLY *reply, int req_sz, int reply_sz, int timeout) { uint32_t *data32; uint16_t *data16; int i, count, ioc_sz, residual; int sleep_flags = CAN_SLEEP; if (curthread->td_no_sleeping != 0) sleep_flags = NO_SLEEP; /* Step 1 */ mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0); /* Step 2 */ if (mps_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_USED) return (EBUSY); /* Step 3 * Announce that a message is coming through the doorbell. Messages * are pushed at 32bit words, so round up if needed. 
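 * For example a 26-byte request rounds up to (26 + 3) / 4 = 7 dwords,
 * which is the count programmed into the ADD_DWORDS field of the
 * doorbell register just below.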
*/ count = (req_sz + 3) / 4; mps_regwrite(sc, MPI2_DOORBELL_OFFSET, (MPI2_FUNCTION_HANDSHAKE << MPI2_DOORBELL_FUNCTION_SHIFT) | (count << MPI2_DOORBELL_ADD_DWORDS_SHIFT)); /* Step 4 */ if (mps_wait_db_int(sc) || (mps_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_USED) == 0) { mps_dprint(sc, MPS_FAULT, "Doorbell failed to activate\n"); return (ENXIO); } mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0); if (mps_wait_db_ack(sc, 5, sleep_flags) != 0) { mps_dprint(sc, MPS_FAULT, "Doorbell handshake failed\n"); return (ENXIO); } /* Step 5 */ /* Clock out the message data synchronously in 32-bit dwords*/ data32 = (uint32_t *)req; for (i = 0; i < count; i++) { mps_regwrite(sc, MPI2_DOORBELL_OFFSET, htole32(data32[i])); if (mps_wait_db_ack(sc, 5, sleep_flags) != 0) { mps_dprint(sc, MPS_FAULT, "Timeout while writing doorbell\n"); return (ENXIO); } } /* Step 6 */ /* Clock in the reply in 16-bit words. The total length of the * message is always in the 4th byte, so clock out the first 2 words * manually, then loop the rest. */ data16 = (uint16_t *)reply; if (mps_wait_db_int(sc) != 0) { mps_dprint(sc, MPS_FAULT, "Timeout reading doorbell 0\n"); return (ENXIO); } data16[0] = mps_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_DATA_MASK; mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0); if (mps_wait_db_int(sc) != 0) { mps_dprint(sc, MPS_FAULT, "Timeout reading doorbell 1\n"); return (ENXIO); } data16[1] = mps_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_DATA_MASK; mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0); /* Number of 32bit words in the message */ ioc_sz = reply->MsgLength; /* * Figure out how many 16bit words to clock in without overrunning. * The precision loss with dividing reply_sz can safely be * ignored because the messages can only be multiples of 32bits. */ residual = 0; count = MIN((reply_sz / 4), ioc_sz) * 2; if (count < ioc_sz * 2) { residual = ioc_sz * 2 - count; mps_dprint(sc, MPS_ERROR, "Driver error, throwing away %d " "residual message words\n", residual); } for (i = 2; i < count; i++) { if (mps_wait_db_int(sc) != 0) { mps_dprint(sc, MPS_FAULT, "Timeout reading doorbell %d\n", i); return (ENXIO); } data16[i] = mps_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_DATA_MASK; mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0); } /* * Pull out residual words that won't fit into the provided buffer. * This keeps the chip from hanging due to a driver programming * error. 
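 * For example, an 8-dword reply (ioc_sz = 8) read into a 16-byte caller
 * buffer gives count = MIN(16 / 4, 8) * 2 = 8 sixteen-bit words actually
 * stored, leaving residual = 8 * 2 - 8 = 8 words to be read and discarded
 * below.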
*/ while (residual--) { if (mps_wait_db_int(sc) != 0) { mps_dprint(sc, MPS_FAULT, "Timeout reading doorbell\n"); return (ENXIO); } (void)mps_regread(sc, MPI2_DOORBELL_OFFSET); mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0); } /* Step 7 */ if (mps_wait_db_int(sc) != 0) { mps_dprint(sc, MPS_FAULT, "Timeout waiting to exit doorbell\n"); return (ENXIO); } if (mps_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_USED) mps_dprint(sc, MPS_FAULT, "Warning, doorbell still active\n"); mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0); return (0); } static void mps_enqueue_request(struct mps_softc *sc, struct mps_command *cm) { reply_descriptor rd; MPS_FUNCTRACE(sc); mps_dprint(sc, MPS_TRACE, "SMID %u cm %p ccb %p\n", cm->cm_desc.Default.SMID, cm, cm->cm_ccb); if (sc->mps_flags & MPS_FLAGS_ATTACH_DONE && !(sc->mps_flags & MPS_FLAGS_SHUTDOWN)) mtx_assert(&sc->mps_mtx, MA_OWNED); if (++sc->io_cmds_active > sc->io_cmds_highwater) sc->io_cmds_highwater++; rd.u.low = cm->cm_desc.Words.Low; rd.u.high = cm->cm_desc.Words.High; rd.word = htole64(rd.word); /* TODO-We may need to make below regwrite atomic */ mps_regwrite(sc, MPI2_REQUEST_DESCRIPTOR_POST_LOW_OFFSET, rd.u.low); mps_regwrite(sc, MPI2_REQUEST_DESCRIPTOR_POST_HIGH_OFFSET, rd.u.high); } /* * Just the FACTS, ma'am. */ static int mps_get_iocfacts(struct mps_softc *sc, MPI2_IOC_FACTS_REPLY *facts) { MPI2_DEFAULT_REPLY *reply; MPI2_IOC_FACTS_REQUEST request; int error, req_sz, reply_sz; MPS_FUNCTRACE(sc); req_sz = sizeof(MPI2_IOC_FACTS_REQUEST); reply_sz = sizeof(MPI2_IOC_FACTS_REPLY); reply = (MPI2_DEFAULT_REPLY *)facts; bzero(&request, req_sz); request.Function = MPI2_FUNCTION_IOC_FACTS; error = mps_request_sync(sc, &request, reply, req_sz, reply_sz, 5); return (error); } static int mps_send_iocinit(struct mps_softc *sc) { MPI2_IOC_INIT_REQUEST init; MPI2_DEFAULT_REPLY reply; int req_sz, reply_sz, error; struct timeval now; uint64_t time_in_msec; MPS_FUNCTRACE(sc); req_sz = sizeof(MPI2_IOC_INIT_REQUEST); reply_sz = sizeof(MPI2_IOC_INIT_REPLY); bzero(&init, req_sz); bzero(&reply, reply_sz); /* * Fill in the init block. Note that most addresses are * deliberately in the lower 32bits of memory. This is a micro- * optimzation for PCI/PCIX, though it's not clear if it helps PCIe. 
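 * All multi-byte fields below go through htole16()/htole32(), i.e. the
 * IOC expects the message in little-endian layout; e.g. the millisecond
 * timestamp from getmicrotime() is split as
 *
 *	init.TimeStamp.High = htole32((time_in_msec >> 32) & 0xFFFFFFFF);
 *	init.TimeStamp.Low = htole32(time_in_msec & 0xFFFFFFFF);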
*/ init.Function = MPI2_FUNCTION_IOC_INIT; init.WhoInit = MPI2_WHOINIT_HOST_DRIVER; init.MsgVersion = htole16(MPI2_VERSION); init.HeaderVersion = htole16(MPI2_HEADER_VERSION); init.SystemRequestFrameSize = htole16(sc->facts->IOCRequestFrameSize); init.ReplyDescriptorPostQueueDepth = htole16(sc->pqdepth); init.ReplyFreeQueueDepth = htole16(sc->fqdepth); init.SenseBufferAddressHigh = 0; init.SystemReplyAddressHigh = 0; init.SystemRequestFrameBaseAddress.High = 0; init.SystemRequestFrameBaseAddress.Low = htole32((uint32_t)sc->req_busaddr); init.ReplyDescriptorPostQueueAddress.High = 0; init.ReplyDescriptorPostQueueAddress.Low = htole32((uint32_t)sc->post_busaddr); init.ReplyFreeQueueAddress.High = 0; init.ReplyFreeQueueAddress.Low = htole32((uint32_t)sc->free_busaddr); getmicrotime(&now); time_in_msec = (now.tv_sec * 1000 + now.tv_usec/1000); init.TimeStamp.High = htole32((time_in_msec >> 32) & 0xFFFFFFFF); init.TimeStamp.Low = htole32(time_in_msec & 0xFFFFFFFF); error = mps_request_sync(sc, &init, &reply, req_sz, reply_sz, 5); if ((reply.IOCStatus & MPI2_IOCSTATUS_MASK) != MPI2_IOCSTATUS_SUCCESS) error = ENXIO; mps_dprint(sc, MPS_INIT, "IOCInit status= 0x%x\n", reply.IOCStatus); return (error); } void mps_memaddr_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error) { bus_addr_t *addr; addr = arg; *addr = segs[0].ds_addr; } static int mps_alloc_queues(struct mps_softc *sc) { bus_addr_t queues_busaddr; uint8_t *queues; int qsize, fqsize, pqsize; /* * The reply free queue contains 4 byte entries in multiples of 16 and * aligned on a 16 byte boundary. There must always be an unused entry. * This queue supplies fresh reply frames for the firmware to use. * * The reply descriptor post queue contains 8 byte entries in * multiples of 16 and aligned on a 16 byte boundary. This queue * contains filled-in reply frames sent from the firmware to the host. * * These two queues are allocated together for simplicity. */ sc->fqdepth = roundup2((sc->num_replies + 1), 16); sc->pqdepth = roundup2((sc->num_replies + 1), 16); fqsize= sc->fqdepth * 4; pqsize = sc->pqdepth * 8; qsize = fqsize + pqsize; if (bus_dma_tag_create( sc->mps_parent_dmat, /* parent */ 16, 0, /* algnmnt, boundary */ BUS_SPACE_MAXADDR_32BIT,/* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ qsize, /* maxsize */ 1, /* nsegments */ qsize, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &sc->queues_dmat)) { device_printf(sc->mps_dev, "Cannot allocate queues DMA tag\n"); return (ENOMEM); } if (bus_dmamem_alloc(sc->queues_dmat, (void **)&queues, BUS_DMA_NOWAIT, &sc->queues_map)) { device_printf(sc->mps_dev, "Cannot allocate queues memory\n"); return (ENOMEM); } bzero(queues, qsize); bus_dmamap_load(sc->queues_dmat, sc->queues_map, queues, qsize, mps_memaddr_cb, &queues_busaddr, 0); sc->free_queue = (uint32_t *)queues; sc->free_busaddr = queues_busaddr; sc->post_queue = (MPI2_REPLY_DESCRIPTORS_UNION *)(queues + fqsize); sc->post_busaddr = queues_busaddr + fqsize; return (0); } static int mps_alloc_replies(struct mps_softc *sc) { int rsize, num_replies; /* * sc->num_replies should be one less than sc->fqdepth. We need to * allocate space for sc->fqdepth replies, but only sc->num_replies * replies can be used at once. 
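 * As a hypothetical sizing example: with num_replies = 100,
 * mps_alloc_queues() above used fqdepth = roundup2(100 + 1, 16) = 112, so
 * 112 reply frames get allocated here even though at most 100 can be in
 * use at any one time.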
*/ num_replies = max(sc->fqdepth, sc->num_replies); rsize = sc->facts->ReplyFrameSize * num_replies * 4; if (bus_dma_tag_create( sc->mps_parent_dmat, /* parent */ 4, 0, /* algnmnt, boundary */ BUS_SPACE_MAXADDR_32BIT,/* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ rsize, /* maxsize */ 1, /* nsegments */ rsize, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &sc->reply_dmat)) { device_printf(sc->mps_dev, "Cannot allocate replies DMA tag\n"); return (ENOMEM); } if (bus_dmamem_alloc(sc->reply_dmat, (void **)&sc->reply_frames, BUS_DMA_NOWAIT, &sc->reply_map)) { device_printf(sc->mps_dev, "Cannot allocate replies memory\n"); return (ENOMEM); } bzero(sc->reply_frames, rsize); bus_dmamap_load(sc->reply_dmat, sc->reply_map, sc->reply_frames, rsize, mps_memaddr_cb, &sc->reply_busaddr, 0); return (0); } static int mps_alloc_requests(struct mps_softc *sc) { struct mps_command *cm; struct mps_chain *chain; int i, rsize, nsegs; rsize = sc->facts->IOCRequestFrameSize * sc->num_reqs * 4; if (bus_dma_tag_create( sc->mps_parent_dmat, /* parent */ 16, 0, /* algnmnt, boundary */ BUS_SPACE_MAXADDR_32BIT,/* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ rsize, /* maxsize */ 1, /* nsegments */ rsize, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &sc->req_dmat)) { device_printf(sc->mps_dev, "Cannot allocate request DMA tag\n"); return (ENOMEM); } if (bus_dmamem_alloc(sc->req_dmat, (void **)&sc->req_frames, BUS_DMA_NOWAIT, &sc->req_map)) { device_printf(sc->mps_dev, "Cannot allocate request memory\n"); return (ENOMEM); } bzero(sc->req_frames, rsize); bus_dmamap_load(sc->req_dmat, sc->req_map, sc->req_frames, rsize, mps_memaddr_cb, &sc->req_busaddr, 0); rsize = sc->facts->IOCRequestFrameSize * sc->max_chains * 4; if (bus_dma_tag_create( sc->mps_parent_dmat, /* parent */ 16, 0, /* algnmnt, boundary */ BUS_SPACE_MAXADDR_32BIT,/* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ rsize, /* maxsize */ 1, /* nsegments */ rsize, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &sc->chain_dmat)) { device_printf(sc->mps_dev, "Cannot allocate chain DMA tag\n"); return (ENOMEM); } if (bus_dmamem_alloc(sc->chain_dmat, (void **)&sc->chain_frames, BUS_DMA_NOWAIT, &sc->chain_map)) { device_printf(sc->mps_dev, "Cannot allocate chain memory\n"); return (ENOMEM); } bzero(sc->chain_frames, rsize); bus_dmamap_load(sc->chain_dmat, sc->chain_map, sc->chain_frames, rsize, mps_memaddr_cb, &sc->chain_busaddr, 0); rsize = MPS_SENSE_LEN * sc->num_reqs; if (bus_dma_tag_create( sc->mps_parent_dmat, /* parent */ 1, 0, /* algnmnt, boundary */ BUS_SPACE_MAXADDR_32BIT,/* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ rsize, /* maxsize */ 1, /* nsegments */ rsize, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &sc->sense_dmat)) { device_printf(sc->mps_dev, "Cannot allocate sense DMA tag\n"); return (ENOMEM); } if (bus_dmamem_alloc(sc->sense_dmat, (void **)&sc->sense_frames, BUS_DMA_NOWAIT, &sc->sense_map)) { device_printf(sc->mps_dev, "Cannot allocate sense memory\n"); return (ENOMEM); } bzero(sc->sense_frames, rsize); bus_dmamap_load(sc->sense_dmat, sc->sense_map, sc->sense_frames, rsize, mps_memaddr_cb, &sc->sense_busaddr, 0); sc->chains = malloc(sizeof(struct mps_chain) * sc->max_chains, M_MPT2, M_WAITOK | M_ZERO); if(!sc->chains) { device_printf(sc->mps_dev, "Cannot allocate chains memory %s %d\n", __func__, __LINE__); return 
(ENOMEM); } for (i = 0; i < sc->max_chains; i++) { chain = &sc->chains[i]; chain->chain = (MPI2_SGE_IO_UNION *)(sc->chain_frames + i * sc->facts->IOCRequestFrameSize * 4); chain->chain_busaddr = sc->chain_busaddr + i * sc->facts->IOCRequestFrameSize * 4; mps_free_chain(sc, chain); sc->chain_free_lowwater++; } /* XXX Need to pick a more precise value */ nsegs = (MAXPHYS / PAGE_SIZE) + 1; if (bus_dma_tag_create( sc->mps_parent_dmat, /* parent */ 1, 0, /* algnmnt, boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ BUS_SPACE_MAXSIZE_32BIT,/* maxsize */ nsegs, /* nsegments */ BUS_SPACE_MAXSIZE_24BIT,/* maxsegsize */ BUS_DMA_ALLOCNOW, /* flags */ busdma_lock_mutex, /* lockfunc */ &sc->mps_mtx, /* lockarg */ &sc->buffer_dmat)) { device_printf(sc->mps_dev, "Cannot allocate buffer DMA tag\n"); return (ENOMEM); } /* * SMID 0 cannot be used as a free command per the firmware spec. * Just drop that command instead of risking accounting bugs. */ sc->commands = malloc(sizeof(struct mps_command) * sc->num_reqs, M_MPT2, M_WAITOK | M_ZERO); if(!sc->commands) { device_printf(sc->mps_dev, "Cannot allocate memory %s %d\n", __func__, __LINE__); return (ENOMEM); } for (i = 1; i < sc->num_reqs; i++) { cm = &sc->commands[i]; cm->cm_req = sc->req_frames + i * sc->facts->IOCRequestFrameSize * 4; cm->cm_req_busaddr = sc->req_busaddr + i * sc->facts->IOCRequestFrameSize * 4; cm->cm_sense = &sc->sense_frames[i]; cm->cm_sense_busaddr = sc->sense_busaddr + i * MPS_SENSE_LEN; cm->cm_desc.Default.SMID = i; cm->cm_sc = sc; TAILQ_INIT(&cm->cm_chain_list); callout_init_mtx(&cm->cm_callout, &sc->mps_mtx, 0); /* XXX Is a failure here a critical problem? */ if (bus_dmamap_create(sc->buffer_dmat, 0, &cm->cm_dmamap) == 0) if (i <= sc->facts->HighPriorityCredit) mps_free_high_priority_command(sc, cm); else mps_free_command(sc, cm); else { panic("failed to allocate command %d\n", i); sc->num_reqs = i; break; } } return (0); } static int mps_init_queues(struct mps_softc *sc) { int i; memset((uint8_t *)sc->post_queue, 0xff, sc->pqdepth * 8); /* * According to the spec, we need to use one less reply than we * have space for on the queue. So sc->num_replies (the number we * use) should be less than sc->fqdepth (allocated size). */ if (sc->num_replies >= sc->fqdepth) return (EINVAL); /* * Initialize all of the free queue entries. */ for (i = 0; i < sc->fqdepth; i++) sc->free_queue[i] = sc->reply_busaddr + (i * sc->facts->ReplyFrameSize * 4); sc->replyfreeindex = sc->num_replies; return (0); } /* Get the driver parameter tunables. Lowest priority are the driver defaults. * Next are the global settings, if they exist. Highest are the per-unit * settings, if they exist. */ static void mps_get_tunables(struct mps_softc *sc) { char tmpstr[80]; /* XXX default to some debugging for now */ sc->mps_debug = MPS_INFO|MPS_FAULT; sc->disable_msix = 0; sc->disable_msi = 0; sc->max_chains = MPS_CHAIN_FRAMES; /* * Grab the global variables. 
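 *
 * For illustration (example values, not defaults), these knobs are
 * normally set from /boot/loader.conf, with the per-unit dev.mps.<unit>.*
 * variants fetched below overriding the global hw.mps.* ones:
 *
 *     hw.mps.max_chains="4096"
 *     hw.mps.disable_msix="1"
 *     dev.mps.0.debug_level="0x3"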
*/ TUNABLE_INT_FETCH("hw.mps.debug_level", &sc->mps_debug); TUNABLE_INT_FETCH("hw.mps.disable_msix", &sc->disable_msix); TUNABLE_INT_FETCH("hw.mps.disable_msi", &sc->disable_msi); TUNABLE_INT_FETCH("hw.mps.max_chains", &sc->max_chains); /* Grab the unit-instance variables */ snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.debug_level", device_get_unit(sc->mps_dev)); TUNABLE_INT_FETCH(tmpstr, &sc->mps_debug); snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.disable_msix", device_get_unit(sc->mps_dev)); TUNABLE_INT_FETCH(tmpstr, &sc->disable_msix); snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.disable_msi", device_get_unit(sc->mps_dev)); TUNABLE_INT_FETCH(tmpstr, &sc->disable_msi); snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.max_chains", device_get_unit(sc->mps_dev)); TUNABLE_INT_FETCH(tmpstr, &sc->max_chains); bzero(sc->exclude_ids, sizeof(sc->exclude_ids)); snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.exclude_ids", device_get_unit(sc->mps_dev)); TUNABLE_STR_FETCH(tmpstr, sc->exclude_ids, sizeof(sc->exclude_ids)); } static void mps_setup_sysctl(struct mps_softc *sc) { struct sysctl_ctx_list *sysctl_ctx = NULL; struct sysctl_oid *sysctl_tree = NULL; char tmpstr[80], tmpstr2[80]; /* * Setup the sysctl variable so the user can change the debug level * on the fly. */ snprintf(tmpstr, sizeof(tmpstr), "MPS controller %d", device_get_unit(sc->mps_dev)); snprintf(tmpstr2, sizeof(tmpstr2), "%d", device_get_unit(sc->mps_dev)); sysctl_ctx = device_get_sysctl_ctx(sc->mps_dev); if (sysctl_ctx != NULL) sysctl_tree = device_get_sysctl_tree(sc->mps_dev); if (sysctl_tree == NULL) { sysctl_ctx_init(&sc->sysctl_ctx); sc->sysctl_tree = SYSCTL_ADD_NODE(&sc->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_hw_mps), OID_AUTO, tmpstr2, CTLFLAG_RD, 0, tmpstr); if (sc->sysctl_tree == NULL) return; sysctl_ctx = &sc->sysctl_ctx; sysctl_tree = sc->sysctl_tree; } SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "debug_level", CTLFLAG_RW, &sc->mps_debug, 0, "mps debug level"); SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "disable_msix", CTLFLAG_RD, &sc->disable_msix, 0, "Disable the use of MSI-X interrupts"); SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "disable_msi", CTLFLAG_RD, &sc->disable_msi, 0, "Disable the use of MSI interrupts"); SYSCTL_ADD_STRING(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "firmware_version", CTLFLAG_RW, sc->fw_version, strlen(sc->fw_version), "firmware version"); SYSCTL_ADD_STRING(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "driver_version", CTLFLAG_RW, MPS_DRIVER_VERSION, strlen(MPS_DRIVER_VERSION), "driver version"); SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "io_cmds_active", CTLFLAG_RD, &sc->io_cmds_active, 0, "number of currently active commands"); SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "io_cmds_highwater", CTLFLAG_RD, &sc->io_cmds_highwater, 0, "maximum active commands seen"); SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "chain_free", CTLFLAG_RD, &sc->chain_free, 0, "number of free chain elements"); SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "chain_free_lowwater", CTLFLAG_RD, &sc->chain_free_lowwater, 0,"lowest number of free chain elements"); SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "max_chains", CTLFLAG_RD, &sc->max_chains, 0,"maximum chain frames that will be allocated"); #if __FreeBSD_version >= 900030 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "chain_alloc_fail", CTLFLAG_RD, 
&sc->chain_alloc_fail, "chain allocation failures"); #endif //FreeBSD_version >= 900030 } int mps_attach(struct mps_softc *sc) { int error; mps_get_tunables(sc); MPS_FUNCTRACE(sc); mtx_init(&sc->mps_mtx, "MPT2SAS lock", NULL, MTX_DEF); callout_init_mtx(&sc->periodic, &sc->mps_mtx, 0); TAILQ_INIT(&sc->event_list); timevalclear(&sc->lastfail); if ((error = mps_transition_ready(sc)) != 0) { mps_printf(sc, "%s failed to transition ready\n", __func__); return (error); } sc->facts = malloc(sizeof(MPI2_IOC_FACTS_REPLY), M_MPT2, M_ZERO|M_NOWAIT); if(!sc->facts) { device_printf(sc->mps_dev, "Cannot allocate memory %s %d\n", __func__, __LINE__); return (ENOMEM); } /* * Get IOC Facts and allocate all structures based on this information. * A Diag Reset will also call mps_iocfacts_allocate and re-read the IOC * Facts. If relevant values have changed in IOC Facts, this function * will free all of the memory based on IOC Facts and reallocate that * memory. If this fails, any allocated memory should already be freed. */ if ((error = mps_iocfacts_allocate(sc, TRUE)) != 0) { mps_dprint(sc, MPS_FAULT, "%s IOC Facts based allocation " "failed with error %d\n", __func__, error); return (error); } /* Start the periodic watchdog check on the IOC Doorbell */ mps_periodic(sc); /* * The portenable will kick off discovery events that will drive the * rest of the initialization process. The CAM/SAS module will * hold up the boot sequence until discovery is complete. */ sc->mps_ich.ich_func = mps_startup; sc->mps_ich.ich_arg = sc; if (config_intrhook_establish(&sc->mps_ich) != 0) { mps_dprint(sc, MPS_ERROR, "Cannot establish MPS config hook\n"); error = EINVAL; } /* * Allow IR to shutdown gracefully when shutdown occurs. */ sc->shutdown_eh = EVENTHANDLER_REGISTER(shutdown_final, mpssas_ir_shutdown, sc, SHUTDOWN_PRI_DEFAULT); if (sc->shutdown_eh == NULL) mps_dprint(sc, MPS_ERROR, "shutdown event registration " "failed\n"); mps_setup_sysctl(sc); sc->mps_flags |= MPS_FLAGS_ATTACH_DONE; return (error); } /* Run through any late-start handlers. */ static void mps_startup(void *arg) { struct mps_softc *sc; sc = (struct mps_softc *)arg; mps_lock(sc); mps_unmask_intr(sc); /* initialize device mapping tables */ mps_base_static_config_pages(sc); mps_mapping_initialize(sc); mpssas_startup(sc); mps_unlock(sc); } /* Periodic watchdog. Is called with the driver lock already held. 
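 * (An illustrative reading of the check below, using a made-up doorbell
 * value: if mps_regread() returned 0x40002622, the high nibble would
 * match MPI2_IOC_STATE_FAULT, the low 16 bits would be the firmware
 * fault code 0x2622 per the MPI2 spec, and the watchdog would log the
 * fault and call mps_reinit().)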
*/ static void mps_periodic(void *arg) { struct mps_softc *sc; uint32_t db; sc = (struct mps_softc *)arg; if (sc->mps_flags & MPS_FLAGS_SHUTDOWN) return; db = mps_regread(sc, MPI2_DOORBELL_OFFSET); if ((db & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_FAULT) { mps_dprint(sc, MPS_FAULT, "IOC Fault 0x%08x, Resetting\n", db); mps_reinit(sc); } callout_reset(&sc->periodic, MPS_PERIODIC_DELAY * hz, mps_periodic, sc); } static void mps_log_evt_handler(struct mps_softc *sc, uintptr_t data, MPI2_EVENT_NOTIFICATION_REPLY *event) { MPI2_EVENT_DATA_LOG_ENTRY_ADDED *entry; mps_print_event(sc, event); switch (event->Event) { case MPI2_EVENT_LOG_DATA: mps_dprint(sc, MPS_EVENT, "MPI2_EVENT_LOG_DATA:\n"); if (sc->mps_debug & MPS_EVENT) hexdump(event->EventData, event->EventDataLength, NULL, 0); break; case MPI2_EVENT_LOG_ENTRY_ADDED: entry = (MPI2_EVENT_DATA_LOG_ENTRY_ADDED *)event->EventData; mps_dprint(sc, MPS_EVENT, "MPI2_EVENT_LOG_ENTRY_ADDED event " "0x%x Sequence %d:\n", entry->LogEntryQualifier, entry->LogSequence); break; default: break; } return; } static int mps_attach_log(struct mps_softc *sc) { u32 events[MPI2_EVENT_NOTIFY_EVENTMASK_WORDS]; bzero(events, 16); setbit(events, MPI2_EVENT_LOG_DATA); setbit(events, MPI2_EVENT_LOG_ENTRY_ADDED); mps_register_events(sc, events, mps_log_evt_handler, NULL, &sc->mps_log_eh); return (0); } static int mps_detach_log(struct mps_softc *sc) { if (sc->mps_log_eh != NULL) mps_deregister_events(sc, sc->mps_log_eh); return (0); } /* * Free all of the driver resources and detach submodules. Should be called * without the lock held. */ int mps_free(struct mps_softc *sc) { int error; /* Turn off the watchdog */ mps_lock(sc); sc->mps_flags |= MPS_FLAGS_SHUTDOWN; mps_unlock(sc); /* Lock must not be held for this */ callout_drain(&sc->periodic); if (((error = mps_detach_log(sc)) != 0) || ((error = mps_detach_sas(sc)) != 0)) return (error); mps_detach_user(sc); /* Put the IOC back in the READY state. */ mps_lock(sc); if ((error = mps_transition_ready(sc)) != 0) { mps_unlock(sc); return (error); } mps_unlock(sc); if (sc->facts != NULL) free(sc->facts, M_MPT2); /* * Free all buffers that are based on IOC Facts. A Diag Reset may need * to free these buffers too. 
*/ mps_iocfacts_free(sc); if (sc->sysctl_tree != NULL) sysctl_ctx_free(&sc->sysctl_ctx); /* Deregister the shutdown function */ if (sc->shutdown_eh != NULL) EVENTHANDLER_DEREGISTER(shutdown_final, sc->shutdown_eh); mtx_destroy(&sc->mps_mtx); return (0); } static __inline void mps_complete_command(struct mps_softc *sc, struct mps_command *cm) { MPS_FUNCTRACE(sc); if (cm == NULL) { mps_dprint(sc, MPS_ERROR, "Completing NULL command\n"); return; } if (cm->cm_flags & MPS_CM_FLAGS_POLLED) cm->cm_flags |= MPS_CM_FLAGS_COMPLETE; if (cm->cm_complete != NULL) { mps_dprint(sc, MPS_TRACE, "%s cm %p calling cm_complete %p data %p reply %p\n", __func__, cm, cm->cm_complete, cm->cm_complete_data, cm->cm_reply); cm->cm_complete(sc, cm); } if (cm->cm_flags & MPS_CM_FLAGS_WAKEUP) { mps_dprint(sc, MPS_TRACE, "waking up %p\n", cm); wakeup(cm); } if (cm->cm_sc->io_cmds_active != 0) { cm->cm_sc->io_cmds_active--; } else { mps_dprint(sc, MPS_ERROR, "Warning: io_cmds_active is " "out of sync - resynching to 0\n"); } } static void mps_sas_log_info(struct mps_softc *sc , u32 log_info) { union loginfo_type { u32 loginfo; struct { u32 subcode:16; u32 code:8; u32 originator:4; u32 bus_type:4; } dw; }; union loginfo_type sas_loginfo; char *originator_str = NULL; sas_loginfo.loginfo = log_info; if (sas_loginfo.dw.bus_type != 3 /*SAS*/) return; /* each nexus loss loginfo */ if (log_info == 0x31170000) return; /* eat the loginfos associated with task aborts */ if ((log_info == 30050000 || log_info == 0x31140000 || log_info == 0x31130000)) return; switch (sas_loginfo.dw.originator) { case 0: originator_str = "IOP"; break; case 1: originator_str = "PL"; break; case 2: originator_str = "IR"; break; } mps_dprint(sc, MPS_LOG, "log_info(0x%08x): originator(%s), " "code(0x%02x), sub_code(0x%04x)\n", log_info, originator_str, sas_loginfo.dw.code, sas_loginfo.dw.subcode); } static void mps_display_reply_info(struct mps_softc *sc, uint8_t *reply) { MPI2DefaultReply_t *mpi_reply; u16 sc_status; mpi_reply = (MPI2DefaultReply_t*)reply; sc_status = le16toh(mpi_reply->IOCStatus); if (sc_status & MPI2_IOCSTATUS_FLAG_LOG_INFO_AVAILABLE) mps_sas_log_info(sc, le32toh(mpi_reply->IOCLogInfo)); } void mps_intr(void *data) { struct mps_softc *sc; uint32_t status; sc = (struct mps_softc *)data; mps_dprint(sc, MPS_TRACE, "%s\n", __func__); /* * Check interrupt status register to flush the bus. This is * needed for both INTx interrupts and driver-driven polling */ status = mps_regread(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET); if ((status & MPI2_HIS_REPLY_DESCRIPTOR_INTERRUPT) == 0) return; mps_lock(sc); mps_intr_locked(data); mps_unlock(sc); return; } /* * In theory, MSI/MSIX interrupts shouldn't need to read any registers on the * chip. Hopefully this theory is correct. */ void mps_intr_msi(void *data) { struct mps_softc *sc; sc = (struct mps_softc *)data; mps_dprint(sc, MPS_TRACE, "%s\n", __func__); mps_lock(sc); mps_intr_locked(data); mps_unlock(sc); return; } /* * The locking is overly broad and simplistic, but easy to deal with for now. 
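 *
 * A rough sketch of the reply-post ring walk done below (illustrative
 * pseudocode, not a substitute for the real loop):
 *
 *     while (descriptor at sc->replypostindex is not UNUSED/0xffffffff) {
 *         advance replypostindex, wrapping at sc->pqdepth;
 *         complete the command named by the descriptor's SMID, or
 *             dispatch the event/diag reply it addresses;
 *         mark the descriptor slot 0xffffffff again;
 *     }
 *     if anything was consumed, write replypostindex back to the
 *         MPI2_REPLY_POST_HOST_INDEX register so the IOC can reuse
 *         the slots.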
*/ void mps_intr_locked(void *data) { MPI2_REPLY_DESCRIPTORS_UNION *desc; struct mps_softc *sc; struct mps_command *cm = NULL; uint8_t flags; u_int pq; MPI2_DIAG_RELEASE_REPLY *rel_rep; mps_fw_diagnostic_buffer_t *pBuffer; sc = (struct mps_softc *)data; pq = sc->replypostindex; mps_dprint(sc, MPS_TRACE, "%s sc %p starting with replypostindex %u\n", __func__, sc, sc->replypostindex); for ( ;; ) { cm = NULL; desc = &sc->post_queue[sc->replypostindex]; flags = desc->Default.ReplyFlags & MPI2_RPY_DESCRIPT_FLAGS_TYPE_MASK; if ((flags == MPI2_RPY_DESCRIPT_FLAGS_UNUSED) || (le32toh(desc->Words.High) == 0xffffffff)) break; /* increment the replypostindex now, so that event handlers * and cm completion handlers which decide to do a diag * reset can zero it without it getting incremented again * afterwards, and we break out of this loop on the next * iteration since the reply post queue has been cleared to * 0xFF and all descriptors look unused (which they are). */ if (++sc->replypostindex >= sc->pqdepth) sc->replypostindex = 0; switch (flags) { case MPI2_RPY_DESCRIPT_FLAGS_SCSI_IO_SUCCESS: cm = &sc->commands[le16toh(desc->SCSIIOSuccess.SMID)]; cm->cm_reply = NULL; break; case MPI2_RPY_DESCRIPT_FLAGS_ADDRESS_REPLY: { uint32_t baddr; uint8_t *reply; /* * Re-compose the reply address from the address * sent back from the chip. The ReplyFrameAddress * is the lower 32 bits of the physical address of * particular reply frame. Convert that address to * host format, and then use that to provide the * offset against the virtual address base * (sc->reply_frames). */ baddr = le32toh(desc->AddressReply.ReplyFrameAddress); reply = sc->reply_frames + (baddr - ((uint32_t)sc->reply_busaddr)); /* * Make sure the reply we got back is in a valid * range. If not, go ahead and panic here, since * we'll probably panic as soon as we deference the * reply pointer anyway. */ if ((reply < sc->reply_frames) || (reply > (sc->reply_frames + (sc->fqdepth * sc->facts->ReplyFrameSize * 4)))) { printf("%s: WARNING: reply %p out of range!\n", __func__, reply); printf("%s: reply_frames %p, fqdepth %d, " "frame size %d\n", __func__, sc->reply_frames, sc->fqdepth, sc->facts->ReplyFrameSize * 4); printf("%s: baddr %#x,\n", __func__, baddr); /* LSI-TODO. See Linux Code. Need Gracefull exit*/ panic("Reply address out of range"); } if (le16toh(desc->AddressReply.SMID) == 0) { if (((MPI2_DEFAULT_REPLY *)reply)->Function == MPI2_FUNCTION_DIAG_BUFFER_POST) { /* * If SMID is 0 for Diag Buffer Post, * this implies that the reply is due to * a release function with a status that * the buffer has been released. Set * the buffer flags accordingly. 
*/ rel_rep = (MPI2_DIAG_RELEASE_REPLY *)reply; if (le16toh(rel_rep->IOCStatus) == MPI2_IOCSTATUS_DIAGNOSTIC_RELEASED) { pBuffer = &sc->fw_diag_buffer_list[ rel_rep->BufferType]; pBuffer->valid_data = TRUE; pBuffer->owned_by_firmware = FALSE; pBuffer->immediate = FALSE; } } else mps_dispatch_event(sc, baddr, (MPI2_EVENT_NOTIFICATION_REPLY *) reply); } else { cm = &sc->commands[le16toh(desc->AddressReply.SMID)]; cm->cm_reply = reply; cm->cm_reply_data = le32toh(desc->AddressReply.ReplyFrameAddress); } break; } case MPI2_RPY_DESCRIPT_FLAGS_TARGETASSIST_SUCCESS: case MPI2_RPY_DESCRIPT_FLAGS_TARGET_COMMAND_BUFFER: case MPI2_RPY_DESCRIPT_FLAGS_RAID_ACCELERATOR_SUCCESS: default: /* Unhandled */ mps_dprint(sc, MPS_ERROR, "Unhandled reply 0x%x\n", desc->Default.ReplyFlags); cm = NULL; break; } if (cm != NULL) { // Print Error reply frame if (cm->cm_reply) mps_display_reply_info(sc,cm->cm_reply); mps_complete_command(sc, cm); } desc->Words.Low = 0xffffffff; desc->Words.High = 0xffffffff; } if (pq != sc->replypostindex) { mps_dprint(sc, MPS_TRACE, "%s sc %p writing postindex %d\n", __func__, sc, sc->replypostindex); mps_regwrite(sc, MPI2_REPLY_POST_HOST_INDEX_OFFSET, sc->replypostindex); } return; } static void mps_dispatch_event(struct mps_softc *sc, uintptr_t data, MPI2_EVENT_NOTIFICATION_REPLY *reply) { struct mps_event_handle *eh; int event, handled = 0; event = le16toh(reply->Event); TAILQ_FOREACH(eh, &sc->event_list, eh_list) { if (isset(eh->mask, event)) { eh->callback(sc, data, reply); handled++; } } if (handled == 0) mps_dprint(sc, MPS_EVENT, "Unhandled event 0x%x\n", le16toh(event)); /* * This is the only place that the event/reply should be freed. * Anything wanting to hold onto the event data should have * already copied it into their own storage. */ mps_free_reply(sc, data); } static void mps_reregister_events_complete(struct mps_softc *sc, struct mps_command *cm) { mps_dprint(sc, MPS_TRACE, "%s\n", __func__); if (cm->cm_reply) mps_print_event(sc, (MPI2_EVENT_NOTIFICATION_REPLY *)cm->cm_reply); mps_free_command(sc, cm); /* next, send a port enable */ mpssas_startup(sc); } /* * For both register_events and update_events, the caller supplies a bitmap * of events that it _wants_. These functions then turn that into a bitmask * suitable for the controller. 
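 *
 * Put differently, the mask handed to the firmware is "set bit =
 * suppress event": the driver starts from all ones and clears the bits
 * that any registered handler asked for. A small illustration (the
 * event numbers are made up):
 *
 *     setbit(wanted, 14); setbit(wanted, 20);     caller's request
 *     event_mask[i] = -1;                         everything masked
 *     event_mask[i] &= ~wanted[i];                bits 14 and 20 unmasked
 *     evtreq->EventMasks[i] = htole32(event_mask[i]);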
*/ int mps_register_events(struct mps_softc *sc, u32 *mask, mps_evt_callback_t *cb, void *data, struct mps_event_handle **handle) { struct mps_event_handle *eh; int error = 0; eh = malloc(sizeof(struct mps_event_handle), M_MPT2, M_WAITOK|M_ZERO); if(!eh) { device_printf(sc->mps_dev, "Cannot allocate memory %s %d\n", __func__, __LINE__); return (ENOMEM); } eh->callback = cb; eh->data = data; TAILQ_INSERT_TAIL(&sc->event_list, eh, eh_list); if (mask != NULL) error = mps_update_events(sc, eh, mask); *handle = eh; return (error); } int mps_update_events(struct mps_softc *sc, struct mps_event_handle *handle, u32 *mask) { MPI2_EVENT_NOTIFICATION_REQUEST *evtreq; MPI2_EVENT_NOTIFICATION_REPLY *reply; struct mps_command *cm; int error, i; mps_dprint(sc, MPS_TRACE, "%s\n", __func__); if ((mask != NULL) && (handle != NULL)) bcopy(mask, &handle->mask[0], sizeof(u32) * MPI2_EVENT_NOTIFY_EVENTMASK_WORDS); for (i = 0; i < MPI2_EVENT_NOTIFY_EVENTMASK_WORDS; i++) sc->event_mask[i] = -1; for (i = 0; i < MPI2_EVENT_NOTIFY_EVENTMASK_WORDS; i++) sc->event_mask[i] &= ~handle->mask[i]; if ((cm = mps_alloc_command(sc)) == NULL) return (EBUSY); evtreq = (MPI2_EVENT_NOTIFICATION_REQUEST *)cm->cm_req; evtreq->Function = MPI2_FUNCTION_EVENT_NOTIFICATION; evtreq->MsgFlags = 0; evtreq->SASBroadcastPrimitiveMasks = 0; #ifdef MPS_DEBUG_ALL_EVENTS { u_char fullmask[16]; memset(fullmask, 0x00, 16); bcopy(fullmask, &evtreq->EventMasks[0], sizeof(u32) * MPI2_EVENT_NOTIFY_EVENTMASK_WORDS); } #else for (i = 0; i < MPI2_EVENT_NOTIFY_EVENTMASK_WORDS; i++) evtreq->EventMasks[i] = htole32(sc->event_mask[i]); #endif cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE; cm->cm_data = NULL; error = mps_request_polled(sc, cm); reply = (MPI2_EVENT_NOTIFICATION_REPLY *)cm->cm_reply; if ((reply == NULL) || (reply->IOCStatus & MPI2_IOCSTATUS_MASK) != MPI2_IOCSTATUS_SUCCESS) error = ENXIO; mps_print_event(sc, reply); mps_dprint(sc, MPS_TRACE, "%s finished error %d\n", __func__, error); mps_free_command(sc, cm); return (error); } static int mps_reregister_events(struct mps_softc *sc) { MPI2_EVENT_NOTIFICATION_REQUEST *evtreq; struct mps_command *cm; struct mps_event_handle *eh; int error, i; mps_dprint(sc, MPS_TRACE, "%s\n", __func__); /* first, reregister events */ for (i = 0; i < MPI2_EVENT_NOTIFY_EVENTMASK_WORDS; i++) sc->event_mask[i] = -1; TAILQ_FOREACH(eh, &sc->event_list, eh_list) { for (i = 0; i < MPI2_EVENT_NOTIFY_EVENTMASK_WORDS; i++) sc->event_mask[i] &= ~eh->mask[i]; } if ((cm = mps_alloc_command(sc)) == NULL) return (EBUSY); evtreq = (MPI2_EVENT_NOTIFICATION_REQUEST *)cm->cm_req; evtreq->Function = MPI2_FUNCTION_EVENT_NOTIFICATION; evtreq->MsgFlags = 0; evtreq->SASBroadcastPrimitiveMasks = 0; #ifdef MPS_DEBUG_ALL_EVENTS { u_char fullmask[16]; memset(fullmask, 0x00, 16); bcopy(fullmask, &evtreq->EventMasks[0], sizeof(u32) * MPI2_EVENT_NOTIFY_EVENTMASK_WORDS); } #else for (i = 0; i < MPI2_EVENT_NOTIFY_EVENTMASK_WORDS; i++) evtreq->EventMasks[i] = htole32(sc->event_mask[i]); #endif cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE; cm->cm_data = NULL; cm->cm_complete = mps_reregister_events_complete; error = mps_map_command(sc, cm); mps_dprint(sc, MPS_TRACE, "%s finished with error %d\n", __func__, error); return (error); } void mps_deregister_events(struct mps_softc *sc, struct mps_event_handle *handle) { TAILQ_REMOVE(&sc->event_list, handle, eh_list); free(handle, M_MPT2); } /* * Add a chain element as the next SGE for the specified command. 
* Reset cm_sge and cm_sgesize to indicate all the available space. */ static int mps_add_chain(struct mps_command *cm) { MPI2_SGE_CHAIN32 *sgc; struct mps_chain *chain; int space; if (cm->cm_sglsize < MPS_SGC_SIZE) panic("MPS: Need SGE Error Code\n"); chain = mps_alloc_chain(cm->cm_sc); if (chain == NULL) return (ENOBUFS); space = (int)cm->cm_sc->facts->IOCRequestFrameSize * 4; /* * Note: a double-linked list is used to make it easier to * walk for debugging. */ TAILQ_INSERT_TAIL(&cm->cm_chain_list, chain, chain_link); sgc = (MPI2_SGE_CHAIN32 *)&cm->cm_sge->MpiChain; sgc->Length = htole16(space); sgc->NextChainOffset = 0; /* TODO Looks like bug in Setting sgc->Flags. * sgc->Flags = ( MPI2_SGE_FLAGS_CHAIN_ELEMENT | MPI2_SGE_FLAGS_64_BIT_ADDRESSING | * MPI2_SGE_FLAGS_SYSTEM_ADDRESS) << MPI2_SGE_FLAGS_SHIFT * This is fine.. because we are not using simple element. In case of * MPI2_SGE_CHAIN32, we have seperate Length and Flags feild. */ sgc->Flags = MPI2_SGE_FLAGS_CHAIN_ELEMENT; sgc->Address = htole32(chain->chain_busaddr); cm->cm_sge = (MPI2_SGE_IO_UNION *)&chain->chain->MpiSimple; cm->cm_sglsize = space; return (0); } /* * Add one scatter-gather element (chain, simple, transaction context) * to the scatter-gather list for a command. Maintain cm_sglsize and * cm_sge as the remaining size and pointer to the next SGE to fill * in, respectively. */ int mps_push_sge(struct mps_command *cm, void *sgep, size_t len, int segsleft) { MPI2_SGE_TRANSACTION_UNION *tc = sgep; MPI2_SGE_SIMPLE64 *sge = sgep; int error, type; uint32_t saved_buf_len, saved_address_low, saved_address_high; type = (tc->Flags & MPI2_SGE_FLAGS_ELEMENT_MASK); #ifdef INVARIANTS switch (type) { case MPI2_SGE_FLAGS_TRANSACTION_ELEMENT: { if (len != tc->DetailsLength + 4) panic("TC %p length %u or %zu?", tc, tc->DetailsLength + 4, len); } break; case MPI2_SGE_FLAGS_CHAIN_ELEMENT: /* Driver only uses 32-bit chain elements */ if (len != MPS_SGC_SIZE) panic("CHAIN %p length %u or %zu?", sgep, MPS_SGC_SIZE, len); break; case MPI2_SGE_FLAGS_SIMPLE_ELEMENT: /* Driver only uses 64-bit SGE simple elements */ if (len != MPS_SGE64_SIZE) panic("SGE simple %p length %u or %zu?", sge, MPS_SGE64_SIZE, len); if (((le32toh(sge->FlagsLength) >> MPI2_SGE_FLAGS_SHIFT) & MPI2_SGE_FLAGS_ADDRESS_SIZE) == 0) panic("SGE simple %p not marked 64-bit?", sge); break; default: panic("Unexpected SGE %p, flags %02x", tc, tc->Flags); } #endif /* * case 1: 1 more segment, enough room for it * case 2: 2 more segments, enough room for both * case 3: >=2 more segments, only enough room for 1 and a chain * case 4: >=1 more segment, enough room for only a chain * case 5: >=1 more segment, no room for anything (error) */ /* * There should be room for at least a chain element, or this * code is buggy. Case (5). */ if (cm->cm_sglsize < MPS_SGC_SIZE) panic("MPS: Need SGE Error Code\n"); if (segsleft >= 2 && cm->cm_sglsize < len + MPS_SGC_SIZE + MPS_SGE64_SIZE) { /* * There are 2 or more segments left to add, and only * enough room for 1 and a chain. Case (3). * * Mark as last element in this chain if necessary. */ if (type == MPI2_SGE_FLAGS_SIMPLE_ELEMENT) { sge->FlagsLength |= htole32( MPI2_SGE_FLAGS_LAST_ELEMENT << MPI2_SGE_FLAGS_SHIFT); } /* * Add the item then a chain. Do the chain now, * rather than on the next iteration, to simplify * understanding the code. 
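 *
 * A worked example of this case, using the driver's usual element sizes
 * (12-byte 64-bit simple SGEs and 8-byte 32-bit chain SGEs; both sizes
 * are stated here only for illustration): with cm_sglsize = 24 and
 * segsleft = 3, fitting this SGE plus another SGE plus a chain would
 * need 12 + 12 + 8 = 32 bytes, which does not fit, so we take case (3):
 * flag this SGE as LAST_ELEMENT, copy it in, and immediately append a
 * chain element pointing at a freshly allocated chain frame.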
*/ cm->cm_sglsize -= len; bcopy(sgep, cm->cm_sge, len); cm->cm_sge = (MPI2_SGE_IO_UNION *)((uintptr_t)cm->cm_sge + len); return (mps_add_chain(cm)); } if (segsleft >= 1 && cm->cm_sglsize < len + MPS_SGC_SIZE) { /* * 1 or more segment, enough room for only a chain. * Hope the previous element wasn't a Simple entry * that needed to be marked with * MPI2_SGE_FLAGS_LAST_ELEMENT. Case (4). */ if ((error = mps_add_chain(cm)) != 0) return (error); } #ifdef INVARIANTS /* Case 1: 1 more segment, enough room for it. */ if (segsleft == 1 && cm->cm_sglsize < len) panic("1 seg left and no room? %u versus %zu", cm->cm_sglsize, len); /* Case 2: 2 more segments, enough room for both */ if (segsleft == 2 && cm->cm_sglsize < len + MPS_SGE64_SIZE) panic("2 segs left and no room? %u versus %zu", cm->cm_sglsize, len); #endif if (segsleft == 1 && type == MPI2_SGE_FLAGS_SIMPLE_ELEMENT) { /* * If this is a bi-directional request, need to account for that * here. Save the pre-filled sge values. These will be used * either for the 2nd SGL or for a single direction SGL. If * cm_out_len is non-zero, this is a bi-directional request, so * fill in the OUT SGL first, then the IN SGL, otherwise just * fill in the IN SGL. Note that at this time, when filling in * 2 SGL's for a bi-directional request, they both use the same * DMA buffer (same cm command). */ saved_buf_len = le32toh(sge->FlagsLength) & 0x00FFFFFF; saved_address_low = sge->Address.Low; saved_address_high = sge->Address.High; if (cm->cm_out_len) { sge->FlagsLength = htole32(cm->cm_out_len | ((uint32_t)(MPI2_SGE_FLAGS_SIMPLE_ELEMENT | MPI2_SGE_FLAGS_END_OF_BUFFER | MPI2_SGE_FLAGS_HOST_TO_IOC | MPI2_SGE_FLAGS_64_BIT_ADDRESSING) << MPI2_SGE_FLAGS_SHIFT)); cm->cm_sglsize -= len; bcopy(sgep, cm->cm_sge, len); cm->cm_sge = (MPI2_SGE_IO_UNION *)((uintptr_t)cm->cm_sge + len); } saved_buf_len |= ((uint32_t)(MPI2_SGE_FLAGS_SIMPLE_ELEMENT | MPI2_SGE_FLAGS_END_OF_BUFFER | MPI2_SGE_FLAGS_LAST_ELEMENT | MPI2_SGE_FLAGS_END_OF_LIST | MPI2_SGE_FLAGS_64_BIT_ADDRESSING) << MPI2_SGE_FLAGS_SHIFT); if (cm->cm_flags & MPS_CM_FLAGS_DATAIN) { saved_buf_len |= ((uint32_t)(MPI2_SGE_FLAGS_IOC_TO_HOST) << MPI2_SGE_FLAGS_SHIFT); } else { saved_buf_len |= ((uint32_t)(MPI2_SGE_FLAGS_HOST_TO_IOC) << MPI2_SGE_FLAGS_SHIFT); } sge->FlagsLength = htole32(saved_buf_len); sge->Address.Low = saved_address_low; sge->Address.High = saved_address_high; } cm->cm_sglsize -= len; bcopy(sgep, cm->cm_sge, len); cm->cm_sge = (MPI2_SGE_IO_UNION *)((uintptr_t)cm->cm_sge + len); return (0); } /* * Add one dma segment to the scatter-gather list for a command. */ int mps_add_dmaseg(struct mps_command *cm, vm_paddr_t pa, size_t len, u_int flags, int segsleft) { MPI2_SGE_SIMPLE64 sge; /* * This driver always uses 64-bit address elements for simplicity. */ bzero(&sge, sizeof(sge)); flags |= MPI2_SGE_FLAGS_SIMPLE_ELEMENT | MPI2_SGE_FLAGS_64_BIT_ADDRESSING; sge.FlagsLength = htole32(len | (flags << MPI2_SGE_FLAGS_SHIFT)); mps_from_u64(pa, &sge.Address); return (mps_push_sge(cm, &sge, sizeof sge, segsleft)); } static void mps_data_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error) { struct mps_softc *sc; struct mps_command *cm; u_int i, dir, sflags; cm = (struct mps_command *)arg; sc = cm->cm_sc; /* * In this case, just print out a warning and let the chip tell the * user they did the wrong thing. 
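 * (A hypothetical illustration: if a command could only accept two
 * segments, cm_max_segs == 2, and busdma handed back three, the warning
 * fires but the loop below still builds all three SGEs and the firmware,
 * not the driver, gets to reject the request.)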
*/ if ((cm->cm_max_segs != 0) && (nsegs > cm->cm_max_segs)) { mps_dprint(sc, MPS_ERROR, "%s: warning: busdma returned %d segments, " "more than the %d allowed\n", __func__, nsegs, cm->cm_max_segs); } /* * Set up DMA direction flags. Bi-directional requests are also handled * here. In that case, both direction flags will be set. */ sflags = 0; if (cm->cm_flags & MPS_CM_FLAGS_SMP_PASS) { /* * We have to add a special case for SMP passthrough, there * is no easy way to generically handle it. The first * S/G element is used for the command (therefore the * direction bit needs to be set). The second one is used * for the reply. We'll leave it to the caller to make * sure we only have two buffers. */ /* * Even though the busdma man page says it doesn't make * sense to have both direction flags, it does in this case. * We have one s/g element being accessed in each direction. */ dir = BUS_DMASYNC_PREWRITE | BUS_DMASYNC_PREREAD; /* * Set the direction flag on the first buffer in the SMP * passthrough request. We'll clear it for the second one. */ sflags |= MPI2_SGE_FLAGS_DIRECTION | MPI2_SGE_FLAGS_END_OF_BUFFER; } else if (cm->cm_flags & MPS_CM_FLAGS_DATAOUT) { sflags |= MPI2_SGE_FLAGS_HOST_TO_IOC; dir = BUS_DMASYNC_PREWRITE; } else dir = BUS_DMASYNC_PREREAD; for (i = 0; i < nsegs; i++) { if ((cm->cm_flags & MPS_CM_FLAGS_SMP_PASS) && (i != 0)) { sflags &= ~MPI2_SGE_FLAGS_DIRECTION; } error = mps_add_dmaseg(cm, segs[i].ds_addr, segs[i].ds_len, sflags, nsegs - i); if (error != 0) { /* Resource shortage, roll back! */ if (ratecheck(&sc->lastfail, &mps_chainfail_interval)) mps_dprint(sc, MPS_INFO, "Out of chain frames, " "consider increasing hw.mps.max_chains.\n"); cm->cm_flags |= MPS_CM_FLAGS_CHAIN_FAILED; mps_complete_command(sc, cm); return; } } bus_dmamap_sync(sc->buffer_dmat, cm->cm_dmamap, dir); mps_enqueue_request(sc, cm); return; } static void mps_data_cb2(void *arg, bus_dma_segment_t *segs, int nsegs, bus_size_t mapsize, int error) { mps_data_cb(arg, segs, nsegs, error); } /* * This is the routine to enqueue commands ansynchronously. * Note that the only error path here is from bus_dmamap_load(), which can * return EINPROGRESS if it is waiting for resources. Other than this, it's * assumed that if you have a command in-hand, then you have enough credits * to use it. */ int mps_map_command(struct mps_softc *sc, struct mps_command *cm) { int error = 0; if (cm->cm_flags & MPS_CM_FLAGS_USE_UIO) { error = bus_dmamap_load_uio(sc->buffer_dmat, cm->cm_dmamap, &cm->cm_uio, mps_data_cb2, cm, 0); } else if (cm->cm_flags & MPS_CM_FLAGS_USE_CCB) { error = bus_dmamap_load_ccb(sc->buffer_dmat, cm->cm_dmamap, cm->cm_data, mps_data_cb, cm, 0); } else if ((cm->cm_data != NULL) && (cm->cm_length != 0)) { error = bus_dmamap_load(sc->buffer_dmat, cm->cm_dmamap, cm->cm_data, cm->cm_length, mps_data_cb, cm, 0); } else { /* Add a zero-length element as needed */ if (cm->cm_sge != NULL) mps_add_dmaseg(cm, 0, 0, 0, 1); mps_enqueue_request(sc, cm); } return (error); } /* * This is the routine to enqueue commands synchronously. An error of * EINPROGRESS from mps_map_command() is ignored since the command will * be executed and enqueued automatically. Other errors come from msleep(). 
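 *
 * Sketch of the synchronous pattern used by callers in this file
 * (locking and error handling trimmed, so treat it as illustrative only;
 * MPI2_XXX_REQUEST stands in for a real request type):
 *
 *     cm = mps_alloc_command(sc);
 *     req = (MPI2_XXX_REQUEST *)cm->cm_req;       fill in the request
 *     cm->cm_desc.Default.RequestFlags =
 *         MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE;
 *     error = mps_wait_command(sc, cm, 60, CAN_SLEEP);   60s timeout
 *     rpl = (MPI2_DEFAULT_REPLY *)cm->cm_reply;   NULL if no reply
 *     mps_free_command(sc, cm);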
*/ int mps_wait_command(struct mps_softc *sc, struct mps_command *cm, int timeout, int sleep_flag) { int error, rc; struct timeval cur_time, start_time; if (sc->mps_flags & MPS_FLAGS_DIAGRESET) return EBUSY; cm->cm_complete = NULL; cm->cm_flags |= (MPS_CM_FLAGS_WAKEUP + MPS_CM_FLAGS_POLLED); error = mps_map_command(sc, cm); if ((error != 0) && (error != EINPROGRESS)) return (error); // Check for context and wait for 50 mSec at a time until time has // expired or the command has finished. If msleep can't be used, need // to poll. if (curthread->td_no_sleeping != 0) sleep_flag = NO_SLEEP; getmicrotime(&start_time); if (mtx_owned(&sc->mps_mtx) && sleep_flag == CAN_SLEEP) { error = msleep(cm, &sc->mps_mtx, 0, "mpswait", timeout*hz); } else { while ((cm->cm_flags & MPS_CM_FLAGS_COMPLETE) == 0) { mps_intr_locked(sc); if (sleep_flag == CAN_SLEEP) pause("mpswait", hz/20); else DELAY(50000); getmicrotime(&cur_time); if ((cur_time.tv_sec - start_time.tv_sec) > timeout) { error = EWOULDBLOCK; break; } } } if (error == EWOULDBLOCK) { mps_dprint(sc, MPS_FAULT, "Calling Reinit from %s\n", __func__); rc = mps_reinit(sc); mps_dprint(sc, MPS_FAULT, "Reinit %s\n", (rc == 0) ? "success" : "failed"); error = ETIMEDOUT; } return (error); } /* * This is the routine to enqueue a command synchonously and poll for * completion. Its use should be rare. */ int mps_request_polled(struct mps_softc *sc, struct mps_command *cm) { int error, timeout = 0, rc; struct timeval cur_time, start_time; error = 0; cm->cm_flags |= MPS_CM_FLAGS_POLLED; cm->cm_complete = NULL; mps_map_command(sc, cm); getmicrotime(&start_time); while ((cm->cm_flags & MPS_CM_FLAGS_COMPLETE) == 0) { mps_intr_locked(sc); if (mtx_owned(&sc->mps_mtx)) msleep(&sc->msleep_fake_chan, &sc->mps_mtx, 0, "mpspoll", hz/20); else pause("mpsdiag", hz/20); /* * Check for real-time timeout and fail if more than 60 seconds. */ getmicrotime(&cur_time); timeout = cur_time.tv_sec - start_time.tv_sec; if (timeout > 60) { mps_dprint(sc, MPS_FAULT, "polling failed\n"); error = ETIMEDOUT; break; } } if (error) { mps_dprint(sc, MPS_FAULT, "Calling Reinit from %s\n", __func__); rc = mps_reinit(sc); mps_dprint(sc, MPS_FAULT, "Reinit %s\n", (rc == 0) ? "success" : "failed"); } return (error); } /* * The MPT driver had a verbose interface for config pages. In this driver, * reduce it to much simplier terms, similar to the Linux driver. 
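 *
 * A minimal sketch of the simplified interface, modelled on the callers
 * in mps_user.c (the page type chosen here is an example only):
 *
 *     struct mps_config_params params;
 *
 *     bzero(&params, sizeof(params));
 *     params.action = MPI2_CONFIG_ACTION_PAGE_HEADER;
 *     params.hdr.Struct.PageType = MPI2_CONFIG_PAGETYPE_MANUFACTURING;
 *     params.hdr.Struct.PageNumber = 0;
 *     params.buffer = NULL;           header-only request
 *     params.length = 0;
 *     params.callback = NULL;         NULL means run synchronously
 *     error = mps_read_config_page(sc, &params);
 *     then re-issue with action = PAGE_READ_CURRENT and a real buffer
 *     and length once the header reports the page size.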
*/ int mps_read_config_page(struct mps_softc *sc, struct mps_config_params *params) { MPI2_CONFIG_REQUEST *req; struct mps_command *cm; int error; if (sc->mps_flags & MPS_FLAGS_BUSY) { return (EBUSY); } cm = mps_alloc_command(sc); if (cm == NULL) { return (EBUSY); } req = (MPI2_CONFIG_REQUEST *)cm->cm_req; req->Function = MPI2_FUNCTION_CONFIG; req->Action = params->action; req->SGLFlags = 0; req->ChainOffset = 0; req->PageAddress = params->page_address; if (params->hdr.Struct.PageType == MPI2_CONFIG_PAGETYPE_EXTENDED) { MPI2_CONFIG_EXTENDED_PAGE_HEADER *hdr; hdr = ¶ms->hdr.Ext; req->ExtPageType = hdr->ExtPageType; req->ExtPageLength = hdr->ExtPageLength; req->Header.PageType = MPI2_CONFIG_PAGETYPE_EXTENDED; req->Header.PageLength = 0; /* Must be set to zero */ req->Header.PageNumber = hdr->PageNumber; req->Header.PageVersion = hdr->PageVersion; } else { MPI2_CONFIG_PAGE_HEADER *hdr; hdr = ¶ms->hdr.Struct; req->Header.PageType = hdr->PageType; req->Header.PageNumber = hdr->PageNumber; req->Header.PageLength = hdr->PageLength; req->Header.PageVersion = hdr->PageVersion; } cm->cm_data = params->buffer; cm->cm_length = params->length; - cm->cm_sge = &req->PageBufferSGE; - cm->cm_sglsize = sizeof(MPI2_SGE_IO_UNION); - cm->cm_flags = MPS_CM_FLAGS_SGE_SIMPLE | MPS_CM_FLAGS_DATAIN; + if (cm->cm_data != NULL) { + cm->cm_sge = &req->PageBufferSGE; + cm->cm_sglsize = sizeof(MPI2_SGE_IO_UNION); + cm->cm_flags = MPS_CM_FLAGS_SGE_SIMPLE | MPS_CM_FLAGS_DATAIN; + } else + cm->cm_sge = NULL; cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE; cm->cm_complete_data = params; if (params->callback != NULL) { cm->cm_complete = mps_config_complete; return (mps_map_command(sc, cm)); } else { error = mps_wait_command(sc, cm, 0, CAN_SLEEP); if (error) { mps_dprint(sc, MPS_FAULT, "Error %d reading config page\n", error); mps_free_command(sc, cm); return (error); } mps_config_complete(sc, cm); } return (0); } int mps_write_config_page(struct mps_softc *sc, struct mps_config_params *params) { return (EINVAL); } static void mps_config_complete(struct mps_softc *sc, struct mps_command *cm) { MPI2_CONFIG_REPLY *reply; struct mps_config_params *params; MPS_FUNCTRACE(sc); params = cm->cm_complete_data; if (cm->cm_data != NULL) { bus_dmamap_sync(sc->buffer_dmat, cm->cm_dmamap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(sc->buffer_dmat, cm->cm_dmamap); } /* * XXX KDM need to do more error recovery? This results in the * device in question not getting probed. 
*/ if ((cm->cm_flags & MPS_CM_FLAGS_ERROR_MASK) != 0) { params->status = MPI2_IOCSTATUS_BUSY; goto done; } reply = (MPI2_CONFIG_REPLY *)cm->cm_reply; if (reply == NULL) { params->status = MPI2_IOCSTATUS_BUSY; goto done; } params->status = reply->IOCStatus; - if (params->hdr.Ext.ExtPageType != 0) { + if (params->hdr.Struct.PageType == MPI2_CONFIG_PAGETYPE_EXTENDED) { params->hdr.Ext.ExtPageType = reply->ExtPageType; params->hdr.Ext.ExtPageLength = reply->ExtPageLength; + params->hdr.Ext.PageType = reply->Header.PageType; + params->hdr.Ext.PageNumber = reply->Header.PageNumber; + params->hdr.Ext.PageVersion = reply->Header.PageVersion; } else { params->hdr.Struct.PageType = reply->Header.PageType; params->hdr.Struct.PageNumber = reply->Header.PageNumber; params->hdr.Struct.PageLength = reply->Header.PageLength; params->hdr.Struct.PageVersion = reply->Header.PageVersion; } done: mps_free_command(sc, cm); if (params->callback != NULL) params->callback(sc, params); return; } Index: projects/building-blocks/sys/dev/mps/mps_user.c =================================================================== --- projects/building-blocks/sys/dev/mps/mps_user.c (revision 277723) +++ projects/building-blocks/sys/dev/mps/mps_user.c (revision 277724) @@ -1,2423 +1,2427 @@ /*- * Copyright (c) 2008 Yahoo!, Inc. * All rights reserved. * Written by: John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * LSI MPT-Fusion Host Adapter FreeBSD userland interface */ /*- * Copyright (c) 2011, 2012 LSI Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * LSI MPT-Fusion Host Adapter FreeBSD * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" /* TODO Move headers to mpsvar */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static d_open_t mps_open; static d_close_t mps_close; static d_ioctl_t mps_ioctl_devsw; static struct cdevsw mps_cdevsw = { .d_version = D_VERSION, .d_flags = 0, .d_open = mps_open, .d_close = mps_close, .d_ioctl = mps_ioctl_devsw, .d_name = "mps", }; typedef int (mps_user_f)(struct mps_command *, struct mps_usr_command *); static mps_user_f mpi_pre_ioc_facts; static mps_user_f mpi_pre_port_facts; static mps_user_f mpi_pre_fw_download; static mps_user_f mpi_pre_fw_upload; static mps_user_f mpi_pre_sata_passthrough; static mps_user_f mpi_pre_smp_passthrough; static mps_user_f mpi_pre_config; static mps_user_f mpi_pre_sas_io_unit_control; static int mps_user_read_cfg_header(struct mps_softc *, struct mps_cfg_page_req *); static int mps_user_read_cfg_page(struct mps_softc *, struct mps_cfg_page_req *, void *); static int mps_user_read_extcfg_header(struct mps_softc *, struct mps_ext_cfg_page_req *); static int mps_user_read_extcfg_page(struct mps_softc *, struct mps_ext_cfg_page_req *, void *); static int mps_user_write_cfg_page(struct mps_softc *, struct mps_cfg_page_req *, void *); static int mps_user_setup_request(struct mps_command *, struct mps_usr_command *); static int mps_user_command(struct mps_softc *, struct mps_usr_command *); static int mps_user_pass_thru(struct mps_softc *sc, mps_pass_thru_t *data); static void mps_user_get_adapter_data(struct mps_softc *sc, mps_adapter_data_t *data); static void mps_user_read_pci_info(struct mps_softc *sc, mps_pci_info_t *data); static uint8_t mps_get_fw_diag_buffer_number(struct mps_softc *sc, uint32_t unique_id); static int mps_post_fw_diag_buffer(struct mps_softc *sc, mps_fw_diagnostic_buffer_t *pBuffer, uint32_t *return_code); static int mps_release_fw_diag_buffer(struct mps_softc *sc, mps_fw_diagnostic_buffer_t *pBuffer, uint32_t *return_code, uint32_t diag_type); static int mps_diag_register(struct mps_softc *sc, mps_fw_diag_register_t *diag_register, uint32_t *return_code); static int mps_diag_unregister(struct mps_softc *sc, mps_fw_diag_unregister_t *diag_unregister, uint32_t *return_code); static int mps_diag_query(struct mps_softc *sc, mps_fw_diag_query_t *diag_query, uint32_t *return_code); static int mps_diag_read_buffer(struct mps_softc *sc, mps_diag_read_buffer_t *diag_read_buffer, uint8_t *ioctl_buf, uint32_t *return_code); static 
int mps_diag_release(struct mps_softc *sc, mps_fw_diag_release_t *diag_release, uint32_t *return_code); static int mps_do_diag_action(struct mps_softc *sc, uint32_t action, uint8_t *diag_action, uint32_t length, uint32_t *return_code); static int mps_user_diag_action(struct mps_softc *sc, mps_diag_action_t *data); static void mps_user_event_query(struct mps_softc *sc, mps_event_query_t *data); static void mps_user_event_enable(struct mps_softc *sc, mps_event_enable_t *data); static int mps_user_event_report(struct mps_softc *sc, mps_event_report_t *data); static int mps_user_reg_access(struct mps_softc *sc, mps_reg_access_t *data); static int mps_user_btdh(struct mps_softc *sc, mps_btdh_mapping_t *data); static MALLOC_DEFINE(M_MPSUSER, "mps_user", "Buffers for mps(4) ioctls"); /* Macros from compat/freebsd32/freebsd32.h */ #define PTRIN(v) (void *)(uintptr_t)(v) #define PTROUT(v) (uint32_t)(uintptr_t)(v) #define CP(src,dst,fld) do { (dst).fld = (src).fld; } while (0) #define PTRIN_CP(src,dst,fld) \ do { (dst).fld = PTRIN((src).fld); } while (0) #define PTROUT_CP(src,dst,fld) \ do { (dst).fld = PTROUT((src).fld); } while (0) int mps_attach_user(struct mps_softc *sc) { int unit; unit = device_get_unit(sc->mps_dev); sc->mps_cdev = make_dev(&mps_cdevsw, unit, UID_ROOT, GID_OPERATOR, 0640, "mps%d", unit); if (sc->mps_cdev == NULL) { return (ENOMEM); } sc->mps_cdev->si_drv1 = sc; return (0); } void mps_detach_user(struct mps_softc *sc) { /* XXX: do a purge of pending requests? */ if (sc->mps_cdev != NULL) destroy_dev(sc->mps_cdev); } static int mps_open(struct cdev *dev, int flags, int fmt, struct thread *td) { return (0); } static int mps_close(struct cdev *dev, int flags, int fmt, struct thread *td) { return (0); } static int mps_user_read_cfg_header(struct mps_softc *sc, struct mps_cfg_page_req *page_req) { MPI2_CONFIG_PAGE_HEADER *hdr; struct mps_config_params params; int error; hdr = ¶ms.hdr.Struct; params.action = MPI2_CONFIG_ACTION_PAGE_HEADER; params.page_address = le32toh(page_req->page_address); hdr->PageVersion = 0; hdr->PageLength = 0; hdr->PageNumber = page_req->header.PageNumber; hdr->PageType = page_req->header.PageType; params.buffer = NULL; params.length = 0; params.callback = NULL; if ((error = mps_read_config_page(sc, ¶ms)) != 0) { /* * Leave the request. Without resetting the chip, it's * still owned by it and we'll just get into trouble * freeing it now. Mark it as abandoned so that if it * shows up later it can be freed. 
*/ mps_printf(sc, "read_cfg_header timed out\n"); return (ETIMEDOUT); } page_req->ioc_status = htole16(params.status); if ((page_req->ioc_status & MPI2_IOCSTATUS_MASK) == MPI2_IOCSTATUS_SUCCESS) { bcopy(hdr, &page_req->header, sizeof(page_req->header)); } return (0); } static int mps_user_read_cfg_page(struct mps_softc *sc, struct mps_cfg_page_req *page_req, void *buf) { MPI2_CONFIG_PAGE_HEADER *reqhdr, *hdr; struct mps_config_params params; int error; reqhdr = buf; hdr = ¶ms.hdr.Struct; hdr->PageVersion = reqhdr->PageVersion; hdr->PageLength = reqhdr->PageLength; hdr->PageNumber = reqhdr->PageNumber; hdr->PageType = reqhdr->PageType & MPI2_CONFIG_PAGETYPE_MASK; params.action = MPI2_CONFIG_ACTION_PAGE_READ_CURRENT; params.page_address = le32toh(page_req->page_address); params.buffer = buf; params.length = le32toh(page_req->len); params.callback = NULL; if ((error = mps_read_config_page(sc, ¶ms)) != 0) { mps_printf(sc, "mps_user_read_cfg_page timed out\n"); return (ETIMEDOUT); } page_req->ioc_status = htole16(params.status); return (0); } static int mps_user_read_extcfg_header(struct mps_softc *sc, struct mps_ext_cfg_page_req *ext_page_req) { MPI2_CONFIG_EXTENDED_PAGE_HEADER *hdr; struct mps_config_params params; int error; hdr = ¶ms.hdr.Ext; params.action = MPI2_CONFIG_ACTION_PAGE_HEADER; hdr->PageVersion = ext_page_req->header.PageVersion; hdr->PageType = MPI2_CONFIG_PAGETYPE_EXTENDED; hdr->ExtPageLength = 0; hdr->PageNumber = ext_page_req->header.PageNumber; hdr->ExtPageType = ext_page_req->header.ExtPageType; params.page_address = le32toh(ext_page_req->page_address); + params.buffer = NULL; + params.length = 0; + params.callback = NULL; + if ((error = mps_read_config_page(sc, ¶ms)) != 0) { /* * Leave the request. Without resetting the chip, it's * still owned by it and we'll just get into trouble * freeing it now. Mark it as abandoned so that if it * shows up later it can be freed. 
*/ mps_printf(sc, "mps_user_read_extcfg_header timed out\n"); return (ETIMEDOUT); } ext_page_req->ioc_status = htole16(params.status); if ((ext_page_req->ioc_status & MPI2_IOCSTATUS_MASK) == MPI2_IOCSTATUS_SUCCESS) { ext_page_req->header.PageVersion = hdr->PageVersion; ext_page_req->header.PageNumber = hdr->PageNumber; ext_page_req->header.PageType = hdr->PageType; ext_page_req->header.ExtPageLength = hdr->ExtPageLength; ext_page_req->header.ExtPageType = hdr->ExtPageType; } return (0); } static int mps_user_read_extcfg_page(struct mps_softc *sc, struct mps_ext_cfg_page_req *ext_page_req, void *buf) { MPI2_CONFIG_EXTENDED_PAGE_HEADER *reqhdr, *hdr; struct mps_config_params params; int error; reqhdr = buf; hdr = ¶ms.hdr.Ext; params.action = MPI2_CONFIG_ACTION_PAGE_READ_CURRENT; params.page_address = le32toh(ext_page_req->page_address); hdr->PageVersion = reqhdr->PageVersion; hdr->PageType = MPI2_CONFIG_PAGETYPE_EXTENDED; hdr->PageNumber = reqhdr->PageNumber; hdr->ExtPageType = reqhdr->ExtPageType; hdr->ExtPageLength = reqhdr->ExtPageLength; params.buffer = buf; params.length = le32toh(ext_page_req->len); params.callback = NULL; if ((error = mps_read_config_page(sc, ¶ms)) != 0) { mps_printf(sc, "mps_user_read_extcfg_page timed out\n"); return (ETIMEDOUT); } ext_page_req->ioc_status = htole16(params.status); return (0); } static int mps_user_write_cfg_page(struct mps_softc *sc, struct mps_cfg_page_req *page_req, void *buf) { MPI2_CONFIG_PAGE_HEADER *reqhdr, *hdr; struct mps_config_params params; u_int hdr_attr; int error; reqhdr = buf; hdr = ¶ms.hdr.Struct; hdr_attr = reqhdr->PageType & MPI2_CONFIG_PAGEATTR_MASK; if (hdr_attr != MPI2_CONFIG_PAGEATTR_CHANGEABLE && hdr_attr != MPI2_CONFIG_PAGEATTR_PERSISTENT) { mps_printf(sc, "page type 0x%x not changeable\n", reqhdr->PageType & MPI2_CONFIG_PAGETYPE_MASK); return (EINVAL); } /* * There isn't any point in restoring stripped out attributes * if you then mask them going down to issue the request. */ hdr->PageVersion = reqhdr->PageVersion; hdr->PageLength = reqhdr->PageLength; hdr->PageNumber = reqhdr->PageNumber; hdr->PageType = reqhdr->PageType; params.action = MPI2_CONFIG_ACTION_PAGE_WRITE_CURRENT; params.page_address = le32toh(page_req->page_address); params.buffer = buf; params.length = le32toh(page_req->len); params.callback = NULL; if ((error = mps_write_config_page(sc, ¶ms)) != 0) { mps_printf(sc, "mps_write_cfg_page timed out\n"); return (ETIMEDOUT); } page_req->ioc_status = htole16(params.status); return (0); } void mpi_init_sge(struct mps_command *cm, void *req, void *sge) { int off, space; space = (int)cm->cm_sc->facts->IOCRequestFrameSize * 4; off = (uintptr_t)sge - (uintptr_t)req; KASSERT(off < space, ("bad pointers %p %p, off %d, space %d", req, sge, off, space)); cm->cm_sge = sge; cm->cm_sglsize = space - off; } /* * Prepare the mps_command for an IOC_FACTS request. */ static int mpi_pre_ioc_facts(struct mps_command *cm, struct mps_usr_command *cmd) { MPI2_IOC_FACTS_REQUEST *req = (void *)cm->cm_req; MPI2_IOC_FACTS_REPLY *rpl; if (cmd->req_len != sizeof *req) return (EINVAL); if (cmd->rpl_len != sizeof *rpl) return (EINVAL); cm->cm_sge = NULL; cm->cm_sglsize = 0; return (0); } /* * Prepare the mps_command for a PORT_FACTS request. 
*/ static int mpi_pre_port_facts(struct mps_command *cm, struct mps_usr_command *cmd) { MPI2_PORT_FACTS_REQUEST *req = (void *)cm->cm_req; MPI2_PORT_FACTS_REPLY *rpl; if (cmd->req_len != sizeof *req) return (EINVAL); if (cmd->rpl_len != sizeof *rpl) return (EINVAL); cm->cm_sge = NULL; cm->cm_sglsize = 0; return (0); } /* * Prepare the mps_command for a FW_DOWNLOAD request. */ static int mpi_pre_fw_download(struct mps_command *cm, struct mps_usr_command *cmd) { MPI2_FW_DOWNLOAD_REQUEST *req = (void *)cm->cm_req; MPI2_FW_DOWNLOAD_REPLY *rpl; MPI2_FW_DOWNLOAD_TCSGE tc; int error; /* * This code assumes there is room in the request's SGL for * the TransactionContext plus at least a SGL chain element. */ CTASSERT(sizeof req->SGL >= sizeof tc + MPS_SGC_SIZE); if (cmd->req_len != sizeof *req) return (EINVAL); if (cmd->rpl_len != sizeof *rpl) return (EINVAL); if (cmd->len == 0) return (EINVAL); error = copyin(cmd->buf, cm->cm_data, cmd->len); if (error != 0) return (error); mpi_init_sge(cm, req, &req->SGL); bzero(&tc, sizeof tc); /* * For now, the F/W image must be provided in a single request. */ if ((req->MsgFlags & MPI2_FW_DOWNLOAD_MSGFLGS_LAST_SEGMENT) == 0) return (EINVAL); if (req->TotalImageSize != cmd->len) return (EINVAL); /* * The value of the first two elements is specified in the * Fusion-MPT Message Passing Interface document. */ tc.ContextSize = 0; tc.DetailsLength = 12; tc.ImageOffset = 0; tc.ImageSize = cmd->len; cm->cm_flags |= MPS_CM_FLAGS_DATAOUT; return (mps_push_sge(cm, &tc, sizeof tc, 0)); } /* * Prepare the mps_command for a FW_UPLOAD request. */ static int mpi_pre_fw_upload(struct mps_command *cm, struct mps_usr_command *cmd) { MPI2_FW_UPLOAD_REQUEST *req = (void *)cm->cm_req; MPI2_FW_UPLOAD_REPLY *rpl; MPI2_FW_UPLOAD_TCSGE tc; /* * This code assumes there is room in the request's SGL for * the TransactionContext plus at least a SGL chain element. */ CTASSERT(sizeof req->SGL >= sizeof tc + MPS_SGC_SIZE); if (cmd->req_len != sizeof *req) return (EINVAL); if (cmd->rpl_len != sizeof *rpl) return (EINVAL); mpi_init_sge(cm, req, &req->SGL); bzero(&tc, sizeof tc); /* * The value of the first two elements is specified in the * Fusion-MPT Message Passing Interface document. */ tc.ContextSize = 0; tc.DetailsLength = 12; /* * XXX Is there any reason to fetch a partial image? I.e. to * set ImageOffset to something other than 0? */ tc.ImageOffset = 0; tc.ImageSize = cmd->len; cm->cm_flags |= MPS_CM_FLAGS_DATAIN; return (mps_push_sge(cm, &tc, sizeof tc, 0)); } /* * Prepare the mps_command for a SATA_PASSTHROUGH request. */ static int mpi_pre_sata_passthrough(struct mps_command *cm, struct mps_usr_command *cmd) { MPI2_SATA_PASSTHROUGH_REQUEST *req = (void *)cm->cm_req; MPI2_SATA_PASSTHROUGH_REPLY *rpl; if (cmd->req_len != sizeof *req) return (EINVAL); if (cmd->rpl_len != sizeof *rpl) return (EINVAL); mpi_init_sge(cm, req, &req->SGL); return (0); } /* * Prepare the mps_command for a SMP_PASSTHROUGH request. */ static int mpi_pre_smp_passthrough(struct mps_command *cm, struct mps_usr_command *cmd) { MPI2_SMP_PASSTHROUGH_REQUEST *req = (void *)cm->cm_req; MPI2_SMP_PASSTHROUGH_REPLY *rpl; if (cmd->req_len != sizeof *req) return (EINVAL); if (cmd->rpl_len != sizeof *rpl) return (EINVAL); mpi_init_sge(cm, req, &req->SGL); return (0); } /* * Prepare the mps_command for a CONFIG request. 
*/ static int mpi_pre_config(struct mps_command *cm, struct mps_usr_command *cmd) { MPI2_CONFIG_REQUEST *req = (void *)cm->cm_req; MPI2_CONFIG_REPLY *rpl; if (cmd->req_len != sizeof *req) return (EINVAL); if (cmd->rpl_len != sizeof *rpl) return (EINVAL); mpi_init_sge(cm, req, &req->PageBufferSGE); return (0); } /* * Prepare the mps_command for a SAS_IO_UNIT_CONTROL request. */ static int mpi_pre_sas_io_unit_control(struct mps_command *cm, struct mps_usr_command *cmd) { cm->cm_sge = NULL; cm->cm_sglsize = 0; return (0); } /* * A set of functions to prepare an mps_command for the various * supported requests. */ struct mps_user_func { U8 Function; mps_user_f *f_pre; } mps_user_func_list[] = { { MPI2_FUNCTION_IOC_FACTS, mpi_pre_ioc_facts }, { MPI2_FUNCTION_PORT_FACTS, mpi_pre_port_facts }, { MPI2_FUNCTION_FW_DOWNLOAD, mpi_pre_fw_download }, { MPI2_FUNCTION_FW_UPLOAD, mpi_pre_fw_upload }, { MPI2_FUNCTION_SATA_PASSTHROUGH, mpi_pre_sata_passthrough }, { MPI2_FUNCTION_SMP_PASSTHROUGH, mpi_pre_smp_passthrough}, { MPI2_FUNCTION_CONFIG, mpi_pre_config}, { MPI2_FUNCTION_SAS_IO_UNIT_CONTROL, mpi_pre_sas_io_unit_control }, { 0xFF, NULL } /* list end */ }; static int mps_user_setup_request(struct mps_command *cm, struct mps_usr_command *cmd) { MPI2_REQUEST_HEADER *hdr = (MPI2_REQUEST_HEADER *)cm->cm_req; struct mps_user_func *f; for (f = mps_user_func_list; f->f_pre != NULL; f++) { if (hdr->Function == f->Function) return (f->f_pre(cm, cmd)); } return (EINVAL); } static int mps_user_command(struct mps_softc *sc, struct mps_usr_command *cmd) { MPI2_REQUEST_HEADER *hdr; MPI2_DEFAULT_REPLY *rpl; void *buf = NULL; struct mps_command *cm = NULL; int err = 0; int sz; mps_lock(sc); cm = mps_alloc_command(sc); if (cm == NULL) { mps_printf(sc, "%s: no mps requests\n", __func__); err = ENOMEM; goto Ret; } mps_unlock(sc); hdr = (MPI2_REQUEST_HEADER *)cm->cm_req; mps_dprint(sc, MPS_USER, "%s: req %p %d rpl %p %d\n", __func__, cmd->req, cmd->req_len, cmd->rpl, cmd->rpl_len); if (cmd->req_len > (int)sc->facts->IOCRequestFrameSize * 4) { err = EINVAL; goto RetFreeUnlocked; } err = copyin(cmd->req, hdr, cmd->req_len); if (err != 0) goto RetFreeUnlocked; mps_dprint(sc, MPS_USER, "%s: Function %02X MsgFlags %02X\n", __func__, hdr->Function, hdr->MsgFlags); if (cmd->len > 0) { buf = malloc(cmd->len, M_MPSUSER, M_WAITOK|M_ZERO); if(!buf) { mps_printf(sc, "Cannot allocate memory %s %d\n", __func__, __LINE__); return (ENOMEM); } cm->cm_data = buf; cm->cm_length = cmd->len; } else { cm->cm_data = NULL; cm->cm_length = 0; } cm->cm_flags = MPS_CM_FLAGS_SGE_SIMPLE; cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE; err = mps_user_setup_request(cm, cmd); if (err == EINVAL) { mps_printf(sc, "%s: unsupported parameter or unsupported " "function in request (function = 0x%X)\n", __func__, hdr->Function); } if (err != 0) goto RetFreeUnlocked; mps_lock(sc); err = mps_wait_command(sc, cm, 60, CAN_SLEEP); if (err) { mps_printf(sc, "%s: invalid request: error %d\n", __func__, err); goto Ret; } rpl = (MPI2_DEFAULT_REPLY *)cm->cm_reply; if (rpl != NULL) sz = rpl->MsgLength * 4; else sz = 0; if (sz > cmd->rpl_len) { mps_printf(sc, "%s: user reply buffer (%d) smaller than " "returned buffer (%d)\n", __func__, cmd->rpl_len, sz); sz = cmd->rpl_len; } mps_unlock(sc); copyout(rpl, cmd->rpl, sz); if (buf != NULL) copyout(buf, cmd->buf, cmd->len); mps_dprint(sc, MPS_USER, "%s: reply size %d\n", __func__, sz); RetFreeUnlocked: mps_lock(sc); if (cm != NULL) mps_free_command(sc, cm); Ret: mps_unlock(sc); if (buf != NULL) 
free(buf, M_MPSUSER); return (err); } static int mps_user_pass_thru(struct mps_softc *sc, mps_pass_thru_t *data) { MPI2_REQUEST_HEADER *hdr, tmphdr; MPI2_DEFAULT_REPLY *rpl; struct mps_command *cm = NULL; int err = 0, dir = 0, sz; uint8_t function = 0; u_int sense_len; /* * Only allow one passthru command at a time. Use the MPS_FLAGS_BUSY * bit to denote that a passthru is being processed. */ mps_lock(sc); if (sc->mps_flags & MPS_FLAGS_BUSY) { mps_dprint(sc, MPS_USER, "%s: Only one passthru command " "allowed at a single time.", __func__); mps_unlock(sc); return (EBUSY); } sc->mps_flags |= MPS_FLAGS_BUSY; mps_unlock(sc); /* * Do some validation on data direction. Valid cases are: * 1) DataSize is 0 and direction is NONE * 2) DataSize is non-zero and one of: * a) direction is READ or * b) direction is WRITE or * c) direction is BOTH and DataOutSize is non-zero * If valid and the direction is BOTH, change the direction to READ. * if valid and the direction is not BOTH, make sure DataOutSize is 0. */ if (((data->DataSize == 0) && (data->DataDirection == MPS_PASS_THRU_DIRECTION_NONE)) || ((data->DataSize != 0) && ((data->DataDirection == MPS_PASS_THRU_DIRECTION_READ) || (data->DataDirection == MPS_PASS_THRU_DIRECTION_WRITE) || ((data->DataDirection == MPS_PASS_THRU_DIRECTION_BOTH) && (data->DataOutSize != 0))))) { if (data->DataDirection == MPS_PASS_THRU_DIRECTION_BOTH) data->DataDirection = MPS_PASS_THRU_DIRECTION_READ; else data->DataOutSize = 0; } else return (EINVAL); mps_dprint(sc, MPS_USER, "%s: req 0x%jx %d rpl 0x%jx %d " "data in 0x%jx %d data out 0x%jx %d data dir %d\n", __func__, data->PtrRequest, data->RequestSize, data->PtrReply, data->ReplySize, data->PtrData, data->DataSize, data->PtrDataOut, data->DataOutSize, data->DataDirection); /* * copy in the header so we know what we're dealing with before we * commit to allocating a command for it. */ err = copyin(PTRIN(data->PtrRequest), &tmphdr, data->RequestSize); if (err != 0) goto RetFreeUnlocked; if (data->RequestSize > (int)sc->facts->IOCRequestFrameSize * 4) { err = EINVAL; goto RetFreeUnlocked; } function = tmphdr.Function; mps_dprint(sc, MPS_USER, "%s: Function %02X MsgFlags %02X\n", __func__, function, tmphdr.MsgFlags); /* * Handle a passthru TM request. */ if (function == MPI2_FUNCTION_SCSI_TASK_MGMT) { MPI2_SCSI_TASK_MANAGE_REQUEST *task; mps_lock(sc); cm = mpssas_alloc_tm(sc); if (cm == NULL) { err = EINVAL; goto Ret; } /* Copy the header in. Only a small fixup is needed. */ task = (MPI2_SCSI_TASK_MANAGE_REQUEST *)cm->cm_req; bcopy(&tmphdr, task, data->RequestSize); task->TaskMID = cm->cm_desc.Default.SMID; cm->cm_data = NULL; cm->cm_desc.HighPriority.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_HIGH_PRIORITY; cm->cm_complete = NULL; cm->cm_complete_data = NULL; err = mps_wait_command(sc, cm, 30, CAN_SLEEP); if (err != 0) { err = EIO; mps_dprint(sc, MPS_FAULT, "%s: task management failed", __func__); } /* * Copy the reply data and sense data to user space. 
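 * Note: only data->ReplySize bytes are copied out here; when the
 * firmware reply is larger, a warning is logged and the excess is
 * dropped.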
*/ if (cm->cm_reply != NULL) { rpl = (MPI2_DEFAULT_REPLY *)cm->cm_reply; sz = rpl->MsgLength * 4; if (sz > data->ReplySize) { mps_printf(sc, "%s: user reply buffer (%d) " "smaller than returned buffer (%d)\n", __func__, data->ReplySize, sz); } mps_unlock(sc); copyout(cm->cm_reply, PTRIN(data->PtrReply), data->ReplySize); mps_lock(sc); } mpssas_free_tm(sc, cm); goto Ret; } mps_lock(sc); cm = mps_alloc_command(sc); if (cm == NULL) { mps_printf(sc, "%s: no mps requests\n", __func__); err = ENOMEM; goto Ret; } mps_unlock(sc); hdr = (MPI2_REQUEST_HEADER *)cm->cm_req; bcopy(&tmphdr, hdr, data->RequestSize); /* * Do some checking to make sure the IOCTL request contains a valid * request. Then set the SGL info. */ mpi_init_sge(cm, hdr, (void *)((uint8_t *)hdr + data->RequestSize)); /* * Set up for read, write or both. From check above, DataOutSize will * be 0 if direction is READ or WRITE, but it will have some non-zero * value if the direction is BOTH. So, just use the biggest size to get * the cm_data buffer size. If direction is BOTH, 2 SGLs need to be set * up; the first is for the request and the second will contain the * response data. cm_out_len needs to be set here and this will be used * when the SGLs are set up. */ cm->cm_data = NULL; cm->cm_length = MAX(data->DataSize, data->DataOutSize); cm->cm_out_len = data->DataOutSize; cm->cm_flags = 0; if (cm->cm_length != 0) { cm->cm_data = malloc(cm->cm_length, M_MPSUSER, M_WAITOK | M_ZERO); if (cm->cm_data == NULL) { mps_dprint(sc, MPS_FAULT, "%s: alloc failed for IOCTL " "passthru length %d\n", __func__, cm->cm_length); } else { cm->cm_flags = MPS_CM_FLAGS_DATAIN; if (data->DataOutSize) { cm->cm_flags |= MPS_CM_FLAGS_DATAOUT; err = copyin(PTRIN(data->PtrDataOut), cm->cm_data, data->DataOutSize); } else if (data->DataDirection == MPS_PASS_THRU_DIRECTION_WRITE) { cm->cm_flags = MPS_CM_FLAGS_DATAOUT; err = copyin(PTRIN(data->PtrData), cm->cm_data, data->DataSize); } if (err != 0) mps_dprint(sc, MPS_FAULT, "%s: failed to copy " "IOCTL data from user space\n", __func__); } } cm->cm_flags |= MPS_CM_FLAGS_SGE_SIMPLE; cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE; /* * Set up Sense buffer and SGL offset for IO passthru. SCSI IO request * uses SCSI IO descriptor. */ if ((function == MPI2_FUNCTION_SCSI_IO_REQUEST) || (function == MPI2_FUNCTION_RAID_SCSI_IO_PASSTHROUGH)) { MPI2_SCSI_IO_REQUEST *scsi_io_req; scsi_io_req = (MPI2_SCSI_IO_REQUEST *)hdr; /* * Put SGE for data and data_out buffer at the end of * scsi_io_request message header (64 bytes in total). * Following above SGEs, the residual space will be used by * sense data. */ scsi_io_req->SenseBufferLength = (uint8_t)(data->RequestSize - 64); scsi_io_req->SenseBufferLowAddress = htole32(cm->cm_sense_busaddr); /* * Set SGLOffset0 value. This is the number of dwords that SGL * is offset from the beginning of MPI2_SCSI_IO_REQUEST struct. */ scsi_io_req->SGLOffset0 = 24; /* * Setup descriptor info. RAID passthrough must use the * default request descriptor which is already set, so if this * is a SCSI IO request, change the descriptor to SCSI IO. * Also, if this is a SCSI IO request, handle the reply in the * mpssas_scsio_complete function. */ if (function == MPI2_FUNCTION_SCSI_IO_REQUEST) { cm->cm_desc.SCSIIO.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_SCSI_IO; cm->cm_desc.SCSIIO.DevHandle = scsi_io_req->DevHandle; /* * Make sure the DevHandle is not 0 because this is a * likely error. 
*/ if (scsi_io_req->DevHandle == 0) { err = EINVAL; goto RetFreeUnlocked; } } } mps_lock(sc); err = mps_wait_command(sc, cm, 30, CAN_SLEEP); if (err) { mps_printf(sc, "%s: invalid request: error %d\n", __func__, err); mps_unlock(sc); goto RetFreeUnlocked; } /* * Sync the DMA data, if any. Then copy the data to user space. */ if (cm->cm_data != NULL) { if (cm->cm_flags & MPS_CM_FLAGS_DATAIN) dir = BUS_DMASYNC_POSTREAD; else if (cm->cm_flags & MPS_CM_FLAGS_DATAOUT) dir = BUS_DMASYNC_POSTWRITE; bus_dmamap_sync(sc->buffer_dmat, cm->cm_dmamap, dir); bus_dmamap_unload(sc->buffer_dmat, cm->cm_dmamap); if (cm->cm_flags & MPS_CM_FLAGS_DATAIN) { mps_unlock(sc); err = copyout(cm->cm_data, PTRIN(data->PtrData), data->DataSize); mps_lock(sc); if (err != 0) mps_dprint(sc, MPS_FAULT, "%s: failed to copy " "IOCTL data to user space\n", __func__); } } /* * Copy the reply data and sense data to user space. */ if (cm->cm_reply != NULL) { rpl = (MPI2_DEFAULT_REPLY *)cm->cm_reply; sz = rpl->MsgLength * 4; if (sz > data->ReplySize) { mps_printf(sc, "%s: user reply buffer (%d) smaller " "than returned buffer (%d)\n", __func__, data->ReplySize, sz); } mps_unlock(sc); copyout(cm->cm_reply, PTRIN(data->PtrReply), data->ReplySize); mps_lock(sc); if ((function == MPI2_FUNCTION_SCSI_IO_REQUEST) || (function == MPI2_FUNCTION_RAID_SCSI_IO_PASSTHROUGH)) { if (((MPI2_SCSI_IO_REPLY *)rpl)->SCSIState & MPI2_SCSI_STATE_AUTOSENSE_VALID) { sense_len = MIN((le32toh(((MPI2_SCSI_IO_REPLY *)rpl)->SenseCount)), sizeof(struct scsi_sense_data)); mps_unlock(sc); copyout(cm->cm_sense, cm->cm_req + 64, sense_len); mps_lock(sc); } } } mps_unlock(sc); RetFreeUnlocked: mps_lock(sc); if (cm != NULL) { if (cm->cm_data) free(cm->cm_data, M_MPSUSER); mps_free_command(sc, cm); } Ret: sc->mps_flags &= ~MPS_FLAGS_BUSY; mps_unlock(sc); return (err); } static void mps_user_get_adapter_data(struct mps_softc *sc, mps_adapter_data_t *data) { Mpi2ConfigReply_t mpi_reply; Mpi2BiosPage3_t config_page; /* * Use the PCI interface functions to get the Bus, Device, and Function * information. */ data->PciInformation.u.bits.BusNumber = pci_get_bus(sc->mps_dev); data->PciInformation.u.bits.DeviceNumber = pci_get_slot(sc->mps_dev); data->PciInformation.u.bits.FunctionNumber = pci_get_function(sc->mps_dev); /* * Get the FW version that should already be saved in IOC Facts. */ data->MpiFirmwareVersion = sc->facts->FWVersion.Word; /* * General device info. */ data->AdapterType = MPSIOCTL_ADAPTER_TYPE_SAS2; if (sc->mps_flags & MPS_FLAGS_WD_AVAILABLE) data->AdapterType = MPSIOCTL_ADAPTER_TYPE_SAS2_SSS6200; data->PCIDeviceHwId = pci_get_device(sc->mps_dev); data->PCIDeviceHwRev = pci_read_config(sc->mps_dev, PCIR_REVID, 1); data->SubSystemId = pci_get_subdevice(sc->mps_dev); data->SubsystemVendorId = pci_get_subvendor(sc->mps_dev); /* * Get the driver version. */ strcpy((char *)&data->DriverVersion[0], MPS_DRIVER_VERSION); /* * Need to get BIOS Config Page 3 for the BIOS Version. */ data->BiosVersion = 0; mps_lock(sc); if (mps_config_get_bios_pg3(sc, &mpi_reply, &config_page)) printf("%s: Error while retrieving BIOS Version\n", __func__); else data->BiosVersion = config_page.BiosVersion; mps_unlock(sc); } static void mps_user_read_pci_info(struct mps_softc *sc, mps_pci_info_t *data) { int i; /* * Use the PCI interface functions to get the Bus, Device, and Function * information. 
*/ data->BusNumber = pci_get_bus(sc->mps_dev); data->DeviceNumber = pci_get_slot(sc->mps_dev); data->FunctionNumber = pci_get_function(sc->mps_dev); /* * Now get the interrupt vector and the pci header. The vector can * only be 0 right now. The header is the first 256 bytes of config * space. */ data->InterruptVector = 0; for (i = 0; i < sizeof (data->PciHeader); i++) { data->PciHeader[i] = pci_read_config(sc->mps_dev, i, 1); } } static uint8_t mps_get_fw_diag_buffer_number(struct mps_softc *sc, uint32_t unique_id) { uint8_t index; for (index = 0; index < MPI2_DIAG_BUF_TYPE_COUNT; index++) { if (sc->fw_diag_buffer_list[index].unique_id == unique_id) { return (index); } } return (MPS_FW_DIAGNOSTIC_UID_NOT_FOUND); } static int mps_post_fw_diag_buffer(struct mps_softc *sc, mps_fw_diagnostic_buffer_t *pBuffer, uint32_t *return_code) { MPI2_DIAG_BUFFER_POST_REQUEST *req; MPI2_DIAG_BUFFER_POST_REPLY *reply; struct mps_command *cm = NULL; int i, status; /* * If buffer is not enabled, just leave. */ *return_code = MPS_FW_DIAG_ERROR_POST_FAILED; if (!pBuffer->enabled) { return (MPS_DIAG_FAILURE); } /* * Clear some flags initially. */ pBuffer->force_release = FALSE; pBuffer->valid_data = FALSE; pBuffer->owned_by_firmware = FALSE; /* * Get a command. */ cm = mps_alloc_command(sc); if (cm == NULL) { mps_printf(sc, "%s: no mps requests\n", __func__); return (MPS_DIAG_FAILURE); } /* * Build the request for releasing the FW Diag Buffer and send it. */ req = (MPI2_DIAG_BUFFER_POST_REQUEST *)cm->cm_req; req->Function = MPI2_FUNCTION_DIAG_BUFFER_POST; req->BufferType = pBuffer->buffer_type; req->ExtendedType = pBuffer->extended_type; req->BufferLength = pBuffer->size; for (i = 0; i < (sizeof(req->ProductSpecific) / 4); i++) req->ProductSpecific[i] = pBuffer->product_specific[i]; mps_from_u64(sc->fw_diag_busaddr, &req->BufferAddress); cm->cm_data = NULL; cm->cm_length = 0; cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE; cm->cm_complete_data = NULL; /* * Send command synchronously. */ status = mps_wait_command(sc, cm, 30, CAN_SLEEP); if (status) { mps_printf(sc, "%s: invalid request: error %d\n", __func__, status); status = MPS_DIAG_FAILURE; goto done; } /* * Process POST reply. */ reply = (MPI2_DIAG_BUFFER_POST_REPLY *)cm->cm_reply; if (reply->IOCStatus != MPI2_IOCSTATUS_SUCCESS) { status = MPS_DIAG_FAILURE; mps_dprint(sc, MPS_FAULT, "%s: post of FW Diag Buffer failed " "with IOCStatus = 0x%x, IOCLogInfo = 0x%x and " "TransferLength = 0x%x\n", __func__, reply->IOCStatus, reply->IOCLogInfo, reply->TransferLength); goto done; } /* * Post was successful. */ pBuffer->valid_data = TRUE; pBuffer->owned_by_firmware = TRUE; *return_code = MPS_FW_DIAG_ERROR_SUCCESS; status = MPS_DIAG_SUCCESS; done: mps_free_command(sc, cm); return (status); } static int mps_release_fw_diag_buffer(struct mps_softc *sc, mps_fw_diagnostic_buffer_t *pBuffer, uint32_t *return_code, uint32_t diag_type) { MPI2_DIAG_RELEASE_REQUEST *req; MPI2_DIAG_RELEASE_REPLY *reply; struct mps_command *cm = NULL; int status; /* * If buffer is not enabled, just leave. */ *return_code = MPS_FW_DIAG_ERROR_RELEASE_FAILED; if (!pBuffer->enabled) { mps_dprint(sc, MPS_USER, "%s: This buffer type is not " "supported by the IOC", __func__); return (MPS_DIAG_FAILURE); } /* * Clear some flags initially. */ pBuffer->force_release = FALSE; pBuffer->valid_data = FALSE; pBuffer->owned_by_firmware = FALSE; /* * Get a command. 
*/ cm = mps_alloc_command(sc); if (cm == NULL) { mps_printf(sc, "%s: no mps requests\n", __func__); return (MPS_DIAG_FAILURE); } /* * Build the request for releasing the FW Diag Buffer and send it. */ req = (MPI2_DIAG_RELEASE_REQUEST *)cm->cm_req; req->Function = MPI2_FUNCTION_DIAG_RELEASE; req->BufferType = pBuffer->buffer_type; cm->cm_data = NULL; cm->cm_length = 0; cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE; cm->cm_complete_data = NULL; /* * Send command synchronously. */ status = mps_wait_command(sc, cm, 30, CAN_SLEEP); if (status) { mps_printf(sc, "%s: invalid request: error %d\n", __func__, status); status = MPS_DIAG_FAILURE; goto done; } /* * Process RELEASE reply. */ reply = (MPI2_DIAG_RELEASE_REPLY *)cm->cm_reply; if ((reply->IOCStatus != MPI2_IOCSTATUS_SUCCESS) || pBuffer->owned_by_firmware) { status = MPS_DIAG_FAILURE; mps_dprint(sc, MPS_FAULT, "%s: release of FW Diag Buffer " "failed with IOCStatus = 0x%x and IOCLogInfo = 0x%x\n", __func__, reply->IOCStatus, reply->IOCLogInfo); goto done; } /* * Release was successful. */ *return_code = MPS_FW_DIAG_ERROR_SUCCESS; status = MPS_DIAG_SUCCESS; /* * If this was for an UNREGISTER diag type command, clear the unique ID. */ if (diag_type == MPS_FW_DIAG_TYPE_UNREGISTER) { pBuffer->unique_id = MPS_FW_DIAG_INVALID_UID; } done: return (status); } static int mps_diag_register(struct mps_softc *sc, mps_fw_diag_register_t *diag_register, uint32_t *return_code) { mps_fw_diagnostic_buffer_t *pBuffer; uint8_t extended_type, buffer_type, i; uint32_t buffer_size; uint32_t unique_id; int status; extended_type = diag_register->ExtendedType; buffer_type = diag_register->BufferType; buffer_size = diag_register->RequestedBufferSize; unique_id = diag_register->UniqueId; /* * Check for valid buffer type */ if (buffer_type >= MPI2_DIAG_BUF_TYPE_COUNT) { *return_code = MPS_FW_DIAG_ERROR_INVALID_PARAMETER; return (MPS_DIAG_FAILURE); } /* * Get the current buffer and look up the unique ID. The unique ID * should not be found. If it is, the ID is already in use. */ i = mps_get_fw_diag_buffer_number(sc, unique_id); pBuffer = &sc->fw_diag_buffer_list[buffer_type]; if (i != MPS_FW_DIAGNOSTIC_UID_NOT_FOUND) { *return_code = MPS_FW_DIAG_ERROR_INVALID_UID; return (MPS_DIAG_FAILURE); } /* * The buffer's unique ID should not be registered yet, and the given * unique ID cannot be 0. */ if ((pBuffer->unique_id != MPS_FW_DIAG_INVALID_UID) || (unique_id == MPS_FW_DIAG_INVALID_UID)) { *return_code = MPS_FW_DIAG_ERROR_INVALID_UID; return (MPS_DIAG_FAILURE); } /* * If this buffer is already posted as immediate, just change owner. */ if (pBuffer->immediate && pBuffer->owned_by_firmware && (pBuffer->unique_id == MPS_FW_DIAG_INVALID_UID)) { pBuffer->immediate = FALSE; pBuffer->unique_id = unique_id; return (MPS_DIAG_SUCCESS); } /* * Post a new buffer after checking if it's enabled. The DMA buffer * that is allocated will be contiguous (nsegments = 1). 
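 * The sequence below is: create a DMA tag limited to a single segment,
 * allocate and zero the buffer, then load the map; mps_memaddr_cb
 * records the bus address in sc->fw_diag_busaddr for use by the
 * DIAG_BUFFER_POST request.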
*/ if (!pBuffer->enabled) { *return_code = MPS_FW_DIAG_ERROR_NO_BUFFER; return (MPS_DIAG_FAILURE); } if (bus_dma_tag_create( sc->mps_parent_dmat, /* parent */ 1, 0, /* algnmnt, boundary */ BUS_SPACE_MAXADDR_32BIT,/* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ buffer_size, /* maxsize */ 1, /* nsegments */ buffer_size, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &sc->fw_diag_dmat)) { device_printf(sc->mps_dev, "Cannot allocate FW diag buffer DMA " "tag\n"); return (ENOMEM); } if (bus_dmamem_alloc(sc->fw_diag_dmat, (void **)&sc->fw_diag_buffer, BUS_DMA_NOWAIT, &sc->fw_diag_map)) { device_printf(sc->mps_dev, "Cannot allocate FW diag buffer " "memory\n"); return (ENOMEM); } bzero(sc->fw_diag_buffer, buffer_size); bus_dmamap_load(sc->fw_diag_dmat, sc->fw_diag_map, sc->fw_diag_buffer, buffer_size, mps_memaddr_cb, &sc->fw_diag_busaddr, 0); pBuffer->size = buffer_size; /* * Copy the given info to the diag buffer and post the buffer. */ pBuffer->buffer_type = buffer_type; pBuffer->immediate = FALSE; if (buffer_type == MPI2_DIAG_BUF_TYPE_TRACE) { for (i = 0; i < (sizeof (pBuffer->product_specific) / 4); i++) { pBuffer->product_specific[i] = diag_register->ProductSpecific[i]; } } pBuffer->extended_type = extended_type; pBuffer->unique_id = unique_id; status = mps_post_fw_diag_buffer(sc, pBuffer, return_code); /* * In case there was a failure, free the DMA buffer. */ if (status == MPS_DIAG_FAILURE) { if (sc->fw_diag_busaddr != 0) bus_dmamap_unload(sc->fw_diag_dmat, sc->fw_diag_map); if (sc->fw_diag_buffer != NULL) bus_dmamem_free(sc->fw_diag_dmat, sc->fw_diag_buffer, sc->fw_diag_map); if (sc->fw_diag_dmat != NULL) bus_dma_tag_destroy(sc->fw_diag_dmat); } return (status); } static int mps_diag_unregister(struct mps_softc *sc, mps_fw_diag_unregister_t *diag_unregister, uint32_t *return_code) { mps_fw_diagnostic_buffer_t *pBuffer; uint8_t i; uint32_t unique_id; int status; unique_id = diag_unregister->UniqueId; /* * Get the current buffer and look up the unique ID. The unique ID * should be there. */ i = mps_get_fw_diag_buffer_number(sc, unique_id); if (i == MPS_FW_DIAGNOSTIC_UID_NOT_FOUND) { *return_code = MPS_FW_DIAG_ERROR_INVALID_UID; return (MPS_DIAG_FAILURE); } pBuffer = &sc->fw_diag_buffer_list[i]; /* * Try to release the buffer from FW before freeing it. If release * fails, don't free the DMA buffer in case FW tries to access it * later. If buffer is not owned by firmware, can't release it. */ if (!pBuffer->owned_by_firmware) { status = MPS_DIAG_SUCCESS; } else { status = mps_release_fw_diag_buffer(sc, pBuffer, return_code, MPS_FW_DIAG_TYPE_UNREGISTER); } /* * At this point, return the current status no matter what happens with * the DMA buffer. */ pBuffer->unique_id = MPS_FW_DIAG_INVALID_UID; if (status == MPS_DIAG_SUCCESS) { if (sc->fw_diag_busaddr != 0) bus_dmamap_unload(sc->fw_diag_dmat, sc->fw_diag_map); if (sc->fw_diag_buffer != NULL) bus_dmamem_free(sc->fw_diag_dmat, sc->fw_diag_buffer, sc->fw_diag_map); if (sc->fw_diag_dmat != NULL) bus_dma_tag_destroy(sc->fw_diag_dmat); } return (status); } static int mps_diag_query(struct mps_softc *sc, mps_fw_diag_query_t *diag_query, uint32_t *return_code) { mps_fw_diagnostic_buffer_t *pBuffer; uint8_t i; uint32_t unique_id; unique_id = diag_query->UniqueId; /* * If ID is valid, query on ID. * If ID is invalid, query on buffer type. 
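 * Example (illustrative): UniqueId == MPS_FW_DIAG_INVALID_UID together
 * with BufferType == MPI2_DIAG_BUF_TYPE_TRACE selects the trace buffer
 * directly; any other UniqueId is resolved through
 * mps_get_fw_diag_buffer_number().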
*/ if (unique_id == MPS_FW_DIAG_INVALID_UID) { i = diag_query->BufferType; if (i >= MPI2_DIAG_BUF_TYPE_COUNT) { *return_code = MPS_FW_DIAG_ERROR_INVALID_UID; return (MPS_DIAG_FAILURE); } } else { i = mps_get_fw_diag_buffer_number(sc, unique_id); if (i == MPS_FW_DIAGNOSTIC_UID_NOT_FOUND) { *return_code = MPS_FW_DIAG_ERROR_INVALID_UID; return (MPS_DIAG_FAILURE); } } /* * Fill query structure with the diag buffer info. */ pBuffer = &sc->fw_diag_buffer_list[i]; diag_query->BufferType = pBuffer->buffer_type; diag_query->ExtendedType = pBuffer->extended_type; if (diag_query->BufferType == MPI2_DIAG_BUF_TYPE_TRACE) { for (i = 0; i < (sizeof(diag_query->ProductSpecific) / 4); i++) { diag_query->ProductSpecific[i] = pBuffer->product_specific[i]; } } diag_query->TotalBufferSize = pBuffer->size; diag_query->DriverAddedBufferSize = 0; diag_query->UniqueId = pBuffer->unique_id; diag_query->ApplicationFlags = 0; diag_query->DiagnosticFlags = 0; /* * Set/Clear application flags */ if (pBuffer->immediate) { diag_query->ApplicationFlags &= ~MPS_FW_DIAG_FLAG_APP_OWNED; } else { diag_query->ApplicationFlags |= MPS_FW_DIAG_FLAG_APP_OWNED; } if (pBuffer->valid_data || pBuffer->owned_by_firmware) { diag_query->ApplicationFlags |= MPS_FW_DIAG_FLAG_BUFFER_VALID; } else { diag_query->ApplicationFlags &= ~MPS_FW_DIAG_FLAG_BUFFER_VALID; } if (pBuffer->owned_by_firmware) { diag_query->ApplicationFlags |= MPS_FW_DIAG_FLAG_FW_BUFFER_ACCESS; } else { diag_query->ApplicationFlags &= ~MPS_FW_DIAG_FLAG_FW_BUFFER_ACCESS; } return (MPS_DIAG_SUCCESS); } static int mps_diag_read_buffer(struct mps_softc *sc, mps_diag_read_buffer_t *diag_read_buffer, uint8_t *ioctl_buf, uint32_t *return_code) { mps_fw_diagnostic_buffer_t *pBuffer; uint8_t i, *pData; uint32_t unique_id; int status; unique_id = diag_read_buffer->UniqueId; /* * Get the current buffer and look up the unique ID. The unique ID * should be there. */ i = mps_get_fw_diag_buffer_number(sc, unique_id); if (i == MPS_FW_DIAGNOSTIC_UID_NOT_FOUND) { *return_code = MPS_FW_DIAG_ERROR_INVALID_UID; return (MPS_DIAG_FAILURE); } pBuffer = &sc->fw_diag_buffer_list[i]; /* * Make sure requested read is within limits */ if (diag_read_buffer->StartingOffset + diag_read_buffer->BytesToRead > pBuffer->size) { *return_code = MPS_FW_DIAG_ERROR_INVALID_PARAMETER; return (MPS_DIAG_FAILURE); } /* * Copy the requested data from DMA to the diag_read_buffer. The DMA * buffer that was allocated is one contiguous buffer. */ pData = (uint8_t *)(sc->fw_diag_buffer + diag_read_buffer->StartingOffset); if (copyout(pData, ioctl_buf, diag_read_buffer->BytesToRead) != 0) return (MPS_DIAG_FAILURE); diag_read_buffer->Status = 0; /* * Set or clear the Force Release flag. */ if (pBuffer->force_release) { diag_read_buffer->Flags |= MPS_FW_DIAG_FLAG_FORCE_RELEASE; } else { diag_read_buffer->Flags &= ~MPS_FW_DIAG_FLAG_FORCE_RELEASE; } /* * If buffer is to be reregistered, make sure it's not already owned by * firmware first. */ status = MPS_DIAG_SUCCESS; if (!pBuffer->owned_by_firmware) { if (diag_read_buffer->Flags & MPS_FW_DIAG_FLAG_REREGISTER) { status = mps_post_fw_diag_buffer(sc, pBuffer, return_code); } } return (status); } static int mps_diag_release(struct mps_softc *sc, mps_fw_diag_release_t *diag_release, uint32_t *return_code) { mps_fw_diagnostic_buffer_t *pBuffer; uint8_t i; uint32_t unique_id; int status; unique_id = diag_release->UniqueId; /* * Get the current buffer and look up the unique ID. The unique ID * should be there. 
*/ i = mps_get_fw_diag_buffer_number(sc, unique_id); if (i == MPS_FW_DIAGNOSTIC_UID_NOT_FOUND) { *return_code = MPS_FW_DIAG_ERROR_INVALID_UID; return (MPS_DIAG_FAILURE); } pBuffer = &sc->fw_diag_buffer_list[i]; /* * If buffer is not owned by firmware, it's already been released. */ if (!pBuffer->owned_by_firmware) { *return_code = MPS_FW_DIAG_ERROR_ALREADY_RELEASED; return (MPS_DIAG_FAILURE); } /* * Release the buffer. */ status = mps_release_fw_diag_buffer(sc, pBuffer, return_code, MPS_FW_DIAG_TYPE_RELEASE); return (status); } static int mps_do_diag_action(struct mps_softc *sc, uint32_t action, uint8_t *diag_action, uint32_t length, uint32_t *return_code) { mps_fw_diag_register_t diag_register; mps_fw_diag_unregister_t diag_unregister; mps_fw_diag_query_t diag_query; mps_diag_read_buffer_t diag_read_buffer; mps_fw_diag_release_t diag_release; int status = MPS_DIAG_SUCCESS; uint32_t original_return_code; original_return_code = *return_code; *return_code = MPS_FW_DIAG_ERROR_SUCCESS; switch (action) { case MPS_FW_DIAG_TYPE_REGISTER: if (!length) { *return_code = MPS_FW_DIAG_ERROR_INVALID_PARAMETER; status = MPS_DIAG_FAILURE; break; } if (copyin(diag_action, &diag_register, sizeof(diag_register)) != 0) return (MPS_DIAG_FAILURE); status = mps_diag_register(sc, &diag_register, return_code); break; case MPS_FW_DIAG_TYPE_UNREGISTER: if (length < sizeof(diag_unregister)) { *return_code = MPS_FW_DIAG_ERROR_INVALID_PARAMETER; status = MPS_DIAG_FAILURE; break; } if (copyin(diag_action, &diag_unregister, sizeof(diag_unregister)) != 0) return (MPS_DIAG_FAILURE); status = mps_diag_unregister(sc, &diag_unregister, return_code); break; case MPS_FW_DIAG_TYPE_QUERY: if (length < sizeof (diag_query)) { *return_code = MPS_FW_DIAG_ERROR_INVALID_PARAMETER; status = MPS_DIAG_FAILURE; break; } if (copyin(diag_action, &diag_query, sizeof(diag_query)) != 0) return (MPS_DIAG_FAILURE); status = mps_diag_query(sc, &diag_query, return_code); if (status == MPS_DIAG_SUCCESS) if (copyout(&diag_query, diag_action, sizeof (diag_query)) != 0) return (MPS_DIAG_FAILURE); break; case MPS_FW_DIAG_TYPE_READ_BUFFER: if (copyin(diag_action, &diag_read_buffer, sizeof(diag_read_buffer)) != 0) return (MPS_DIAG_FAILURE); if (length < diag_read_buffer.BytesToRead) { *return_code = MPS_FW_DIAG_ERROR_INVALID_PARAMETER; status = MPS_DIAG_FAILURE; break; } status = mps_diag_read_buffer(sc, &diag_read_buffer, PTRIN(diag_read_buffer.PtrDataBuffer), return_code); if (status == MPS_DIAG_SUCCESS) { if (copyout(&diag_read_buffer, diag_action, sizeof(diag_read_buffer) - sizeof(diag_read_buffer.PtrDataBuffer)) != 0) return (MPS_DIAG_FAILURE); } break; case MPS_FW_DIAG_TYPE_RELEASE: if (length < sizeof(diag_release)) { *return_code = MPS_FW_DIAG_ERROR_INVALID_PARAMETER; status = MPS_DIAG_FAILURE; break; } if (copyin(diag_action, &diag_release, sizeof(diag_release)) != 0) return (MPS_DIAG_FAILURE); status = mps_diag_release(sc, &diag_release, return_code); break; default: *return_code = MPS_FW_DIAG_ERROR_INVALID_PARAMETER; status = MPS_DIAG_FAILURE; break; } if ((status == MPS_DIAG_FAILURE) && (original_return_code == MPS_FW_DIAG_NEW) && (*return_code != MPS_FW_DIAG_ERROR_SUCCESS)) status = MPS_DIAG_SUCCESS; return (status); } static int mps_user_diag_action(struct mps_softc *sc, mps_diag_action_t *data) { int status; /* * Only allow one diag action at one time. 
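 * As in the pass-thru path, MPS_FLAGS_BUSY serializes these requests;
 * the caller holds the mps lock around this function.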
*/ if (sc->mps_flags & MPS_FLAGS_BUSY) { mps_dprint(sc, MPS_USER, "%s: Only one FW diag command " "allowed at a single time.", __func__); return (EBUSY); } sc->mps_flags |= MPS_FLAGS_BUSY; /* * Send diag action request */ if (data->Action == MPS_FW_DIAG_TYPE_REGISTER || data->Action == MPS_FW_DIAG_TYPE_UNREGISTER || data->Action == MPS_FW_DIAG_TYPE_QUERY || data->Action == MPS_FW_DIAG_TYPE_READ_BUFFER || data->Action == MPS_FW_DIAG_TYPE_RELEASE) { status = mps_do_diag_action(sc, data->Action, PTRIN(data->PtrDiagAction), data->Length, &data->ReturnCode); } else status = EINVAL; sc->mps_flags &= ~MPS_FLAGS_BUSY; return (status); } /* * Copy the event recording mask and the event queue size out. For * clarification, the event recording mask (events_to_record) is not the same * thing as the event mask (event_mask). events_to_record has a bit set for * every event type that is to be recorded by the driver, and event_mask has a * bit cleared for every event that is allowed into the driver from the IOC. * They really have nothing to do with each other. */ static void mps_user_event_query(struct mps_softc *sc, mps_event_query_t *data) { uint8_t i; mps_lock(sc); data->Entries = MPS_EVENT_QUEUE_SIZE; for (i = 0; i < 4; i++) { data->Types[i] = sc->events_to_record[i]; } mps_unlock(sc); } /* * Set the driver's event mask according to what's been given. See * mps_user_event_query for explanation of the event recording mask and the IOC * event mask. It's the app's responsibility to enable event logging by setting * the bits in events_to_record. Initially, no events will be logged. */ static void mps_user_event_enable(struct mps_softc *sc, mps_event_enable_t *data) { uint8_t i; mps_lock(sc); for (i = 0; i < 4; i++) { sc->events_to_record[i] = data->Types[i]; } mps_unlock(sc); } /* * Copy out the events that have been recorded, up to the max events allowed. */ static int mps_user_event_report(struct mps_softc *sc, mps_event_report_t *data) { int status = 0; uint32_t size; mps_lock(sc); size = data->Size; if ((size >= sizeof(sc->recorded_events)) && (status == 0)) { mps_unlock(sc); if (copyout((void *)sc->recorded_events, PTRIN(data->PtrEvents), size) != 0) status = EFAULT; mps_lock(sc); } else { /* * data->Size value is not large enough to copy event data. */ status = EFAULT; } /* * Change size value to match the number of bytes that were copied. */ if (status == 0) data->Size = sizeof(sc->recorded_events); mps_unlock(sc); return (status); } /* * Record events into the driver from the IOC if they are not masked. */ void mpssas_record_event(struct mps_softc *sc, MPI2_EVENT_NOTIFICATION_REPLY *event_reply) { uint32_t event; int i, j; uint16_t event_data_len; boolean_t sendAEN = FALSE; event = event_reply->Event; /* * Generate a system event to let anyone who cares know that a * LOG_ENTRY_ADDED event has occurred. This is sent no matter what the * event mask is set to. */ if (event == MPI2_EVENT_LOG_ENTRY_ADDED) { sendAEN = TRUE; } /* * Record the event only if its corresponding bit is set in * events_to_record. event_index is the index into recorded_events and * event_number is the overall number of an event being recorded since * start-of-day. event_index will roll over; event_number will never * roll over. 
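 * Example (illustrative): event 0x21 maps to word i = 1, bit j = 1, so
 * it is recorded only if bit 1 of sc->events_to_record[1] is set.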
*/ i = (uint8_t)(event / 32); j = (uint8_t)(event % 32); if ((i < 4) && ((1 << j) & sc->events_to_record[i])) { i = sc->event_index; sc->recorded_events[i].Type = event; sc->recorded_events[i].Number = ++sc->event_number; bzero(sc->recorded_events[i].Data, MPS_MAX_EVENT_DATA_LENGTH * 4); event_data_len = event_reply->EventDataLength; if (event_data_len > 0) { /* * Limit data to size in m_event entry */ if (event_data_len > MPS_MAX_EVENT_DATA_LENGTH) { event_data_len = MPS_MAX_EVENT_DATA_LENGTH; } for (j = 0; j < event_data_len; j++) { sc->recorded_events[i].Data[j] = event_reply->EventData[j]; } /* * check for index wrap-around */ if (++i == MPS_EVENT_QUEUE_SIZE) { i = 0; } sc->event_index = (uint8_t)i; /* * Set flag to send the event. */ sendAEN = TRUE; } } /* * Generate a system event if flag is set to let anyone who cares know * that an event has occurred. */ if (sendAEN) { //SLM-how to send a system event (see kqueue, kevent) // (void) ddi_log_sysevent(mpt->m_dip, DDI_VENDOR_LSI, "MPT_SAS", // "SAS", NULL, NULL, DDI_NOSLEEP); } } static int mps_user_reg_access(struct mps_softc *sc, mps_reg_access_t *data) { int status = 0; switch (data->Command) { /* * IO access is not supported. */ case REG_IO_READ: case REG_IO_WRITE: mps_dprint(sc, MPS_USER, "IO access is not supported. " "Use memory access."); status = EINVAL; break; case REG_MEM_READ: data->RegData = mps_regread(sc, data->RegOffset); break; case REG_MEM_WRITE: mps_regwrite(sc, data->RegOffset, data->RegData); break; default: status = EINVAL; break; } return (status); } static int mps_user_btdh(struct mps_softc *sc, mps_btdh_mapping_t *data) { uint8_t bt2dh = FALSE; uint8_t dh2bt = FALSE; uint16_t dev_handle, bus, target; bus = data->Bus; target = data->TargetID; dev_handle = data->DevHandle; /* * When DevHandle is 0xFFFF and Bus/Target are not 0xFFFF, use Bus/ * Target to get DevHandle. When Bus/Target are 0xFFFF and DevHandle is * not 0xFFFF, use DevHandle to get Bus/Target. Anything else is * invalid. */ if ((bus == 0xFFFF) && (target == 0xFFFF) && (dev_handle != 0xFFFF)) dh2bt = TRUE; if ((dev_handle == 0xFFFF) && (bus != 0xFFFF) && (target != 0xFFFF)) bt2dh = TRUE; if (!dh2bt && !bt2dh) return (EINVAL); /* * Only handle bus of 0. Make sure target is within range. 
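 * Example (illustrative): Bus = 0, TargetID = 5, DevHandle = 0xFFFF
 * requests a Bus/Target to DevHandle lookup through
 * sc->mapping_table[5]; the reverse direction uses
 * mps_mapping_get_sas_id_from_handle().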
*/ if (bt2dh) { if (bus != 0) return (EINVAL); if (target > sc->max_devices) { mps_dprint(sc, MPS_FAULT, "Target ID is out of range " "for Bus/Target to DevHandle mapping."); return (EINVAL); } dev_handle = sc->mapping_table[target].dev_handle; if (dev_handle) data->DevHandle = dev_handle; } else { bus = 0; target = mps_mapping_get_sas_id_from_handle(sc, dev_handle); data->Bus = bus; data->TargetID = target; } return (0); } static int mps_ioctl(struct cdev *dev, u_long cmd, void *arg, int flag, struct thread *td) { struct mps_softc *sc; struct mps_cfg_page_req *page_req; struct mps_ext_cfg_page_req *ext_page_req; void *mps_page; int error, msleep_ret; mps_page = NULL; sc = dev->si_drv1; page_req = (void *)arg; ext_page_req = (void *)arg; switch (cmd) { case MPSIO_READ_CFG_HEADER: mps_lock(sc); error = mps_user_read_cfg_header(sc, page_req); mps_unlock(sc); break; case MPSIO_READ_CFG_PAGE: mps_page = malloc(page_req->len, M_MPSUSER, M_WAITOK | M_ZERO); if(!mps_page) { mps_printf(sc, "Cannot allocate memory %s %d\n", __func__, __LINE__); return (ENOMEM); } error = copyin(page_req->buf, mps_page, sizeof(MPI2_CONFIG_PAGE_HEADER)); if (error) break; mps_lock(sc); error = mps_user_read_cfg_page(sc, page_req, mps_page); mps_unlock(sc); if (error) break; error = copyout(mps_page, page_req->buf, page_req->len); break; case MPSIO_READ_EXT_CFG_HEADER: mps_lock(sc); error = mps_user_read_extcfg_header(sc, ext_page_req); mps_unlock(sc); break; case MPSIO_READ_EXT_CFG_PAGE: mps_page = malloc(ext_page_req->len, M_MPSUSER, M_WAITOK|M_ZERO); if(!mps_page) { mps_printf(sc, "Cannot allocate memory %s %d\n", __func__, __LINE__); return (ENOMEM); } error = copyin(ext_page_req->buf, mps_page, sizeof(MPI2_CONFIG_EXTENDED_PAGE_HEADER)); if (error) break; mps_lock(sc); error = mps_user_read_extcfg_page(sc, ext_page_req, mps_page); mps_unlock(sc); if (error) break; error = copyout(mps_page, ext_page_req->buf, ext_page_req->len); break; case MPSIO_WRITE_CFG_PAGE: mps_page = malloc(page_req->len, M_MPSUSER, M_WAITOK|M_ZERO); if(!mps_page) { mps_printf(sc, "Cannot allocate memory %s %d\n", __func__, __LINE__); return (ENOMEM); } error = copyin(page_req->buf, mps_page, page_req->len); if (error) break; mps_lock(sc); error = mps_user_write_cfg_page(sc, page_req, mps_page); mps_unlock(sc); break; case MPSIO_MPS_COMMAND: error = mps_user_command(sc, (struct mps_usr_command *)arg); break; case MPTIOCTL_PASS_THRU: /* * The user has requested to pass through a command to be * executed by the MPT firmware. Call our routine which does * this. Only allow one passthru IOCTL at one time. */ error = mps_user_pass_thru(sc, (mps_pass_thru_t *)arg); break; case MPTIOCTL_GET_ADAPTER_DATA: /* * The user has requested to read adapter data. Call our * routine which does this. */ error = 0; mps_user_get_adapter_data(sc, (mps_adapter_data_t *)arg); break; case MPTIOCTL_GET_PCI_INFO: /* * The user has requested to read pci info. Call * our routine which does this. */ mps_lock(sc); error = 0; mps_user_read_pci_info(sc, (mps_pci_info_t *)arg); mps_unlock(sc); break; case MPTIOCTL_RESET_ADAPTER: mps_lock(sc); sc->port_enable_complete = 0; uint32_t reinit_start = time_uptime; error = mps_reinit(sc); /* Sleep for 300 second. 
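 * (i.e. wait up to 300 seconds for port enable to complete after the
 * diag reset)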
*/ msleep_ret = msleep(&sc->port_enable_complete, &sc->mps_mtx, PRIBIO, "mps_porten", 300 * hz); mps_unlock(sc); if (msleep_ret) printf("Port Enable did not complete after Diag " "Reset msleep error %d.\n", msleep_ret); else mps_dprint(sc, MPS_USER, "Hard Reset with Port Enable completed in %d seconds.\n", (uint32_t) (time_uptime - reinit_start)); break; case MPTIOCTL_DIAG_ACTION: /* * The user has done a diag buffer action. Call our routine * which does this. Only allow one diag action at one time. */ mps_lock(sc); error = mps_user_diag_action(sc, (mps_diag_action_t *)arg); mps_unlock(sc); break; case MPTIOCTL_EVENT_QUERY: /* * The user has done an event query. Call our routine which does * this. */ error = 0; mps_user_event_query(sc, (mps_event_query_t *)arg); break; case MPTIOCTL_EVENT_ENABLE: /* * The user has done an event enable. Call our routine which * does this. */ error = 0; mps_user_event_enable(sc, (mps_event_enable_t *)arg); break; case MPTIOCTL_EVENT_REPORT: /* * The user has done an event report. Call our routine which * does this. */ error = mps_user_event_report(sc, (mps_event_report_t *)arg); break; case MPTIOCTL_REG_ACCESS: /* * The user has requested register access. Call our routine * which does this. */ mps_lock(sc); error = mps_user_reg_access(sc, (mps_reg_access_t *)arg); mps_unlock(sc); break; case MPTIOCTL_BTDH_MAPPING: /* * The user has requested to translate a bus/target to a * DevHandle or a DevHandle to a bus/target. Call our routine * which does this. */ error = mps_user_btdh(sc, (mps_btdh_mapping_t *)arg); break; default: error = ENOIOCTL; break; } if (mps_page != NULL) free(mps_page, M_MPSUSER); return (error); } #ifdef COMPAT_FREEBSD32 struct mps_cfg_page_req32 { MPI2_CONFIG_PAGE_HEADER header; uint32_t page_address; uint32_t buf; int len; uint16_t ioc_status; }; struct mps_ext_cfg_page_req32 { MPI2_CONFIG_EXTENDED_PAGE_HEADER header; uint32_t page_address; uint32_t buf; int len; uint16_t ioc_status; }; struct mps_raid_action32 { uint8_t action; uint8_t volume_bus; uint8_t volume_id; uint8_t phys_disk_num; uint32_t action_data_word; uint32_t buf; int len; uint32_t volume_status; uint32_t action_data[4]; uint16_t action_status; uint16_t ioc_status; uint8_t write; }; struct mps_usr_command32 { uint32_t req; uint32_t req_len; uint32_t rpl; uint32_t rpl_len; uint32_t buf; int len; uint32_t flags; }; #define MPSIO_READ_CFG_HEADER32 _IOWR('M', 200, struct mps_cfg_page_req32) #define MPSIO_READ_CFG_PAGE32 _IOWR('M', 201, struct mps_cfg_page_req32) #define MPSIO_READ_EXT_CFG_HEADER32 _IOWR('M', 202, struct mps_ext_cfg_page_req32) #define MPSIO_READ_EXT_CFG_PAGE32 _IOWR('M', 203, struct mps_ext_cfg_page_req32) #define MPSIO_WRITE_CFG_PAGE32 _IOWR('M', 204, struct mps_cfg_page_req32) #define MPSIO_RAID_ACTION32 _IOWR('M', 205, struct mps_raid_action32) #define MPSIO_MPS_COMMAND32 _IOWR('M', 210, struct mps_usr_command32) static int mps_ioctl32(struct cdev *dev, u_long cmd32, void *_arg, int flag, struct thread *td) { struct mps_cfg_page_req32 *page32 = _arg; struct mps_ext_cfg_page_req32 *ext32 = _arg; struct mps_raid_action32 *raid32 = _arg; struct mps_usr_command32 *user32 = _arg; union { struct mps_cfg_page_req page; struct mps_ext_cfg_page_req ext; struct mps_raid_action raid; struct mps_usr_command user; } arg; u_long cmd; int error; switch (cmd32) { case MPSIO_READ_CFG_HEADER32: case MPSIO_READ_CFG_PAGE32: case MPSIO_WRITE_CFG_PAGE32: if (cmd32 == MPSIO_READ_CFG_HEADER32) cmd = MPSIO_READ_CFG_HEADER; else if (cmd32 == MPSIO_READ_CFG_PAGE32) cmd = 
MPSIO_READ_CFG_PAGE; else cmd = MPSIO_WRITE_CFG_PAGE; CP(*page32, arg.page, header); CP(*page32, arg.page, page_address); PTRIN_CP(*page32, arg.page, buf); CP(*page32, arg.page, len); CP(*page32, arg.page, ioc_status); break; case MPSIO_READ_EXT_CFG_HEADER32: case MPSIO_READ_EXT_CFG_PAGE32: if (cmd32 == MPSIO_READ_EXT_CFG_HEADER32) cmd = MPSIO_READ_EXT_CFG_HEADER; else cmd = MPSIO_READ_EXT_CFG_PAGE; CP(*ext32, arg.ext, header); CP(*ext32, arg.ext, page_address); PTRIN_CP(*ext32, arg.ext, buf); CP(*ext32, arg.ext, len); CP(*ext32, arg.ext, ioc_status); break; case MPSIO_RAID_ACTION32: cmd = MPSIO_RAID_ACTION; CP(*raid32, arg.raid, action); CP(*raid32, arg.raid, volume_bus); CP(*raid32, arg.raid, volume_id); CP(*raid32, arg.raid, phys_disk_num); CP(*raid32, arg.raid, action_data_word); PTRIN_CP(*raid32, arg.raid, buf); CP(*raid32, arg.raid, len); CP(*raid32, arg.raid, volume_status); bcopy(raid32->action_data, arg.raid.action_data, sizeof arg.raid.action_data); CP(*raid32, arg.raid, ioc_status); CP(*raid32, arg.raid, write); break; case MPSIO_MPS_COMMAND32: cmd = MPSIO_MPS_COMMAND; PTRIN_CP(*user32, arg.user, req); CP(*user32, arg.user, req_len); PTRIN_CP(*user32, arg.user, rpl); CP(*user32, arg.user, rpl_len); PTRIN_CP(*user32, arg.user, buf); CP(*user32, arg.user, len); CP(*user32, arg.user, flags); break; default: return (ENOIOCTL); } error = mps_ioctl(dev, cmd, &arg, flag, td); if (error == 0 && (cmd32 & IOC_OUT) != 0) { switch (cmd32) { case MPSIO_READ_CFG_HEADER32: case MPSIO_READ_CFG_PAGE32: case MPSIO_WRITE_CFG_PAGE32: CP(arg.page, *page32, header); CP(arg.page, *page32, page_address); PTROUT_CP(arg.page, *page32, buf); CP(arg.page, *page32, len); CP(arg.page, *page32, ioc_status); break; case MPSIO_READ_EXT_CFG_HEADER32: case MPSIO_READ_EXT_CFG_PAGE32: CP(arg.ext, *ext32, header); CP(arg.ext, *ext32, page_address); PTROUT_CP(arg.ext, *ext32, buf); CP(arg.ext, *ext32, len); CP(arg.ext, *ext32, ioc_status); break; case MPSIO_RAID_ACTION32: CP(arg.raid, *raid32, action); CP(arg.raid, *raid32, volume_bus); CP(arg.raid, *raid32, volume_id); CP(arg.raid, *raid32, phys_disk_num); CP(arg.raid, *raid32, action_data_word); PTROUT_CP(arg.raid, *raid32, buf); CP(arg.raid, *raid32, len); CP(arg.raid, *raid32, volume_status); bcopy(arg.raid.action_data, raid32->action_data, sizeof arg.raid.action_data); CP(arg.raid, *raid32, ioc_status); CP(arg.raid, *raid32, write); break; case MPSIO_MPS_COMMAND32: PTROUT_CP(arg.user, *user32, req); CP(arg.user, *user32, req_len); PTROUT_CP(arg.user, *user32, rpl); CP(arg.user, *user32, rpl_len); PTROUT_CP(arg.user, *user32, buf); CP(arg.user, *user32, len); CP(arg.user, *user32, flags); break; } } return (error); } #endif /* COMPAT_FREEBSD32 */ static int mps_ioctl_devsw(struct cdev *dev, u_long com, caddr_t arg, int flag, struct thread *td) { #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return (mps_ioctl32(dev, com, arg, flag, td)); #endif return (mps_ioctl(dev, com, arg, flag, td)); } Index: projects/building-blocks/sys/kern/kern_timeout.c =================================================================== --- projects/building-blocks/sys/kern/kern_timeout.c (revision 277723) +++ projects/building-blocks/sys/kern/kern_timeout.c (revision 277724) @@ -1,1475 +1,1479 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. 
* All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_callout_profiling.h" #if defined(__arm__) #include "opt_timer.h" #endif #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #ifndef NO_EVENTTIMERS DPCPU_DECLARE(sbintime_t, hardclocktime); #endif SDT_PROVIDER_DEFINE(callout_execute); SDT_PROBE_DEFINE1(callout_execute, kernel, , callout__start, "struct callout *"); SDT_PROBE_DEFINE1(callout_execute, kernel, , callout__end, "struct callout *"); #ifdef CALLOUT_PROFILING static int avg_depth; SYSCTL_INT(_debug, OID_AUTO, to_avg_depth, CTLFLAG_RD, &avg_depth, 0, "Average number of items examined per softclock call. Units = 1/1000"); static int avg_gcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_gcalls, CTLFLAG_RD, &avg_gcalls, 0, "Average number of Giant callouts made per softclock call. Units = 1/1000"); static int avg_lockcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls, CTLFLAG_RD, &avg_lockcalls, 0, "Average number of lock callouts made per softclock call. Units = 1/1000"); static int avg_mpcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0, "Average number of MP callouts made per softclock call. Units = 1/1000"); static int avg_depth_dir; SYSCTL_INT(_debug, OID_AUTO, to_avg_depth_dir, CTLFLAG_RD, &avg_depth_dir, 0, "Average number of direct callouts examined per callout_process call. " "Units = 1/1000"); static int avg_lockcalls_dir; SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls_dir, CTLFLAG_RD, &avg_lockcalls_dir, 0, "Average number of lock direct callouts made per " "callout_process call. 
Units = 1/1000"); static int avg_mpcalls_dir; SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls_dir, CTLFLAG_RD, &avg_mpcalls_dir, 0, "Average number of MP direct callouts made per callout_process call. " "Units = 1/1000"); #endif static int ncallout; SYSCTL_INT(_kern, OID_AUTO, ncallout, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &ncallout, 0, "Number of entries in callwheel and size of timeout() preallocation"); #ifdef RSS static int pin_default_swi = 1; static int pin_pcpu_swi = 1; #else static int pin_default_swi = 0; static int pin_pcpu_swi = 0; #endif SYSCTL_INT(_kern, OID_AUTO, pin_default_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_default_swi, 0, "Pin the default (non-per-cpu) swi (shared with PCPU 0 swi)"); SYSCTL_INT(_kern, OID_AUTO, pin_pcpu_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_pcpu_swi, 0, "Pin the per-CPU swis (except PCPU 0, which is also default"); /* * TODO: * allocate more timeout table slots when table overflows. */ u_int callwheelsize, callwheelmask; /* * The callout cpu exec entities represent informations necessary for * describing the state of callouts currently running on the CPU and the ones * necessary for migrating callouts to the new callout cpu. In particular, * the first entry of the array cc_exec_entity holds informations for callout * running in SWI thread context, while the second one holds informations * for callout running directly from hardware interrupt context. * The cached informations are very important for deferring migration when * the migrating callout is already running. */ struct cc_exec { struct callout *cc_next; struct callout *cc_curr; #ifdef SMP void (*ce_migration_func)(void *); void *ce_migration_arg; int ce_migration_cpu; sbintime_t ce_migration_time; sbintime_t ce_migration_prec; #endif bool cc_cancel; bool cc_waiting; }; /* * There is one struct callout_cpu per cpu, holding all relevant * state for the callout processing thread on the individual CPU. 
*/ struct callout_cpu { struct mtx_padalign cc_lock; struct cc_exec cc_exec_entity[2]; struct callout *cc_callout; struct callout_list *cc_callwheel; struct callout_tailq cc_expireq; struct callout_slist cc_callfree; sbintime_t cc_firstevent; sbintime_t cc_lastscan; void *cc_cookie; u_int cc_bucket; char cc_ktr_event_name[20]; }; #define cc_exec_curr cc_exec_entity[0].cc_curr #define cc_exec_next cc_exec_entity[0].cc_next #define cc_exec_cancel cc_exec_entity[0].cc_cancel #define cc_exec_waiting cc_exec_entity[0].cc_waiting #define cc_exec_curr_dir cc_exec_entity[1].cc_curr #define cc_exec_next_dir cc_exec_entity[1].cc_next #define cc_exec_cancel_dir cc_exec_entity[1].cc_cancel #define cc_exec_waiting_dir cc_exec_entity[1].cc_waiting #ifdef SMP #define cc_migration_func cc_exec_entity[0].ce_migration_func #define cc_migration_arg cc_exec_entity[0].ce_migration_arg #define cc_migration_cpu cc_exec_entity[0].ce_migration_cpu #define cc_migration_time cc_exec_entity[0].ce_migration_time #define cc_migration_prec cc_exec_entity[0].ce_migration_prec #define cc_migration_func_dir cc_exec_entity[1].ce_migration_func #define cc_migration_arg_dir cc_exec_entity[1].ce_migration_arg #define cc_migration_cpu_dir cc_exec_entity[1].ce_migration_cpu #define cc_migration_time_dir cc_exec_entity[1].ce_migration_time #define cc_migration_prec_dir cc_exec_entity[1].ce_migration_prec struct callout_cpu cc_cpu[MAXCPU]; #define CPUBLOCK MAXCPU #define CC_CPU(cpu) (&cc_cpu[(cpu)]) #define CC_SELF() CC_CPU(PCPU_GET(cpuid)) #else struct callout_cpu cc_cpu; #define CC_CPU(cpu) &cc_cpu #define CC_SELF() &cc_cpu #endif #define CC_LOCK(cc) mtx_lock_spin(&(cc)->cc_lock) #define CC_UNLOCK(cc) mtx_unlock_spin(&(cc)->cc_lock) #define CC_LOCK_ASSERT(cc) mtx_assert(&(cc)->cc_lock, MA_OWNED) static int timeout_cpu; static void callout_cpu_init(struct callout_cpu *cc, int cpu); static void softclock_call_cc(struct callout *c, struct callout_cpu *cc, #ifdef CALLOUT_PROFILING int *mpcalls, int *lockcalls, int *gcalls, #endif int direct); static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures"); /** * Locked by cc_lock: * cc_curr - If a callout is in progress, it is cc_curr. * If cc_curr is non-NULL, threads waiting in * callout_drain() will be woken up as soon as the * relevant callout completes. * cc_cancel - Changing to 1 with both callout_lock and cc_lock held * guarantees that the current callout will not run. * The softclock() function sets this to 0 before it * drops callout_lock to acquire c_lock, and it calls * the handler only if curr_cancelled is still 0 after * cc_lock is successfully acquired. * cc_waiting - If a thread is waiting in callout_drain(), then * callout_wait is nonzero. Set only when * cc_curr is non-NULL. */ /* * Resets the execution entity tied to a specific callout cpu. */ static void cc_cce_cleanup(struct callout_cpu *cc, int direct) { cc->cc_exec_entity[direct].cc_curr = NULL; cc->cc_exec_entity[direct].cc_next = NULL; cc->cc_exec_entity[direct].cc_cancel = false; cc->cc_exec_entity[direct].cc_waiting = false; #ifdef SMP cc->cc_exec_entity[direct].ce_migration_cpu = CPUBLOCK; cc->cc_exec_entity[direct].ce_migration_time = 0; cc->cc_exec_entity[direct].ce_migration_prec = 0; cc->cc_exec_entity[direct].ce_migration_func = NULL; cc->cc_exec_entity[direct].ce_migration_arg = NULL; #endif } /* * Checks if migration is requested by a specific callout cpu. 
*/ static int cc_cce_migrating(struct callout_cpu *cc, int direct) { #ifdef SMP return (cc->cc_exec_entity[direct].ce_migration_cpu != CPUBLOCK); #else return (0); #endif } /* * Kernel low level callwheel initialization * called on cpu0 during kernel startup. */ static void callout_callwheel_init(void *dummy) { struct callout_cpu *cc; /* * Calculate the size of the callout wheel and the preallocated * timeout() structures. * XXX: Clip callout to result of previous function of maxusers * maximum 384. This is still huge, but acceptable. */ ncallout = imin(16 + maxproc + maxfiles, 18508); TUNABLE_INT_FETCH("kern.ncallout", &ncallout); /* * Calculate callout wheel size, should be next power of two higher * than 'ncallout'. */ callwheelsize = 1 << fls(ncallout); callwheelmask = callwheelsize - 1; /* * Fetch whether we're pinning the swi's or not. */ TUNABLE_INT_FETCH("kern.pin_default_swi", &pin_default_swi); TUNABLE_INT_FETCH("kern.pin_pcpu_swi", &pin_pcpu_swi); /* * Only cpu0 handles timeout(9) and receives a preallocation. * * XXX: Once all timeout(9) consumers are converted this can * be removed. */ timeout_cpu = PCPU_GET(cpuid); cc = CC_CPU(timeout_cpu); cc->cc_callout = malloc(ncallout * sizeof(struct callout), M_CALLOUT, M_WAITOK); callout_cpu_init(cc, timeout_cpu); } SYSINIT(callwheel_init, SI_SUB_CPU, SI_ORDER_ANY, callout_callwheel_init, NULL); /* * Initialize the per-cpu callout structures. */ static void callout_cpu_init(struct callout_cpu *cc, int cpu) { struct callout *c; int i; mtx_init(&cc->cc_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE); SLIST_INIT(&cc->cc_callfree); cc->cc_callwheel = malloc(sizeof(struct callout_list) * callwheelsize, M_CALLOUT, M_WAITOK); for (i = 0; i < callwheelsize; i++) LIST_INIT(&cc->cc_callwheel[i]); TAILQ_INIT(&cc->cc_expireq); cc->cc_firstevent = SBT_MAX; for (i = 0; i < 2; i++) cc_cce_cleanup(cc, i); snprintf(cc->cc_ktr_event_name, sizeof(cc->cc_ktr_event_name), "callwheel cpu %d", cpu); if (cc->cc_callout == NULL) /* Only cpu0 handles timeout(9) */ return; for (i = 0; i < ncallout; i++) { c = &cc->cc_callout[i]; callout_init(c, 0); c->c_flags = CALLOUT_LOCAL_ALLOC; SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle); } } #ifdef SMP /* * Switches the cpu tied to a specific callout. * The function expects a locked incoming callout cpu and returns with * locked outcoming callout cpu. */ static struct callout_cpu * callout_cpu_switch(struct callout *c, struct callout_cpu *cc, int new_cpu) { struct callout_cpu *new_cc; MPASS(c != NULL && cc != NULL); CC_LOCK_ASSERT(cc); /* * Avoid interrupts and preemption firing after the callout cpu * is blocked in order to avoid deadlocks as the new thread * may be willing to acquire the callout cpu lock. */ c->c_cpu = CPUBLOCK; spinlock_enter(); CC_UNLOCK(cc); new_cc = CC_CPU(new_cpu); CC_LOCK(new_cc); spinlock_exit(); c->c_cpu = new_cpu; return (new_cc); } #endif /* * Start standard softclock thread. 
*/ static void start_softclock(void *dummy) { struct callout_cpu *cc; char name[MAXCOMLEN]; #ifdef SMP int cpu; struct intr_event *ie; #endif cc = CC_CPU(timeout_cpu); snprintf(name, sizeof(name), "clock (%d)", timeout_cpu); if (swi_add(&clk_intr_event, name, softclock, cc, SWI_CLOCK, INTR_MPSAFE, &cc->cc_cookie)) panic("died while creating standard software ithreads"); if (pin_default_swi && (intr_event_bind(clk_intr_event, timeout_cpu) != 0)) { printf("%s: timeout clock couldn't be pinned to cpu %d\n", __func__, timeout_cpu); } #ifdef SMP CPU_FOREACH(cpu) { if (cpu == timeout_cpu) continue; cc = CC_CPU(cpu); cc->cc_callout = NULL; /* Only cpu0 handles timeout(9). */ callout_cpu_init(cc, cpu); snprintf(name, sizeof(name), "clock (%d)", cpu); ie = NULL; if (swi_add(&ie, name, softclock, cc, SWI_CLOCK, INTR_MPSAFE, &cc->cc_cookie)) panic("died while creating standard software ithreads"); if (pin_pcpu_swi && (intr_event_bind(ie, cpu) != 0)) { printf("%s: per-cpu clock couldn't be pinned to " "cpu %d\n", __func__, cpu); } } #endif } SYSINIT(start_softclock, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softclock, NULL); #define CC_HASH_SHIFT 8 static inline u_int callout_hash(sbintime_t sbt) { return (sbt >> (32 - CC_HASH_SHIFT)); } static inline u_int callout_get_bucket(sbintime_t sbt) { return (callout_hash(sbt) & callwheelmask); } void callout_process(sbintime_t now) { struct callout *tmp, *tmpn; struct callout_cpu *cc; struct callout_list *sc; sbintime_t first, last, max, tmp_max; uint32_t lookahead; u_int firstb, lastb, nowb; #ifdef CALLOUT_PROFILING int depth_dir = 0, mpcalls_dir = 0, lockcalls_dir = 0; #endif cc = CC_SELF(); mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET); /* Compute the buckets of the last scan and present times. */ firstb = callout_hash(cc->cc_lastscan); cc->cc_lastscan = now; nowb = callout_hash(now); /* Compute the last bucket and minimum time of the bucket after it. */ if (nowb == firstb) lookahead = (SBT_1S / 16); else if (nowb - firstb == 1) lookahead = (SBT_1S / 8); else lookahead = (SBT_1S / 2); first = last = now; first += (lookahead / 2); last += lookahead; last &= (0xffffffffffffffffLLU << (32 - CC_HASH_SHIFT)); lastb = callout_hash(last) - 1; max = last; /* * Check if we wrapped around the entire wheel from the last scan. * In case, we need to scan entirely the wheel for pending callouts. */ if (lastb - firstb >= callwheelsize) { lastb = firstb + callwheelsize - 1; if (nowb - firstb >= callwheelsize) nowb = lastb; } /* Iterate callwheel from firstb to nowb and then up to lastb. */ do { sc = &cc->cc_callwheel[firstb & callwheelmask]; tmp = LIST_FIRST(sc); while (tmp != NULL) { /* Run the callout if present time within allowed. */ if (tmp->c_time <= now) { /* * Consumer told us the callout may be run * directly from hardware interrupt context. */ if (tmp->c_flags & CALLOUT_DIRECT) { #ifdef CALLOUT_PROFILING ++depth_dir; #endif cc->cc_exec_next_dir = LIST_NEXT(tmp, c_links.le); cc->cc_bucket = firstb & callwheelmask; LIST_REMOVE(tmp, c_links.le); softclock_call_cc(tmp, cc, #ifdef CALLOUT_PROFILING &mpcalls_dir, &lockcalls_dir, NULL, #endif 1); tmp = cc->cc_exec_next_dir; } else { tmpn = LIST_NEXT(tmp, c_links.le); LIST_REMOVE(tmp, c_links.le); TAILQ_INSERT_TAIL(&cc->cc_expireq, tmp, c_links.tqe); tmp->c_flags |= CALLOUT_PROCESSED; tmp = tmpn; } continue; } /* Skip events from distant future. */ if (tmp->c_time >= max) goto next; /* * Event minimal time is bigger than present maximal * time, so it cannot be aggregated. 
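 * (i.e. this callout starts after the end of the current aggregation
 * window, so stop extending the scan past the present-time bucket)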
*/ if (tmp->c_time > last) { lastb = nowb; goto next; } /* Update first and last time, respecting this event. */ if (tmp->c_time < first) first = tmp->c_time; tmp_max = tmp->c_time + tmp->c_precision; if (tmp_max < last) last = tmp_max; next: tmp = LIST_NEXT(tmp, c_links.le); } /* Proceed with the next bucket. */ firstb++; /* * Stop if we looked after present time and found * some event we can't execute at now. * Stop if we looked far enough into the future. */ } while (((int)(firstb - lastb)) <= 0); cc->cc_firstevent = last; #ifndef NO_EVENTTIMERS cpu_new_callout(curcpu, last, first); #endif #ifdef CALLOUT_PROFILING avg_depth_dir += (depth_dir * 1000 - avg_depth_dir) >> 8; avg_mpcalls_dir += (mpcalls_dir * 1000 - avg_mpcalls_dir) >> 8; avg_lockcalls_dir += (lockcalls_dir * 1000 - avg_lockcalls_dir) >> 8; #endif mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET); /* * swi_sched acquires the thread lock, so we don't want to call it * with cc_lock held; incorrect locking order. */ if (!TAILQ_EMPTY(&cc->cc_expireq)) swi_sched(cc->cc_cookie, 0); } static struct callout_cpu * callout_lock(struct callout *c) { struct callout_cpu *cc; int cpu; for (;;) { cpu = c->c_cpu; #ifdef SMP if (cpu == CPUBLOCK) { while (c->c_cpu == CPUBLOCK) cpu_spinwait(); continue; } #endif cc = CC_CPU(cpu); CC_LOCK(cc); if (cpu == c->c_cpu) break; CC_UNLOCK(cc); } return (cc); } static void callout_cc_add(struct callout *c, struct callout_cpu *cc, sbintime_t sbt, sbintime_t precision, void (*func)(void *), void *arg, int cpu, int flags) { int bucket; CC_LOCK_ASSERT(cc); if (sbt < cc->cc_lastscan) sbt = cc->cc_lastscan; c->c_arg = arg; c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING); if (flags & C_DIRECT_EXEC) c->c_flags |= CALLOUT_DIRECT; c->c_flags &= ~CALLOUT_PROCESSED; c->c_func = func; c->c_time = sbt; c->c_precision = precision; bucket = callout_get_bucket(c->c_time); CTR3(KTR_CALLOUT, "precision set for %p: %d.%08x", c, (int)(c->c_precision >> 32), (u_int)(c->c_precision & 0xffffffff)); LIST_INSERT_HEAD(&cc->cc_callwheel[bucket], c, c_links.le); if (cc->cc_bucket == bucket) cc->cc_exec_next_dir = c; #ifndef NO_EVENTTIMERS /* * Inform the eventtimers(4) subsystem there's a new callout * that has been inserted, but only if really required. */ if (SBT_MAX - c->c_time < c->c_precision) c->c_precision = SBT_MAX - c->c_time; sbt = c->c_time + c->c_precision; if (sbt < cc->cc_firstevent) { cc->cc_firstevent = sbt; cpu_new_callout(cpu, sbt, c->c_time); } #endif } static void callout_cc_del(struct callout *c, struct callout_cpu *cc) { if ((c->c_flags & CALLOUT_LOCAL_ALLOC) == 0) return; c->c_func = NULL; SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle); } static void softclock_call_cc(struct callout *c, struct callout_cpu *cc, #ifdef CALLOUT_PROFILING int *mpcalls, int *lockcalls, int *gcalls, #endif int direct) { struct rm_priotracker tracker; void (*c_func)(void *); void *c_arg; struct lock_class *class; struct lock_object *c_lock; uintptr_t lock_status; int c_flags; #ifdef SMP struct callout_cpu *new_cc; void (*new_func)(void *); void *new_arg; int flags, new_cpu; sbintime_t new_prec, new_time; #endif #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) sbintime_t sbt1, sbt2; struct timespec ts2; static sbintime_t maxdt = 2 * SBT_1MS; /* 2 msec */ static timeout_t *lastfunc; #endif KASSERT((c->c_flags & (CALLOUT_PENDING | CALLOUT_ACTIVE)) == (CALLOUT_PENDING | CALLOUT_ACTIVE), ("softclock_call_cc: pend|act %p %x", c, c->c_flags)); class = (c->c_lock != NULL) ? 
LOCK_CLASS(c->c_lock) : NULL; lock_status = 0; if (c->c_flags & CALLOUT_SHAREDLOCK) { if (class == &lock_class_rm) lock_status = (uintptr_t)&tracker; else lock_status = 1; } c_lock = c->c_lock; c_func = c->c_func; c_arg = c->c_arg; c_flags = c->c_flags; if (c->c_flags & CALLOUT_LOCAL_ALLOC) c->c_flags = CALLOUT_LOCAL_ALLOC; else c->c_flags &= ~CALLOUT_PENDING; cc->cc_exec_entity[direct].cc_curr = c; cc->cc_exec_entity[direct].cc_cancel = false; CC_UNLOCK(cc); if (c_lock != NULL) { class->lc_lock(c_lock, lock_status); /* * The callout may have been cancelled * while we switched locks. */ if (cc->cc_exec_entity[direct].cc_cancel) { class->lc_unlock(c_lock); goto skip; } /* The callout cannot be stopped now. */ cc->cc_exec_entity[direct].cc_cancel = true; if (c_lock == &Giant.lock_object) { #ifdef CALLOUT_PROFILING (*gcalls)++; #endif CTR3(KTR_CALLOUT, "callout giant %p func %p arg %p", c, c_func, c_arg); } else { #ifdef CALLOUT_PROFILING (*lockcalls)++; #endif CTR3(KTR_CALLOUT, "callout lock %p func %p arg %p", c, c_func, c_arg); } } else { #ifdef CALLOUT_PROFILING (*mpcalls)++; #endif CTR3(KTR_CALLOUT, "callout %p func %p arg %p", c, c_func, c_arg); } KTR_STATE3(KTR_SCHED, "callout", cc->cc_ktr_event_name, "running", "func:%p", c_func, "arg:%p", c_arg, "direct:%d", direct); #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) sbt1 = sbinuptime(); #endif THREAD_NO_SLEEPING(); SDT_PROBE(callout_execute, kernel, , callout__start, c, 0, 0, 0, 0); c_func(c_arg); SDT_PROBE(callout_execute, kernel, , callout__end, c, 0, 0, 0, 0); THREAD_SLEEPING_OK(); #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) sbt2 = sbinuptime(); sbt2 -= sbt1; if (sbt2 > maxdt) { if (lastfunc != c_func || sbt2 > maxdt * 2) { ts2 = sbttots(sbt2); printf( "Expensive timeout(9) function: %p(%p) %jd.%09ld s\n", c_func, c_arg, (intmax_t)ts2.tv_sec, ts2.tv_nsec); } maxdt = sbt2; lastfunc = c_func; } #endif KTR_STATE0(KTR_SCHED, "callout", cc->cc_ktr_event_name, "idle"); CTR1(KTR_CALLOUT, "callout %p finished", c); if ((c_flags & CALLOUT_RETURNUNLOCKED) == 0) class->lc_unlock(c_lock); skip: CC_LOCK(cc); KASSERT(cc->cc_exec_entity[direct].cc_curr == c, ("mishandled cc_curr")); cc->cc_exec_entity[direct].cc_curr = NULL; if (cc->cc_exec_entity[direct].cc_waiting) { /* * There is someone waiting for the * callout to complete. * If the callout was scheduled for * migration just cancel it. */ if (cc_cce_migrating(cc, direct)) { cc_cce_cleanup(cc, direct); /* * It should be assert here that the callout is not * destroyed but that is not easy. */ c->c_flags &= ~CALLOUT_DFRMIGRATION; } cc->cc_exec_entity[direct].cc_waiting = false; CC_UNLOCK(cc); wakeup(&cc->cc_exec_entity[direct].cc_waiting); CC_LOCK(cc); } else if (cc_cce_migrating(cc, direct)) { KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0, ("Migrating legacy callout %p", c)); #ifdef SMP /* * If the callout was scheduled for * migration just perform it now. */ new_cpu = cc->cc_exec_entity[direct].ce_migration_cpu; new_time = cc->cc_exec_entity[direct].ce_migration_time; new_prec = cc->cc_exec_entity[direct].ce_migration_prec; new_func = cc->cc_exec_entity[direct].ce_migration_func; new_arg = cc->cc_exec_entity[direct].ce_migration_arg; cc_cce_cleanup(cc, direct); /* * It should be assert here that the callout is not destroyed * but that is not easy. * * As first thing, handle deferred callout stops. 
*/ if ((c->c_flags & CALLOUT_DFRMIGRATION) == 0) { CTR3(KTR_CALLOUT, "deferred cancelled %p func %p arg %p", c, new_func, new_arg); callout_cc_del(c, cc); return; } c->c_flags &= ~CALLOUT_DFRMIGRATION; new_cc = callout_cpu_switch(c, cc, new_cpu); flags = (direct) ? C_DIRECT_EXEC : 0; callout_cc_add(c, new_cc, new_time, new_prec, new_func, new_arg, new_cpu, flags); CC_UNLOCK(new_cc); CC_LOCK(cc); #else panic("migration should not happen"); #endif } /* * If the current callout is locally allocated (from * timeout(9)) then put it on the freelist. * * Note: we need to check the cached copy of c_flags because * if it was not local, then it's not safe to deref the * callout pointer. */ KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0 || c->c_flags == CALLOUT_LOCAL_ALLOC, ("corrupted callout")); if (c_flags & CALLOUT_LOCAL_ALLOC) callout_cc_del(c, cc); } /* * The callout mechanism is based on the work of Adam M. Costello and * George Varghese, published in a technical report entitled "Redesigning * the BSD Callout and Timer Facilities" and modified slightly for inclusion * in FreeBSD by Justin T. Gibbs. The original work on the data structures * used in this implementation was published by G. Varghese and T. Lauck in * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for * the Efficient Implementation of a Timer Facility" in the Proceedings of * the 11th ACM Annual Symposium on Operating Systems Principles, * Austin, Texas Nov 1987. */ /* * Software (low priority) clock interrupt. * Run periodic events from timeout queue. */ void softclock(void *arg) { struct callout_cpu *cc; struct callout *c; #ifdef CALLOUT_PROFILING int depth = 0, gcalls = 0, lockcalls = 0, mpcalls = 0; #endif cc = (struct callout_cpu *)arg; CC_LOCK(cc); while ((c = TAILQ_FIRST(&cc->cc_expireq)) != NULL) { TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); softclock_call_cc(c, cc, #ifdef CALLOUT_PROFILING &mpcalls, &lockcalls, &gcalls, #endif 0); #ifdef CALLOUT_PROFILING ++depth; #endif } #ifdef CALLOUT_PROFILING avg_depth += (depth * 1000 - avg_depth) >> 8; avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8; avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8; avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8; #endif CC_UNLOCK(cc); } /* * timeout -- * Execute a function after a specified length of time. * * untimeout -- * Cancel previous timeout function call. * * callout_handle_init -- * Initialize a handle so that using it with untimeout is benign. * * See AT&T BCI Driver Reference Manual for specification. This * implementation differs from that one in that although an * identification value is returned from timeout, the original * arguments to timeout as well as the identifier are used to * identify entries for untimeout. */ struct callout_handle timeout(timeout_t *ftn, void *arg, int to_ticks) { struct callout_cpu *cc; struct callout *new; struct callout_handle handle; cc = CC_CPU(timeout_cpu); CC_LOCK(cc); /* Fill in the next free callout structure. */ new = SLIST_FIRST(&cc->cc_callfree); if (new == NULL) /* XXX Attempt to malloc first */ panic("timeout table full"); SLIST_REMOVE_HEAD(&cc->cc_callfree, c_links.sle); callout_reset(new, to_ticks, ftn, arg); handle.callout = new; CC_UNLOCK(cc); return (handle); } void untimeout(timeout_t *ftn, void *arg, struct callout_handle handle) { struct callout_cpu *cc; /* * Check for a handle that was initialized * by callout_handle_init, but never used * for a real timeout. 
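/*
 * Example sketch (illustration only, not part of this change): the legacy
 * timeout(9) consumer pattern serviced by timeout()/untimeout() above.
 * The "demo" names are hypothetical; <sys/systm.h> and <sys/callout.h>
 * are assumed to be included.
 */
static struct callout_handle demo_handle;

static void
demo_expire(void *arg)
{
	/* Runs from softclock() roughly hz ticks after being armed. */
}

static void
demo_arm(void *arg)
{
	callout_handle_init(&demo_handle);	/* makes untimeout() benign */
	demo_handle = timeout(demo_expire, arg, hz);
}

static void
demo_disarm(void *arg)
{
	/* Both the function and the argument identify the entry to cancel. */
	untimeout(demo_expire, arg, demo_handle);
}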
*/ if (handle.callout == NULL) return; cc = callout_lock(handle.callout); if (handle.callout->c_func == ftn && handle.callout->c_arg == arg) callout_stop(handle.callout); CC_UNLOCK(cc); } void callout_handle_init(struct callout_handle *handle) { handle->callout = NULL; } /* * New interface; clients allocate their own callout structures. * * callout_reset() - establish or change a timeout * callout_stop() - disestablish a timeout * callout_init() - initialize a callout structure so that it can * safely be passed to callout_reset() and callout_stop() * * defines three convenience macros: * * callout_active() - returns truth if callout has not been stopped, * drained, or deactivated since the last time the callout was * reset. * callout_pending() - returns truth if callout is still waiting for timeout * callout_deactivate() - marks the callout as having been serviced */ int callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision, void (*ftn)(void *), void *arg, int cpu, int flags) { sbintime_t to_sbt, pr; struct callout_cpu *cc; int cancelled, direct; cancelled = 0; if (flags & C_ABSOLUTE) { to_sbt = sbt; } else { if ((flags & C_HARDCLOCK) && (sbt < tick_sbt)) sbt = tick_sbt; if ((flags & C_HARDCLOCK) || #ifdef NO_EVENTTIMERS sbt >= sbt_timethreshold) { to_sbt = getsbinuptime(); /* Add safety belt for the case of hz > 1000. */ to_sbt += tc_tick_sbt - tick_sbt; #else sbt >= sbt_tickthreshold) { /* * Obtain the time of the last hardclock() call on * this CPU directly from the kern_clocksource.c. * This value is per-CPU, but it is equal for all * active ones. */ #ifdef __LP64__ to_sbt = DPCPU_GET(hardclocktime); #else spinlock_enter(); to_sbt = DPCPU_GET(hardclocktime); spinlock_exit(); #endif #endif if ((flags & C_HARDCLOCK) == 0) to_sbt += tick_sbt; } else to_sbt = sbinuptime(); if (SBT_MAX - to_sbt < sbt) to_sbt = SBT_MAX; else to_sbt += sbt; pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp : sbt >> C_PRELGET(flags)); if (pr > precision) precision = pr; } /* * Don't allow migration of pre-allocated callouts lest they * become unbalanced. */ if (c->c_flags & CALLOUT_LOCAL_ALLOC) cpu = c->c_cpu; direct = (c->c_flags & CALLOUT_DIRECT) != 0; KASSERT(!direct || c->c_lock == NULL, ("%s: direct callout %p has lock", __func__, c)); cc = callout_lock(c); if (cc->cc_exec_entity[direct].cc_curr == c) { /* * We're being asked to reschedule a callout which is * currently in progress. If there is a lock then we * can cancel the callout if it has not really started. */ if (c->c_lock != NULL && !cc->cc_exec_entity[direct].cc_cancel) cancelled = cc->cc_exec_entity[direct].cc_cancel = true; if (cc->cc_exec_entity[direct].cc_waiting) { /* * Someone has called callout_drain to kill this * callout. Don't reschedule. */ CTR4(KTR_CALLOUT, "%s %p func %p arg %p", cancelled ? "cancelled" : "failed to cancel", c, c->c_func, c->c_arg); CC_UNLOCK(cc); return (cancelled); } } if (c->c_flags & CALLOUT_PENDING) { if ((c->c_flags & CALLOUT_PROCESSED) == 0) { if (cc->cc_exec_next_dir == c) cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le); LIST_REMOVE(c, c_links.le); } else TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); cancelled = 1; c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); } #ifdef SMP /* * If the callout must migrate try to perform it immediately. * If the callout is currently running, just defer the migration * to a more appropriate moment. 
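/*
 * Example sketch (illustration only, not part of this change): the usual
 * consumer-side pattern for the interface described above.  All "demo"
 * names are hypothetical; callout_init_mtx(), the conventional wrapper
 * around _callout_init_lock() for mutex-protected consumers, and the
 * headers <sys/lock.h>, <sys/mutex.h> and <sys/callout.h> are assumed.
 */
struct demo_softc {
	struct mtx	d_mtx;
	struct callout	d_tick;
	int		d_stopping;
};

static void
demo_tick(void *arg)
{
	struct demo_softc *sc = arg;

	/* d_mtx is held on entry because it was handed to callout_init_mtx(). */
	if (sc->d_stopping == 0)
		callout_reset(&sc->d_tick, hz, demo_tick, sc);	/* re-arm */
}

static void
demo_start(struct demo_softc *sc)
{
	mtx_init(&sc->d_mtx, "demo", NULL, MTX_DEF);
	callout_init_mtx(&sc->d_tick, &sc->d_mtx, 0);
	mtx_lock(&sc->d_mtx);
	callout_reset(&sc->d_tick, hz, demo_tick, sc);	/* first shot in ~1 s */
	mtx_unlock(&sc->d_mtx);
}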
*/ if (c->c_cpu != cpu) { if (cc->cc_exec_entity[direct].cc_curr == c) { cc->cc_exec_entity[direct].ce_migration_cpu = cpu; cc->cc_exec_entity[direct].ce_migration_time = to_sbt; cc->cc_exec_entity[direct].ce_migration_prec = precision; cc->cc_exec_entity[direct].ce_migration_func = ftn; cc->cc_exec_entity[direct].ce_migration_arg = arg; c->c_flags |= CALLOUT_DFRMIGRATION; CTR6(KTR_CALLOUT, "migration of %p func %p arg %p in %d.%08x to %u deferred", c, c->c_func, c->c_arg, (int)(to_sbt >> 32), (u_int)(to_sbt & 0xffffffff), cpu); CC_UNLOCK(cc); return (cancelled); } cc = callout_cpu_switch(c, cc, cpu); } #endif callout_cc_add(c, cc, to_sbt, precision, ftn, arg, cpu, flags); CTR6(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d.%08x", cancelled ? "re" : "", c, c->c_func, c->c_arg, (int)(to_sbt >> 32), (u_int)(to_sbt & 0xffffffff)); CC_UNLOCK(cc); return (cancelled); } /* * Common idioms that can be optimized in the future. */ int callout_schedule_on(struct callout *c, int to_ticks, int cpu) { return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, cpu); } int callout_schedule(struct callout *c, int to_ticks) { return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, c->c_cpu); } int _callout_stop_safe(struct callout *c, int safe) { struct callout_cpu *cc, *old_cc; struct lock_class *class; int direct, sq_locked, use_lock; + if (safe) + WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, c->c_lock, + "calling %s", __func__); + /* * Some old subsystems don't hold Giant while running a callout_stop(), * so just discard this check for the moment. */ if (!safe && c->c_lock != NULL) { if (c->c_lock == &Giant.lock_object) use_lock = mtx_owned(&Giant); else { use_lock = 1; class = LOCK_CLASS(c->c_lock); class->lc_assert(c->c_lock, LA_XLOCKED); } } else use_lock = 0; direct = (c->c_flags & CALLOUT_DIRECT) != 0; sq_locked = 0; old_cc = NULL; again: cc = callout_lock(c); /* * If the callout was migrating while the callout cpu lock was * dropped, just drop the sleepqueue lock and check the states * again. */ if (sq_locked != 0 && cc != old_cc) { #ifdef SMP CC_UNLOCK(cc); sleepq_release(&old_cc->cc_exec_entity[direct].cc_waiting); sq_locked = 0; old_cc = NULL; goto again; #else panic("migration should not happen"); #endif } /* * If the callout isn't pending, it's not on the queue, so * don't attempt to remove it from the queue. We can try to * stop it by other means however. */ if (!(c->c_flags & CALLOUT_PENDING)) { c->c_flags &= ~CALLOUT_ACTIVE; /* * If it wasn't on the queue and it isn't the current * callout, then we can't stop it, so just bail. */ if (cc->cc_exec_entity[direct].cc_curr != c) { CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", c, c->c_func, c->c_arg); CC_UNLOCK(cc); if (sq_locked) sleepq_release( &cc->cc_exec_entity[direct].cc_waiting); return (0); } if (safe) { /* * The current callout is running (or just * about to run) and blocking is allowed, so * just wait for the current invocation to * finish. */ while (cc->cc_exec_entity[direct].cc_curr == c) { /* * Use direct calls to sleepqueue interface * instead of cv/msleep in order to avoid * a LOR between cc_lock and sleepqueue * chain spinlocks. This piece of code * emulates a msleep_spin() call actually. * * If we already have the sleepqueue chain * locked, then we can safely block. If we * don't already have it locked, however, * we have to drop the cc_lock to lock * it. This opens several races, so we * restart at the beginning once we have * both locks. If nothing has changed, then * we will end up back here with sq_locked * set. 
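/*
 * Example sketch (illustration only, not part of this change): rearming a
 * callout through the sbintime_t interface above rather than the
 * tick-based callout_reset().  With flags == 0 the time is relative, and
 * the effective precision is at least the requested slack (it may be
 * widened according to tc_precexp).  Names are hypothetical.
 */
static void
demo_rearm_50ms(struct callout *c, void (*fn)(void *), void *arg)
{

	/* ~50 ms from now, allow ~1 ms of coalescing, target this CPU. */
	callout_reset_sbt_on(c, 50 * SBT_1MS, SBT_1MS, fn, arg,
	    PCPU_GET(cpuid), 0);
}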
*/ if (!sq_locked) { CC_UNLOCK(cc); sleepq_lock( &cc->cc_exec_entity[direct].cc_waiting); sq_locked = 1; old_cc = cc; goto again; } /* * Migration could be cancelled here, but * as long as it is still not sure when it * will be packed up, just let softclock() * take care of it. */ cc->cc_exec_entity[direct].cc_waiting = true; DROP_GIANT(); CC_UNLOCK(cc); sleepq_add( &cc->cc_exec_entity[direct].cc_waiting, &cc->cc_lock.lock_object, "codrain", SLEEPQ_SLEEP, 0); sleepq_wait( &cc->cc_exec_entity[direct].cc_waiting, 0); sq_locked = 0; old_cc = NULL; /* Reacquire locks previously released. */ PICKUP_GIANT(); CC_LOCK(cc); } } else if (use_lock && !cc->cc_exec_entity[direct].cc_cancel) { /* * The current callout is waiting for its * lock which we hold. Cancel the callout * and return. After our caller drops the * lock, the callout will be skipped in * softclock(). */ cc->cc_exec_entity[direct].cc_cancel = true; CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); KASSERT(!cc_cce_migrating(cc, direct), ("callout wrongly scheduled for migration")); CC_UNLOCK(cc); KASSERT(!sq_locked, ("sleepqueue chain locked")); return (1); } else if ((c->c_flags & CALLOUT_DFRMIGRATION) != 0) { c->c_flags &= ~CALLOUT_DFRMIGRATION; CTR3(KTR_CALLOUT, "postponing stop %p func %p arg %p", c, c->c_func, c->c_arg); CC_UNLOCK(cc); return (1); } CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", c, c->c_func, c->c_arg); CC_UNLOCK(cc); KASSERT(!sq_locked, ("sleepqueue chain still locked")); return (0); } if (sq_locked) sleepq_release(&cc->cc_exec_entity[direct].cc_waiting); c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); if ((c->c_flags & CALLOUT_PROCESSED) == 0) { if (cc->cc_exec_next_dir == c) cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le); LIST_REMOVE(c, c_links.le); } else TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); callout_cc_del(c, cc); CC_UNLOCK(cc); return (1); } void callout_init(struct callout *c, int mpsafe) { bzero(c, sizeof *c); if (mpsafe) { c->c_lock = NULL; c->c_flags = CALLOUT_RETURNUNLOCKED; } else { c->c_lock = &Giant.lock_object; c->c_flags = 0; } c->c_cpu = timeout_cpu; } void _callout_init_lock(struct callout *c, struct lock_object *lock, int flags) { bzero(c, sizeof *c); c->c_lock = lock; KASSERT((flags & ~(CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK)) == 0, ("callout_init_lock: bad flags %d", flags)); KASSERT(lock != NULL || (flags & CALLOUT_RETURNUNLOCKED) == 0, ("callout_init_lock: CALLOUT_RETURNUNLOCKED with no lock")); KASSERT(lock == NULL || !(LOCK_CLASS(lock)->lc_flags & (LC_SPINLOCK | LC_SLEEPABLE)), ("%s: invalid lock class", __func__)); c->c_flags = flags & (CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK); c->c_cpu = timeout_cpu; } #ifdef APM_FIXUP_CALLTODO /* * Adjust the kernel calltodo timeout list. This routine is used after * an APM resume to recalculate the calltodo timer list values with the * number of hz's we have been sleeping. The next hardclock() will detect * that there are fired timers and run softclock() to execute them. * * Please note, I have not done an exhaustive analysis of what code this * might break. I am motivated to have my select()'s and alarm()'s that * have expired during suspend firing upon resume so that the applications * which set the timer can do the maintanence the timer was for as close * as possible to the originally intended time. 
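/*
 * Example sketch (illustration only, not part of this change), continuing
 * the hypothetical demo_softc above: tearing the callout down.
 * callout_stop() only cancels a pending callout and never sleeps, while
 * callout_drain() takes the "safe" path above and may sleep until a
 * running handler returns, which is what the WITNESS_WARN() added by this
 * change asserts is permitted.
 */
static void
demo_stop(struct demo_softc *sc)
{

	mtx_lock(&sc->d_mtx);
	sc->d_stopping = 1;
	callout_stop(&sc->d_tick);	/* cancel if merely pending */
	mtx_unlock(&sc->d_mtx);

	callout_drain(&sc->d_tick);	/* wait out a running instance */
	mtx_destroy(&sc->d_mtx);
}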
Testing this code for a * week showed that resuming from a suspend resulted in 22 to 25 timers * firing, which seemed independant on whether the suspend was 2 hours or * 2 days. Your milage may vary. - Ken Key */ void adjust_timeout_calltodo(struct timeval *time_change) { register struct callout *p; unsigned long delta_ticks; /* * How many ticks were we asleep? * (stolen from tvtohz()). */ /* Don't do anything */ if (time_change->tv_sec < 0) return; else if (time_change->tv_sec <= LONG_MAX / 1000000) delta_ticks = (time_change->tv_sec * 1000000 + time_change->tv_usec + (tick - 1)) / tick + 1; else if (time_change->tv_sec <= LONG_MAX / hz) delta_ticks = time_change->tv_sec * hz + (time_change->tv_usec + (tick - 1)) / tick + 1; else delta_ticks = LONG_MAX; if (delta_ticks > INT_MAX) delta_ticks = INT_MAX; /* * Now rip through the timer calltodo list looking for timers * to expire. */ /* don't collide with softclock() */ CC_LOCK(cc); for (p = calltodo.c_next; p != NULL; p = p->c_next) { p->c_time -= delta_ticks; /* Break if the timer had more time on it than delta_ticks */ if (p->c_time > 0) break; /* take back the ticks the timer didn't use (p->c_time <= 0) */ delta_ticks = -p->c_time; } CC_UNLOCK(cc); return; } #endif /* APM_FIXUP_CALLTODO */ static int flssbt(sbintime_t sbt) { sbt += (uint64_t)sbt >> 1; if (sizeof(long) >= sizeof(sbintime_t)) return (flsl(sbt)); if (sbt >= SBT_1S) return (flsl(((uint64_t)sbt) >> 32) + 32); return (flsl(sbt)); } /* * Dump immediate statistic snapshot of the scheduled callouts. */ static int sysctl_kern_callout_stat(SYSCTL_HANDLER_ARGS) { struct callout *tmp; struct callout_cpu *cc; struct callout_list *sc; sbintime_t maxpr, maxt, medpr, medt, now, spr, st, t; int ct[64], cpr[64], ccpbk[32]; int error, val, i, count, tcum, pcum, maxc, c, medc; #ifdef SMP int cpu; #endif val = 0; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); count = maxc = 0; st = spr = maxt = maxpr = 0; bzero(ccpbk, sizeof(ccpbk)); bzero(ct, sizeof(ct)); bzero(cpr, sizeof(cpr)); now = sbinuptime(); #ifdef SMP CPU_FOREACH(cpu) { cc = CC_CPU(cpu); #else cc = CC_CPU(timeout_cpu); #endif CC_LOCK(cc); for (i = 0; i < callwheelsize; i++) { sc = &cc->cc_callwheel[i]; c = 0; LIST_FOREACH(tmp, sc, c_links.le) { c++; t = tmp->c_time - now; if (t < 0) t = 0; st += t / SBT_1US; spr += tmp->c_precision / SBT_1US; if (t > maxt) maxt = t; if (tmp->c_precision > maxpr) maxpr = tmp->c_precision; ct[flssbt(t)]++; cpr[flssbt(tmp->c_precision)]++; } if (c > maxc) maxc = c; ccpbk[fls(c + c / 2)]++; count += c; } CC_UNLOCK(cc); #ifdef SMP } #endif for (i = 0, tcum = 0; i < 64 && tcum < count / 2; i++) tcum += ct[i]; medt = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0; for (i = 0, pcum = 0; i < 64 && pcum < count / 2; i++) pcum += cpr[i]; medpr = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0; for (i = 0, c = 0; i < 32 && c < count / 2; i++) c += ccpbk[i]; medc = (i >= 2) ? 
(1 << (i - 2)) : 0; printf("Scheduled callouts statistic snapshot:\n"); printf(" Callouts: %6d Buckets: %6d*%-3d Bucket size: 0.%06ds\n", count, callwheelsize, mp_ncpus, 1000000 >> CC_HASH_SHIFT); printf(" C/Bk: med %5d avg %6d.%06jd max %6d\n", medc, count / callwheelsize / mp_ncpus, (uint64_t)count * 1000000 / callwheelsize / mp_ncpus % 1000000, maxc); printf(" Time: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n", medt / SBT_1S, (medt & 0xffffffff) * 1000000 >> 32, (st / count) / 1000000, (st / count) % 1000000, maxt / SBT_1S, (maxt & 0xffffffff) * 1000000 >> 32); printf(" Prec: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n", medpr / SBT_1S, (medpr & 0xffffffff) * 1000000 >> 32, (spr / count) / 1000000, (spr / count) % 1000000, maxpr / SBT_1S, (maxpr & 0xffffffff) * 1000000 >> 32); printf(" Distribution: \tbuckets\t time\t tcum\t" " prec\t pcum\n"); for (i = 0, tcum = pcum = 0; i < 64; i++) { if (ct[i] == 0 && cpr[i] == 0) continue; t = (i != 0) ? (((sbintime_t)1) << (i - 1)) : 0; tcum += ct[i]; pcum += cpr[i]; printf(" %10jd.%06jds\t 2**%d\t%7d\t%7d\t%7d\t%7d\n", t / SBT_1S, (t & 0xffffffff) * 1000000 >> 32, i - 1 - (32 - CC_HASH_SHIFT), ct[i], tcum, cpr[i], pcum); } return (error); } SYSCTL_PROC(_kern, OID_AUTO, callout_stat, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_callout_stat, "I", "Dump immediate statistic snapshot of the scheduled callouts"); Index: projects/building-blocks/sys/kern/vfs_subr.c =================================================================== --- projects/building-blocks/sys/kern/vfs_subr.c (revision 277723) +++ projects/building-blocks/sys/kern/vfs_subr.c (revision 277724) @@ -1,4878 +1,4878 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 */ /* * External virtual filesystem routines */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif static void delmntque(struct vnode *vp); static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo); static void syncer_shutdown(void *arg, int howto); static int vtryrecycle(struct vnode *vp); static void v_incr_usecount(struct vnode *); static void v_decr_usecount(struct vnode *); static void v_decr_useonly(struct vnode *); static void v_upgrade_usecount(struct vnode *); static void vnlru_free(int); static void vgonel(struct vnode *); static void vfs_knllock(void *arg); static void vfs_knlunlock(void *arg); static void vfs_knl_assert_locked(void *arg); static void vfs_knl_assert_unlocked(void *arg); static void destroy_vpollinfo(struct vpollinfo *vi); /* * Number of vnodes in existence. Increased whenever getnewvnode() * allocates a new vnode, decreased in vdropl() for VI_DOOMED vnode. */ static unsigned long numvnodes; SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "Number of vnodes in existence"); /* * Conversion tables for conversion from vnode types to inode formats * and back. */ enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, }; int vttoif_tab[10] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT }; /* * List of vnodes that are ready for recycling. */ static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* * Free vnode target. Free vnodes may simply be files which have been stat'd * but not read. This is somewhat common, and a small cache of such files * should be kept to avoid recreation costs. */ static u_long wantfreevnodes; SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); /* Number of vnodes in the free list. */ static u_long freevnodes; SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "Number of vnodes in the free list"); static int vlru_allow_cache_src; SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW, &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode"); /* * Various variables used for debugging the new implementation of * reassignbuf(). * XXX these are probably of (very) limited utility now. */ static int reassignbufcalls; SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "Number of calls to reassignbuf"); /* * Cache for the mount type id assigned to NFS. This is used for * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c. 
*/ int nfs_mount_type = -1; /* To keep more than one thread at a time from running vfs_getnewfsid */ static struct mtx mntid_mtx; /* * Lock for any access to the following: * vnode_free_list * numvnodes * freevnodes */ static struct mtx vnode_free_list_mtx; /* Publicly exported FS */ struct nfs_public nfs_pub; static uma_zone_t buf_trie_zone; /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ static uma_zone_t vnode_zone; static uma_zone_t vnodepoll_zone; /* * The workitem queue. * * It is useful to delay writes of file data and filesystem metadata * for tens of seconds so that quickly created and deleted files need * not waste disk bandwidth being created and removed. To realize this, * we append vnodes to a "workitem" queue. When running with a soft * updates implementation, most pending metadata dependencies should * not wait for more than a few seconds. Thus, mounted on block devices * are delayed only about a half the time that file data is delayed. * Similarly, directory updates are more critical, so are only delayed * about a third the time that file data is delayed. Thus, there are * SYNCER_MAXDELAY queues that are processed round-robin at a rate of * one each second (driven off the filesystem syncer process). The * syncer_delayno variable indicates the next queue that is to be processed. * Items that need to be processed soon are placed in this queue: * * syncer_workitem_pending[syncer_delayno] * * A delay of fifteen seconds is done by placing the request fifteen * entries later in the queue: * * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] * */ static int syncer_delayno; static long syncer_mask; LIST_HEAD(synclist, bufobj); static struct synclist *syncer_workitem_pending; /* * The sync_mtx protects: * bo->bo_synclist * sync_vnode_count * syncer_delayno * syncer_state * syncer_workitem_pending * syncer_worklist_len * rushjob */ static struct mtx sync_mtx; static struct cv sync_wakeup; #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ static int syncdelay = 30; /* max time to delay syncing data */ static int filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "Time to delay syncing files (in seconds)"); static int dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "Time to delay syncing directories (in seconds)"); static int metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "Time to delay syncing metadata (in seconds)"); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "Number of times I/O speeded up (rush requests)"); /* * When shutting down the syncer, run it at four times normal speed. */ #define SYNCER_SHUTDOWN_SPEEDUP 4 static int sync_vnode_count; static int syncer_worklist_len; static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } syncer_state; /* * Number of vnodes we want to exist at any one time. This is mostly used * to size hash tables in vnode-related code. It is normally not used in * getnewvnode(), as wantfreevnodes is normally nonzero.) * * XXX desiredvnodes is historical cruft and should not exist. 
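/*
 * Worked example (illustration only): hashinit(SYNCER_MAXDELAY, ...)
 * yields a 32-entry ring with syncer_mask == 31.  An item given a
 * 15-second delay while syncer_delayno == 20 therefore lands in slot
 * (20 + 15) & 31 == 3, wrapping around the ring, and is visited by the
 * syncer 15 one-second passes later.
 */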
*/ int desiredvnodes; SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "Maximum number of vnodes"); SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, &wantfreevnodes, 0, "Minimum number of vnodes (legacy)"); static int vnlru_nowhere; SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */ static int vnsz2log; /* * Support for the bufobj clean & dirty pctrie. */ static void * buf_trie_alloc(struct pctrie *ptree) { return uma_zalloc(buf_trie_zone, M_NOWAIT); } static void buf_trie_free(struct pctrie *ptree, void *node) { uma_zfree(buf_trie_zone, node); } PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free); /* * Initialize the vnode management data structures. * * Reevaluate the following cap on the number of vnodes after the physical * memory size exceeds 512GB. In the limit, as the physical memory size * grows, the ratio of physical pages to vnodes approaches sixteen to one. */ #ifndef MAXVNODES_MAX #define MAXVNODES_MAX (512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16)) #endif static void vntblinit(void *dummy __unused) { u_int i; int physvnodes, virtvnodes; /* * Desiredvnodes is a function of the physical memory size and the * kernel's heap size. Generally speaking, it scales with the * physical memory size. The ratio of desiredvnodes to physical pages * is one to four until desiredvnodes exceeds 98,304. Thereafter, the * marginal ratio of desiredvnodes to physical pages is one to * sixteen. However, desiredvnodes is limited by the kernel's heap * size. The memory required by desiredvnodes vnodes and vm objects * may not exceed one seventh of the kernel's heap size. */ physvnodes = maxproc + vm_cnt.v_page_count / 16 + 3 * min(98304 * 4, vm_cnt.v_page_count) / 16; virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) + sizeof(struct vnode))); desiredvnodes = min(physvnodes, virtvnodes); if (desiredvnodes > MAXVNODES_MAX) { if (bootverbose) printf("Reducing kern.maxvnodes %d -> %d\n", desiredvnodes, MAXVNODES_MAX); desiredvnodes = MAXVNODES_MAX; } wantfreevnodes = desiredvnodes / 4; mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); TAILQ_INIT(&vnode_free_list); mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF); vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* * Preallocate enough nodes to support one-per buf so that * we can not fail an insert. reassignbuf() callers can not * tolerate the insertion failure. */ buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(), NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); uma_prealloc(buf_trie_zone, nbuf); /* * Initialize the filesystem syncer. */ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); cv_init(&sync_wakeup, "syncer"); for (i = 1; i <= sizeof(struct vnode); i <<= 1) vnsz2log++; vnsz2log--; } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); /* * Mark a mount point as busy. Used to synchronize access and to delay * unmounting. Eventually, mountlist_mtx is not released on failure. * * vfs_busy() is a custom lock, it can block the caller. * vfs_busy() only sleeps if the unmount is active on the mount point. 
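/*
 * Worked example (illustration only, assuming 4 KiB pages): on a machine
 * with 1 GiB of RAM, vm_cnt.v_page_count is roughly 262144, so
 * physvnodes ~= maxproc + 262144 / 16 + 3 * 262144 / 16, i.e. about
 * maxproc + 65536 (the 98304 * 4 clamp only starts to matter above
 * roughly 1.5 GiB).  desiredvnodes is then the smaller of that figure
 * and the kernel-heap-derived virtvnodes.
 */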
* For a mountpoint mp, vfs_busy-enforced lock is before lock of any * vnode belonging to mp. * * Lookup uses vfs_busy() to traverse mount points. * root fs var fs * / vnode lock A / vnode lock (/var) D * /var vnode lock B /log vnode lock(/var/log) E * vfs_busy lock C vfs_busy lock F * * Within each file system, the lock order is C->A->B and F->D->E. * * When traversing across mounts, the system follows that lock order: * * C->A->B * | * +->F->D->E * * The lookup() process for namei("/var") illustrates the process: * VOP_LOOKUP() obtains B while A is held * vfs_busy() obtains a shared lock on F while A and B are held * vput() releases lock on B * vput() releases lock on A * VFS_ROOT() obtains lock on D while shared lock on F is held * vfs_unbusy() releases shared lock on F * vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. * Attempt to lock A (instead of vp_crossmp) while D is held would * violate the global order, causing deadlocks. * * dounmount() locks B while F is drained. */ int vfs_busy(struct mount *mp, int flags) { MPASS((flags & ~MBF_MASK) == 0); CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags); MNT_ILOCK(mp); MNT_REF(mp); /* * If mount point is currenly being unmounted, sleep until the * mount point fate is decided. If thread doing the unmounting fails, * it will clear MNTK_UNMOUNT flag before waking us up, indicating * that this mount point has survived the unmount attempt and vfs_busy * should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE * flag in addition to MNTK_UNMOUNT, indicating that mount point is * about to be really destroyed. vfs_busy needs to release its * reference on the mount point in this case and return with ENOENT, * telling the caller that mount mount it tried to busy is no longer * valid. */ while (mp->mnt_kern_flag & MNTK_UNMOUNT) { if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) { MNT_REL(mp); MNT_IUNLOCK(mp); CTR1(KTR_VFS, "%s: failed busying before sleeping", __func__); return (ENOENT); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_kern_flag |= MNTK_MWAIT; msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0); if (flags & MBF_MNTLSTLOCK) mtx_lock(&mountlist_mtx); MNT_ILOCK(mp); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_lockref++; MNT_IUNLOCK(mp); return (0); } /* * Free a busy filesystem. */ void vfs_unbusy(struct mount *mp) { CTR2(KTR_VFS, "%s: mp %p", __func__, mp); MNT_ILOCK(mp); MNT_REL(mp); KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref")); mp->mnt_lockref--; if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); CTR1(KTR_VFS, "%s: waking up waiters", __func__); mp->mnt_kern_flag &= ~MNTK_DRAINING; wakeup(&mp->mnt_lockref); } MNT_IUNLOCK(mp); } /* * Lookup a mount point by filesystem identifier. */ struct mount * vfs_getvfs(fsid_t *fsid) { struct mount *mp; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { vfs_ref(mp); mtx_unlock(&mountlist_mtx); return (mp); } } mtx_unlock(&mountlist_mtx); CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); return ((struct mount *) 0); } /* * Lookup a mount point by filesystem identifier, busying it before * returning. * * To avoid congestion on mountlist_mtx, implement simple direct-mapped * cache for popular filesystem identifiers. 
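/*
 * Example sketch (illustration only, not part of this change): the
 * typical caller-side pattern for vfs_busy()/vfs_unbusy() described
 * above.  The function name is hypothetical.
 */
static int
demo_with_busy_mount(struct mount *mp)
{
	int error;

	error = vfs_busy(mp, 0);	/* may sleep across an unmount attempt */
	if (error != 0)
		return (error);		/* ENOENT: the mount did not survive */
	/* ... mp may be used here; dounmount() is held off ... */
	vfs_unbusy(mp);
	return (0);
}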
The cache is lockess, using * the fact that struct mount's are never freed. In worst case we may * get pointer to unmounted or even different filesystem, so we have to * check what we got, and go slow way if so. */ struct mount * vfs_busyfs(fsid_t *fsid) { #define FSID_CACHE_SIZE 256 typedef struct mount * volatile vmp_t; static vmp_t cache[FSID_CACHE_SIZE]; struct mount *mp; int error; uint32_t hash; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); hash = fsid->val[0] ^ fsid->val[1]; hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1); mp = cache[hash]; if (mp == NULL || mp->mnt_stat.f_fsid.val[0] != fsid->val[0] || mp->mnt_stat.f_fsid.val[1] != fsid->val[1]) goto slow; if (vfs_busy(mp, 0) != 0) { cache[hash] = NULL; goto slow; } if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) return (mp); else vfs_unbusy(mp); slow: mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { error = vfs_busy(mp, MBF_MNTLSTLOCK); if (error) { cache[hash] = NULL; mtx_unlock(&mountlist_mtx); return (NULL); } cache[hash] = mp; return (mp); } } CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); mtx_unlock(&mountlist_mtx); return ((struct mount *) 0); } /* * Check if a user can access privileged mount options. */ int vfs_suser(struct mount *mp, struct thread *td) { int error; /* * If the thread is jailed, but this is not a jail-friendly file * system, deny immediately. */ if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred)) return (EPERM); /* * If the file system was mounted outside the jail of the calling * thread, deny immediately. */ if (prison_check(td->td_ucred, mp->mnt_cred) != 0) return (EPERM); /* * If file system supports delegated administration, we don't check * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified * by the file system itself. * If this is not the user that did original mount, we check for * the PRIV_VFS_MOUNT_OWNER privilege. */ if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) return (error); } return (0); } /* * Get a new unique fsid. Try to make its val[0] unique, since this value * will be used to create fake device numbers for stat(). Also try (but * not so hard) make its val[0] unique mod 2^16, since some emulators only * support 16-bit device numbers. We end up with unique val[0]'s for the * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. * * Keep in mind that several mounts may be running in parallel. Starting * the search one past where the previous search terminated is both a * micro-optimization and a defense against returning the same fsid to * different mounts. */ void vfs_getnewfsid(struct mount *mp) { static uint16_t mntid_base; struct mount *nmp; fsid_t tfsid; int mtype; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); mtx_lock(&mntid_mtx); mtype = mp->mnt_vfc->vfc_typenum; tfsid.val[1] = mtype; mtype = (mtype & 0xFF) << 24; for (;;) { tfsid.val[0] = makedev(255, mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); mntid_base++; if ((nmp = vfs_getvfs(&tfsid)) == NULL) break; vfs_rel(nmp); } mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; mtx_unlock(&mntid_mtx); } /* * Knob to control the precision of file timestamps: * * 0 = seconds only; nanoseconds zeroed. * 1 = seconds and nanoseconds, accurate within 1/HZ. 
* 2 = seconds and nanoseconds, truncated to microseconds. * >=3 = seconds and nanoseconds, maximum precision. */ enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; -static int timestamp_precision = TSP_SEC; +static int timestamp_precision = TSP_USEC; SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, ×tamp_precision, 0, "File timestamp precision (0: seconds, " "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to ms, " "3+: sec + ns (max. precision))"); /* * Get a current timestamp. */ void vfs_timestamp(struct timespec *tsp) { struct timeval tv; switch (timestamp_precision) { case TSP_SEC: tsp->tv_sec = time_second; tsp->tv_nsec = 0; break; case TSP_HZ: getnanotime(tsp); break; case TSP_USEC: microtime(&tv); TIMEVAL_TO_TIMESPEC(&tv, tsp); break; case TSP_NSEC: default: nanotime(tsp); break; } } /* * Set vnode attributes to VNOVAL */ void vattr_null(struct vattr *vap) { vap->va_type = VNON; vap->va_size = VNOVAL; vap->va_bytes = VNOVAL; vap->va_mode = VNOVAL; vap->va_nlink = VNOVAL; vap->va_uid = VNOVAL; vap->va_gid = VNOVAL; vap->va_fsid = VNOVAL; vap->va_fileid = VNOVAL; vap->va_blocksize = VNOVAL; vap->va_rdev = VNOVAL; vap->va_atime.tv_sec = VNOVAL; vap->va_atime.tv_nsec = VNOVAL; vap->va_mtime.tv_sec = VNOVAL; vap->va_mtime.tv_nsec = VNOVAL; vap->va_ctime.tv_sec = VNOVAL; vap->va_ctime.tv_nsec = VNOVAL; vap->va_birthtime.tv_sec = VNOVAL; vap->va_birthtime.tv_nsec = VNOVAL; vap->va_flags = VNOVAL; vap->va_gen = VNOVAL; vap->va_vaflags = 0; } /* * This routine is called when we have too many vnodes. It attempts * to free vnodes and will potentially free vnodes that still * have VM backing store (VM backing store is typically the cause * of a vnode blowout so we want to do this). Therefore, this operation * is not considered cheap. * * A number of conditions may prevent a vnode from being reclaimed. * the buffer cache may have references on the vnode, a directory * vnode may still have references due to the namei cache representing * underlying files, or the vnode may be in active use. It is not * desireable to reuse such vnodes. These conditions may cause the * number of vnodes to reach some minimum value regardless of what * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. */ static int vlrureclaim(struct mount *mp) { struct vnode *vp; int done; int trigger; int usevnodes; int count; /* * Calculate the trigger point, don't allow user * screwups to blow us up. This prevents us from * recycling vnodes with lots of resident pages. We * aren't trying to free memory, we are trying to * free vnodes. */ usevnodes = desiredvnodes; if (usevnodes <= 0) usevnodes = 1; trigger = vm_cnt.v_page_count * 2 / usevnodes; done = 0; vn_start_write(NULL, &mp, V_WAIT); MNT_ILOCK(mp); count = mp->mnt_nvnodelistsize / 10 + 1; while (count != 0) { vp = TAILQ_FIRST(&mp->mnt_nvnodelist); while (vp != NULL && vp->v_type == VMARKER) vp = TAILQ_NEXT(vp, v_nmntvnodes); if (vp == NULL) break; TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); --count; if (!VI_TRYLOCK(vp)) goto next_iter; /* * If it's been deconstructed already, it's still * referenced, or it exceeds the trigger, skip it. 
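/*
 * Worked example (illustration only): with vm_cnt.v_page_count == 262144
 * and desiredvnodes == 100000, trigger == 262144 * 2 / 100000 == 5, so
 * this pass skips any vnode with more than five resident pages rather
 * than throwing away cached data just to recycle a vnode.
 */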
*/ if (vp->v_usecount || (!vlru_allow_cache_src && !LIST_EMPTY(&(vp)->v_cache_src)) || (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VI_UNLOCK(vp); goto next_iter; } MNT_IUNLOCK(mp); vholdl(vp); if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) { vdrop(vp); goto next_iter_mntunlocked; } VI_LOCK(vp); /* * v_usecount may have been bumped after VOP_LOCK() dropped * the vnode interlock and before it was locked again. * * It is not necessary to recheck VI_DOOMED because it can * only be set by another thread that holds both the vnode * lock and vnode interlock. If another thread has the * vnode lock before we get to VOP_LOCK() and obtains the * vnode interlock after VOP_LOCK() drops the vnode * interlock, the other thread will be unable to drop the * vnode lock before our VOP_LOCK() call fails. */ if (vp->v_usecount || (!vlru_allow_cache_src && !LIST_EMPTY(&(vp)->v_cache_src)) || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VOP_UNLOCK(vp, LK_INTERLOCK); vdrop(vp); goto next_iter_mntunlocked; } KASSERT((vp->v_iflag & VI_DOOMED) == 0, ("VI_DOOMED unexpectedly detected in vlrureclaim()")); vgonel(vp); VOP_UNLOCK(vp, 0); vdropl(vp); done++; next_iter_mntunlocked: if (!should_yield()) goto relock_mnt; goto yield; next_iter: if (!should_yield()) continue; MNT_IUNLOCK(mp); yield: kern_yield(PRI_USER); relock_mnt: MNT_ILOCK(mp); } MNT_IUNLOCK(mp); vn_finished_write(mp); return done; } /* * Attempt to keep the free list at wantfreevnodes length. */ static void vnlru_free(int count) { struct vnode *vp; mtx_assert(&vnode_free_list_mtx, MA_OWNED); for (; count > 0; count--) { vp = TAILQ_FIRST(&vnode_free_list); /* * The list can be modified while the free_list_mtx * has been dropped and vp could be NULL here. */ if (!vp) break; VNASSERT(vp->v_op != NULL, vp, ("vnlru_free: vnode already reclaimed.")); KASSERT((vp->v_iflag & VI_FREE) != 0, ("Removing vnode not on freelist")); KASSERT((vp->v_iflag & VI_ACTIVE) == 0, ("Mangling active vnode")); TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist); /* * Don't recycle if we can't get the interlock. */ if (!VI_TRYLOCK(vp)) { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist); continue; } VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0, vp, ("vp inconsistent on freelist")); /* * The clear of VI_FREE prevents activation of the * vnode. There is no sense in putting the vnode on * the mount point active list, only to remove it * later during recycling. Inline the relevant part * of vholdl(), to avoid triggering assertions or * activating. */ freevnodes--; vp->v_iflag &= ~VI_FREE; vp->v_holdcnt++; mtx_unlock(&vnode_free_list_mtx); VI_UNLOCK(vp); vtryrecycle(vp); /* * If the recycled succeeded this vdrop will actually free * the vnode. If not it will simply place it back on * the free list. */ vdrop(vp); mtx_lock(&vnode_free_list_mtx); } } /* * Attempt to recycle vnodes in a context that is always safe to block. * Calling vlrurecycle() from the bowels of filesystem code has some * interesting deadlock problems. 
*/ static struct proc *vnlruproc; static int vnlruproc_sig; static void vnlru_proc(void) { struct mount *mp, *nmp; int done; struct proc *p = vnlruproc; EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p, SHUTDOWN_PRI_FIRST); for (;;) { kproc_suspend_check(p); mtx_lock(&vnode_free_list_mtx); if (freevnodes > wantfreevnodes) vnlru_free(freevnodes - wantfreevnodes); if (numvnodes <= desiredvnodes * 9 / 10) { vnlruproc_sig = 0; wakeup(&vnlruproc_sig); msleep(vnlruproc, &vnode_free_list_mtx, PVFS|PDROP, "vlruwt", hz); continue; } mtx_unlock(&vnode_free_list_mtx); done = 0; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } done += vlrureclaim(mp); mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); if (done == 0) { #if 0 /* These messages are temporary debugging aids */ if (vnlru_nowhere < 5) printf("vnlru process getting nowhere..\n"); else if (vnlru_nowhere == 5) printf("vnlru process messages stopped.\n"); #endif vnlru_nowhere++; tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); } else kern_yield(PRI_USER); } } static struct kproc_desc vnlru_kp = { "vnlru", vnlru_proc, &vnlruproc }; SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp); /* * Routines having to do with the management of the vnode table. */ /* * Try to recycle a freed vnode. We abort if anyone picks up a reference * before we actually vgone(). This function must be called with the vnode * held to prevent the vnode from being returned to the free list midway * through vgone(). */ static int vtryrecycle(struct vnode *vp) { struct mount *vnmp; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VNASSERT(vp->v_holdcnt, vp, ("vtryrecycle: Recycling vp %p without a reference.", vp)); /* * This vnode may found and locked via some other list, if so we * can't recycle it yet. */ if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { CTR2(KTR_VFS, "%s: impossible to recycle, vp %p lock is already held", __func__, vp); return (EWOULDBLOCK); } /* * Don't recycle if its filesystem is being suspended. */ if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { VOP_UNLOCK(vp, 0); CTR2(KTR_VFS, "%s: impossible to recycle, cannot start the write for %p", __func__, vp); return (EBUSY); } /* * If we got this far, we need to acquire the interlock and see if * anyone picked up this vnode from another list. If not, we will * mark it with DOOMED via vgonel() so that anyone who does find it * will skip over it. */ VI_LOCK(vp); if (vp->v_usecount) { VOP_UNLOCK(vp, LK_INTERLOCK); vn_finished_write(vnmp); CTR2(KTR_VFS, "%s: impossible to recycle, %p is already referenced", __func__, vp); return (EBUSY); } if ((vp->v_iflag & VI_DOOMED) == 0) vgonel(vp); VOP_UNLOCK(vp, LK_INTERLOCK); vn_finished_write(vnmp); return (0); } /* * Wait for available vnodes. */ static int getnewvnode_wait(int suspended) { mtx_assert(&vnode_free_list_mtx, MA_OWNED); if (numvnodes > desiredvnodes) { if (suspended) { /* * File system is beeing suspended, we cannot risk a * deadlock here, so allocate new vnode anyway. */ if (freevnodes > wantfreevnodes) vnlru_free(freevnodes - wantfreevnodes); return (0); } if (vnlruproc_sig == 0) { vnlruproc_sig = 1; /* avoid unnecessary wakeups */ wakeup(vnlruproc); } msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS, "vlruwk", hz); } return (numvnodes > desiredvnodes ? 
ENFILE : 0); } void getnewvnode_reserve(u_int count) { struct thread *td; td = curthread; /* First try to be quick and racy. */ if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) { td->td_vp_reserv += count; return; } else atomic_subtract_long(&numvnodes, count); mtx_lock(&vnode_free_list_mtx); while (count > 0) { if (getnewvnode_wait(0) == 0) { count--; td->td_vp_reserv++; atomic_add_long(&numvnodes, 1); } } mtx_unlock(&vnode_free_list_mtx); } void getnewvnode_drop_reserve(void) { struct thread *td; td = curthread; atomic_subtract_long(&numvnodes, td->td_vp_reserv); td->td_vp_reserv = 0; } /* * Return the next vnode from the free list. */ int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp) { struct vnode *vp; struct bufobj *bo; struct thread *td; int error; CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); vp = NULL; td = curthread; if (td->td_vp_reserv > 0) { td->td_vp_reserv -= 1; goto alloc; } mtx_lock(&vnode_free_list_mtx); /* * Lend our context to reclaim vnodes if they've exceeded the max. */ if (freevnodes > wantfreevnodes) vnlru_free(1); error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag & MNTK_SUSPEND)); #if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. */ if (error != 0) { mtx_unlock(&vnode_free_list_mtx); return (error); } #endif atomic_add_long(&numvnodes, 1); mtx_unlock(&vnode_free_list_mtx); alloc: vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO); /* * Setup locks. */ vp->v_vnlock = &vp->v_lock; mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); /* * By default, don't allow shared locks unless filesystems * opt-in. */ lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE); /* * Initialize bufobj. */ bo = &vp->v_bufobj; bo->__bo_vnode = vp; rw_init(BO_LOCKPTR(bo), "bufobj interlock"); bo->bo_ops = &buf_ops_bio; bo->bo_private = vp; TAILQ_INIT(&bo->bo_clean.bv_hd); TAILQ_INIT(&bo->bo_dirty.bv_hd); /* * Initialize namecache. */ LIST_INIT(&vp->v_cache_src); TAILQ_INIT(&vp->v_cache_dst); /* * Finalize various vnode identity bits. */ vp->v_type = VNON; vp->v_tag = tag; vp->v_op = vops; v_incr_usecount(vp); vp->v_data = NULL; #ifdef MAC mac_vnode_init(vp); if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) mac_vnode_associate_singlelabel(mp, vp); else if (mp == NULL && vops != &dead_vnodeops) printf("NULL mp in getnewvnode()\n"); #endif if (mp != NULL) { bo->bo_bsize = mp->mnt_stat.f_iosize; if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) vp->v_vflag |= VV_NOKNOTE; } rangelock_init(&vp->v_rl); /* * For the filesystems which do not use vfs_hash_insert(), * still initialize v_hash to have vfs_hash_index() useful. * E.g., nullfs uses vfs_hash_index() on the lower vnode for * its own hashing. */ vp->v_hash = (uintptr_t)vp >> vnsz2log; *vpp = vp; return (0); } /* * Delete from old mount point vnode list, if on one. 
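/*
 * Example sketch (illustration only, not part of this change): how a
 * filesystem's vget-style path typically pairs getnewvnode() with
 * insmntque().  The "demofs" tag and function name are hypothetical.
 */
static int
demo_new_vnode(struct mount *mp, struct vop_vector *ops, struct vnode **vpp)
{
	struct vnode *vp;
	int error;

	error = getnewvnode("demofs", mp, ops, &vp);
	if (error != 0)
		return (error);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	/* insmntque() requires the vnode to be exclusively locked. */
	error = insmntque(vp, mp);
	if (error != 0)
		return (error);	/* insmntque_stddtr() already disposed of vp */
	*vpp = vp;
	return (0);
}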
*/ static void delmntque(struct vnode *vp) { struct mount *mp; int active; mp = vp->v_mount; if (mp == NULL) return; MNT_ILOCK(mp); VI_LOCK(vp); KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize, ("Active vnode list size %d > Vnode list size %d", mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize)); active = vp->v_iflag & VI_ACTIVE; vp->v_iflag &= ~VI_ACTIVE; if (active) { mtx_lock(&vnode_free_list_mtx); TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist); mp->mnt_activevnodelistsize--; mtx_unlock(&vnode_free_list_mtx); } vp->v_mount = NULL; VI_UNLOCK(vp); VNASSERT(mp->mnt_nvnodelistsize > 0, vp, ("bad mount point vnode list size")); TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); mp->mnt_nvnodelistsize--; MNT_REL(mp); MNT_IUNLOCK(mp); } static void insmntque_stddtr(struct vnode *vp, void *dtr_arg) { vp->v_data = NULL; vp->v_op = &dead_vnodeops; vgone(vp); vput(vp); } /* * Insert into list of vnodes for the new mount point, if available. */ int insmntque1(struct vnode *vp, struct mount *mp, void (*dtr)(struct vnode *, void *), void *dtr_arg) { KASSERT(vp->v_mount == NULL, ("insmntque: vnode already on per mount vnode list")); VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); /* * We acquire the vnode interlock early to ensure that the * vnode cannot be recycled by another process releasing a * holdcnt on it before we get it on both the vnode list * and the active vnode list. The mount mutex protects only * manipulation of the vnode list and the vnode freelist * mutex protects only manipulation of the active vnode list. * Hence the need to hold the vnode interlock throughout. */ MNT_ILOCK(mp); VI_LOCK(vp); if (((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 && ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || mp->mnt_nvnodelistsize == 0)) && (vp->v_vflag & VV_FORCEINSMQ) == 0) { VI_UNLOCK(vp); MNT_IUNLOCK(mp); if (dtr != NULL) dtr(vp, dtr_arg); return (EBUSY); } vp->v_mount = mp; MNT_REF(mp); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, ("neg mount point vnode list size")); mp->mnt_nvnodelistsize++; KASSERT((vp->v_iflag & VI_ACTIVE) == 0, ("Activating already active vnode")); vp->v_iflag |= VI_ACTIVE; mtx_lock(&vnode_free_list_mtx); TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); mp->mnt_activevnodelistsize++; mtx_unlock(&vnode_free_list_mtx); VI_UNLOCK(vp); MNT_IUNLOCK(mp); return (0); } int insmntque(struct vnode *vp, struct mount *mp) { return (insmntque1(vp, mp, insmntque_stddtr, NULL)); } /* * Flush out and invalidate all buffers associated with a bufobj * Called with the underlying object locked. */ int bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) { int error; BO_LOCK(bo); if (flags & V_SAVE) { error = bufobj_wwait(bo, slpflag, slptimeo); if (error) { BO_UNLOCK(bo); return (error); } if (bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); if ((error = BO_SYNC(bo, MNT_WAIT)) != 0) return (error); /* * XXX We could save a lock/unlock if this was only * enabled under INVARIANTS */ BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) panic("vinvalbuf: dirty bufs"); } } /* * If you alter this loop please notice that interlock is dropped and * reacquired in flushbuflist. Special care is needed to ensure that * no race conditions occur from this. 
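 * flushbuflist() returns EAGAIN whenever it flushed a buffer or had to drop * the bufobj lock, so the do/while loop below simply rescans until a pass * completes with nothing left to process.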
*/ do { error = flushbuflist(&bo->bo_clean, flags, bo, slpflag, slptimeo); if (error == 0 && !(flags & V_CLEANONLY)) error = flushbuflist(&bo->bo_dirty, flags, bo, slpflag, slptimeo); if (error != 0 && error != EAGAIN) { BO_UNLOCK(bo); return (error); } } while (error != 0); /* * Wait for I/O to complete. XXX needs cleaning up. The vnode can * have write I/O in-progress but if there is a VM object then the * VM object can also have read-I/O in-progress. */ do { bufobj_wwait(bo, 0, 0); BO_UNLOCK(bo); if (bo->bo_object != NULL) { VM_OBJECT_WLOCK(bo->bo_object); vm_object_pip_wait(bo->bo_object, "bovlbx"); VM_OBJECT_WUNLOCK(bo->bo_object); } BO_LOCK(bo); } while (bo->bo_numoutput > 0); BO_UNLOCK(bo); /* * Destroy the copy in the VM cache, too. */ if (bo->bo_object != NULL && (flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) { VM_OBJECT_WLOCK(bo->bo_object); vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? OBJPR_CLEANONLY : 0); VM_OBJECT_WUNLOCK(bo->bo_object); } #ifdef INVARIANTS BO_LOCK(bo); if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0 && (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) panic("vinvalbuf: flush failed"); BO_UNLOCK(bo); #endif return (0); } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ int vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) { CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); ASSERT_VOP_LOCKED(vp, "vinvalbuf"); if (vp->v_object != NULL && vp->v_object->handle != vp) return (0); return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); } /* * Flush out buffers on the specified list. * */ static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo) { struct buf *bp, *nbp; int retval, error; daddr_t lblkno; b_xflags_t xflags; ASSERT_BO_WLOCKED(bo); retval = 0; TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) || ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) { continue; } lblkno = 0; xflags = 0; if (nbp != NULL) { lblkno = nbp->b_lblkno; xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); } retval = EAGAIN; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), "flushbuf", slpflag, slptimeo); if (error) { BO_LOCK(bo); return (error != ENOLCK ? error : EAGAIN); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); if (bp->b_bufobj != bo) { /* XXX: necessary ? */ BUF_UNLOCK(bp); BO_LOCK(bo); return (EAGAIN); } /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. */ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && (flags & V_SAVE)) { bremfree(bp); bp->b_flags |= B_ASYNC; bwrite(bp); BO_LOCK(bo); return (EAGAIN); /* XXX: why not loop ? */ } bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); BO_LOCK(bo); if (nbp != NULL && (nbp->b_bufobj != bo || nbp->b_lblkno != lblkno || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags)) break; /* nbp invalid */ } return (retval); } /* * Truncate a file's buffer and pages to a specified length. This * is in lieu of the old vinvalbuf mechanism, which performed unneeded * sync activity. 
*/ int vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize) { struct buf *bp, *nbp; int anyfreed; int trunclbn; struct bufobj *bo; CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__, vp, cred, blksize, (uintmax_t)length); /* * Round up to the *next* lbn. */ trunclbn = (length + blksize - 1) / blksize; ASSERT_VOP_LOCKED(vp, "vtruncbuf"); restart: bo = &vp->v_bufobj; BO_LOCK(bo); anyfreed = 1; for (;anyfreed;) { anyfreed = 0; TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < trunclbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) goto restart; bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNCLEAN) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI))) { BO_UNLOCK(bo); goto restart; } } TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < trunclbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) goto restart; bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNDIRTY) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI) == 0)) { BO_UNLOCK(bo); goto restart; } } } if (length > 0) { restartsync: TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno > 0) continue; /* * Since we hold the vnode lock this should only * fail if we're racing with the buf daemon. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) { goto restart; } VNASSERT((bp->b_flags & B_DELWRI), vp, ("buf(%p) on dirty queue without DELWRI", bp)); bremfree(bp); bawrite(bp); BO_LOCK(bo); goto restartsync; } } bufobj_wwait(bo, 0, 0); BO_UNLOCK(bo); vnode_pager_setsize(vp, length); return (0); } static void buf_vlist_remove(struct buf *bp) { struct bufv *bv; KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); ASSERT_BO_WLOCKED(bp->b_bufobj); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) != (BX_VNDIRTY|BX_VNCLEAN), ("buf_vlist_remove: Buf %p is on two lists", bp)); if (bp->b_xflags & BX_VNDIRTY) bv = &bp->b_bufobj->bo_dirty; else bv = &bp->b_bufobj->bo_clean; BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); bv->bv_cnt--; bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); } /* * Add the buffer to the sorted clean or dirty block list. * * NOTE: xflags is passed as a constant, optimizing this inline function! */ static void buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) { struct bufv *bv; struct buf *n; int error; ASSERT_BO_WLOCKED(bo); KASSERT((bo->bo_flag & BO_DEAD) == 0, ("dead bo %p", bo)); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); bp->b_xflags |= xflags; if (xflags & BX_VNDIRTY) bv = &bo->bo_dirty; else bv = &bo->bo_clean; /* * Keep the list ordered. Optimize empty list insertion. Assume * we tend to grow at the tail so lookup_le should usually be cheaper * than _ge. 
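 * For example, a file being extended sequentially always satisfies the * TAILQ_LAST() comparison below, so the common append case skips the * BUF_PCTRIE_LOOKUP_LE() call entirely.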
*/ if (bv->bv_cnt == 0 || bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); else TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); if (error) panic("buf_vlist_add: Preallocated nodes insufficient."); bv->bv_cnt++; } /* * Look up a buffer using the buffer tries. Note that we specifically avoid * shadow buffers used in background bitmap writes. * * This code isn't quite as efficient as it could be because we are maintaining * two sorted lists and do not know which list the block resides in, so both * the clean and the dirty trie may have to be searched. */ struct buf * gbincore(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_LOCKED(bo); bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); if (bp != NULL) return (bp); return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno); } /* * Associate a buffer with a vnode. */ void bgetvp(struct vnode *vp, struct buf *bp) { struct bufobj *bo; bo = &vp->v_bufobj; ASSERT_BO_WLOCKED(bo); VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, ("bgetvp: bp already attached! %p", bp)); vhold(vp); bp->b_vp = vp; bp->b_bufobj = bo; /* * Insert onto list for new vnode. */ buf_vlist_add(bp, bo, BX_VNCLEAN); } /* * Disassociate a buffer from a vnode. */ void brelvp(struct buf *bp) { struct bufobj *bo; struct vnode *vp; CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); /* * Delete from old vnode list, if on one. */ vp = bp->b_vp; /* XXX */ bo = bp->b_bufobj; BO_LOCK(bo); if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) buf_vlist_remove(bp); else panic("brelvp: Buffer %p not on queue.", bp); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { bo->bo_flag &= ~BO_ONWORKLST; mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); } bp->b_vp = NULL; bp->b_bufobj = NULL; BO_UNLOCK(bo); vdrop(vp); } /* * Add an item to the syncer work queue.
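 * The delay argument selects one of the syncer's one-second buckets: the * bufobj is queued on syncer_workitem_pending[(syncer_delayno + delay) & * syncer_mask], which sched_sync() will reach roughly `delay' seconds from now.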
*/ static void vn_syncer_add_to_worklist(struct bufobj *bo, int delay) { int slot; ASSERT_BO_WLOCKED(bo); mtx_lock(&sync_mtx); if (bo->bo_flag & BO_ONWORKLST) LIST_REMOVE(bo, bo_synclist); else { bo->bo_flag |= BO_ONWORKLST; syncer_worklist_len++; } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); mtx_unlock(&sync_mtx); } static int sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) { int error, len; mtx_lock(&sync_mtx); len = syncer_worklist_len - sync_vnode_count; mtx_unlock(&sync_mtx); error = SYSCTL_OUT(req, &len, sizeof(len)); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); static struct proc *updateproc; static void sched_sync(void); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); static int sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) { struct vnode *vp; struct mount *mp; *bo = LIST_FIRST(slp); if (*bo == NULL) return (0); vp = (*bo)->__bo_vnode; /* XXX */ if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) return (1); /* * We use vhold in case the vnode does not * successfully sync. vhold prevents the vnode from * going away when we unlock the sync_mtx so that * we can acquire the vnode interlock. */ vholdl(vp); mtx_unlock(&sync_mtx); VI_UNLOCK(vp); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop(vp); mtx_lock(&sync_mtx); return (*bo == LIST_FIRST(slp)); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); (void) VOP_FSYNC(vp, MNT_LAZY, td); VOP_UNLOCK(vp, 0); vn_finished_write(mp); BO_LOCK(*bo); if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(*bo, syncdelay); } BO_UNLOCK(*bo); vdrop(vp); mtx_lock(&sync_mtx); return (0); } static int first_printf = 1; /* * System filesystem synchronizer daemon. */ static void sched_sync(void) { struct synclist *next, *slp; struct bufobj *bo; long starttime; struct thread *td = curthread; int last_work_seen; int net_worklist_len; int syncer_final_iter; int error; last_work_seen = 0; syncer_final_iter = 0; syncer_state = SYNCER_RUNNING; starttime = time_uptime; td->td_pflags |= TDP_NORUNNINGBUF; EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, SHUTDOWN_PRI_LAST); mtx_lock(&sync_mtx); for (;;) { if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter == 0) { mtx_unlock(&sync_mtx); kproc_suspend_check(td->td_proc); mtx_lock(&sync_mtx); } net_worklist_len = syncer_worklist_len - sync_vnode_count; if (syncer_state != SYNCER_RUNNING && starttime != time_uptime) { if (first_printf) { printf("\nSyncing disks, vnodes remaining..."); first_printf = 0; } printf("%d ", net_worklist_len); } starttime = time_uptime; /* * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. * * Skip over empty worklist slots when shutting down. */ do { slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; next = &syncer_workitem_pending[syncer_delayno]; /* * If the worklist has wrapped since the * it was emptied of all but syncer vnodes, * switch to the FINAL_DELAY state and run * for one more second. 
*/ if (syncer_state == SYNCER_SHUTTING_DOWN && net_worklist_len == 0 && last_work_seen == syncer_delayno) { syncer_state = SYNCER_FINAL_DELAY; syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; } } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && syncer_worklist_len > 0); /* * Keep track of the last time there was anything * on the worklist other than syncer vnodes. * Return to the SHUTTING_DOWN state if any * new work appears. */ if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) last_work_seen = syncer_delayno; if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) syncer_state = SYNCER_SHUTTING_DOWN; while (!LIST_EMPTY(slp)) { error = sync_vnode(slp, &bo, td); if (error == 1) { LIST_REMOVE(bo, bo_synclist); LIST_INSERT_HEAD(next, bo, bo_synclist); continue; } if (first_printf == 0) { /* * Drop the sync mutex, because some watchdog * drivers need to sleep while patting */ mtx_unlock(&sync_mtx); wdog_kern_pat(WD_LASTVAL); mtx_lock(&sync_mtx); } } if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) syncer_final_iter--; /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ if (rushjob > 0) { rushjob -= 1; continue; } /* * Just sleep for a short period of time between * iterations when shutting down to allow some I/O * to happen. * * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. */ if (syncer_state != SYNCER_RUNNING || time_uptime == starttime) { thread_lock(td); sched_prio(td, PPAUSE); thread_unlock(td); } if (syncer_state != SYNCER_RUNNING) cv_timedwait(&sync_wakeup, &sync_mtx, hz / SYNCER_SHUTDOWN_SPEEDUP); else if (time_uptime == starttime) cv_timedwait(&sync_wakeup, &sync_mtx, hz); } } /* * Request the syncer daemon to speed up its work. * We never push it to speed up more than half of its * normal turn time, otherwise it could take over the cpu. */ int speedup_syncer(void) { int ret = 0; mtx_lock(&sync_mtx); if (rushjob < syncdelay / 2) { rushjob += 1; stat_rush_requests += 1; ret = 1; } mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); return (ret); } /* * Tell the syncer to speed up its work and run though its work * list several times, then tell it to shut down. */ static void syncer_shutdown(void *arg, int howto) { if (howto & RB_NOSYNC) return; mtx_lock(&sync_mtx); syncer_state = SYNCER_SHUTTING_DOWN; rushjob = 0; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_shutdown(arg, howto); } void syncer_suspend(void) { syncer_shutdown(updateproc, 0); } void syncer_resume(void) { mtx_lock(&sync_mtx); first_printf = 1; syncer_state = SYNCER_RUNNING; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_resume(updateproc); } /* * Reassign a buffer from one vnode to another. * Used to assign file specific control information * (indirect blocks) to the vnode to which they belong. 
*/ void reassignbuf(struct buf *bp) { struct vnode *vp; struct bufobj *bo; int delay; #ifdef INVARIANTS struct bufv *bv; #endif vp = bp->b_vp; bo = bp->b_bufobj; ++reassignbufcalls; CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); /* * B_PAGING flagged buffers cannot be reassigned because their vp * is not fully linked in. */ if (bp->b_flags & B_PAGING) panic("cannot reassign paging buffer"); /* * Delete from old vnode list, if on one. */ BO_LOCK(bo); if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) buf_vlist_remove(bp); else panic("reassignbuf: Buffer %p not on queue.", bp); /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { if ((bo->bo_flag & BO_ONWORKLST) == 0) { switch (vp->v_type) { case VDIR: delay = dirdelay; break; case VCHR: delay = metadelay; break; default: delay = filedelay; } vn_syncer_add_to_worklist(bo, delay); } buf_vlist_add(bp, bo, BX_VNDIRTY); } else { buf_vlist_add(bp, bo, BX_VNCLEAN); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } } #ifdef INVARIANTS bv = &bo->bo_clean; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bv = &bo->bo_dirty; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); #endif BO_UNLOCK(bo); } /* * Increment the use and hold counts on the vnode, taking care to reference * the driver's usecount if this is a chardev. The vholdl() will remove * the vnode from the free list if it is presently free. Requires the * vnode interlock and returns with it held. */ static void v_incr_usecount(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vholdl(vp); vp->v_usecount++; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount++; dev_unlock(); } } /* * Turn a holdcnt into a use+holdcnt such that only one call to * v_decr_usecount is needed. */ static void v_upgrade_usecount(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vp->v_usecount++; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount++; dev_unlock(); } } /* * Decrement the vnode use and hold count along with the driver's usecount * if this is a chardev. The vdropl() below releases the vnode interlock * as it may free the vnode. */ static void v_decr_usecount(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __FUNCTION__); VNASSERT(vp->v_usecount > 0, vp, ("v_decr_usecount: negative usecount")); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vp->v_usecount--; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount--; dev_unlock(); } vdropl(vp); } /* * Decrement only the use count and driver use count. This is intended to * be paired with a follow on vdropl() to release the remaining hold count. * In this way we may vgone() a vnode with a 0 usecount without risk of * having it end up on a free list because the hold count is kept above 0. 
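 * vputx() relies on this pairing: it calls v_decr_useonly(), performs the * inactive processing with only the hold count keeping the vnode alive, and * then releases that hold with vdropl().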
*/ static void v_decr_useonly(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __FUNCTION__); VNASSERT(vp->v_usecount > 0, vp, ("v_decr_useonly: negative usecount")); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vp->v_usecount--; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount--; dev_unlock(); } } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. VI_DOOMED is set if the vnode * is being destroyed. Only callers who specify LK_RETRY will * see doomed vnodes. If inactive processing was delayed in * vput try to do it here. */ int vget(struct vnode *vp, int flags, struct thread *td) { int error; error = 0; VNASSERT((flags & LK_TYPE_MASK) != 0, vp, ("vget: invalid lock operation")); CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); if ((flags & LK_INTERLOCK) == 0) VI_LOCK(vp); vholdl(vp); if ((error = vn_lock(vp, flags | LK_INTERLOCK)) != 0) { vdrop(vp); CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, vp); return (error); } if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0) panic("vget: vn_lock failed to return ENOENT\n"); VI_LOCK(vp); /* Upgrade our holdcnt to a usecount. */ v_upgrade_usecount(vp); /* * We don't guarantee that any particular close will * trigger inactive processing so just make a best effort * here at preventing a reference to a removed file. If * we don't succeed no harm is done. */ if (vp->v_iflag & VI_OWEINACT) { if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE && (flags & LK_NOWAIT) == 0) vinactive(vp, td); vp->v_iflag &= ~VI_OWEINACT; } VI_UNLOCK(vp); return (0); } /* * Increase the reference count of a vnode. */ void vref(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VI_LOCK(vp); v_incr_usecount(vp); VI_UNLOCK(vp); } /* * Return reference count of a vnode. * * The results of this call are only guaranteed when some mechanism other * than the VI lock is used to stop other processes from gaining references * to the vnode. This may be the case if the caller holds the only reference. * This is also useful when stale data is acceptable as race conditions may * be accounted for by some other means. */ int vrefcnt(struct vnode *vp) { int usecnt; VI_LOCK(vp); usecnt = vp->v_usecount; VI_UNLOCK(vp); return (usecnt); } #define VPUTX_VRELE 1 #define VPUTX_VPUT 2 #define VPUTX_VUNREF 3 static void vputx(struct vnode *vp, int func) { int error; KASSERT(vp != NULL, ("vputx: null vp")); if (func == VPUTX_VUNREF) ASSERT_VOP_LOCKED(vp, "vunref"); else if (func == VPUTX_VPUT) ASSERT_VOP_LOCKED(vp, "vput"); else KASSERT(func == VPUTX_VRELE, ("vputx: wrong func")); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VI_LOCK(vp); /* Skip this v_writecount check if we're going to panic below. */ VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp, ("vputx: missed vn_close")); error = 0; if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) && vp->v_usecount == 1)) { if (func == VPUTX_VPUT) VOP_UNLOCK(vp, 0); v_decr_usecount(vp); return; } if (vp->v_usecount != 1) { vprint("vputx: negative ref count", vp); panic("vputx: negative ref cnt"); } CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp); /* * We want to hold the vnode until the inactive finishes to * prevent vgone() races. We drop the use count here and the * hold count below when we're done. */ v_decr_useonly(vp); /* * We must call VOP_INACTIVE with the node locked. Mark * as VI_DOINGINACT to avoid recursion. 
*/ vp->v_iflag |= VI_OWEINACT; switch (func) { case VPUTX_VRELE: error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); VI_LOCK(vp); break; case VPUTX_VPUT: if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | LK_NOWAIT); VI_LOCK(vp); } break; case VPUTX_VUNREF: if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); VI_LOCK(vp); } break; } if (vp->v_usecount > 0) vp->v_iflag &= ~VI_OWEINACT; if (error == 0) { if (vp->v_iflag & VI_OWEINACT) vinactive(vp, curthread); if (func != VPUTX_VUNREF) VOP_UNLOCK(vp, 0); } vdropl(vp); } /* * Vnode put/release. * If count drops to zero, call inactive routine and return to freelist. */ void vrele(struct vnode *vp) { vputx(vp, VPUTX_VRELE); } /* * Release an already locked vnode. This gives the same effect as * unlock+vrele(), but takes less time and avoids releasing and * re-acquiring the lock (as vrele() acquires the lock internally.) */ void vput(struct vnode *vp) { vputx(vp, VPUTX_VPUT); } /* * Release an exclusively locked vnode. Do not unlock the vnode lock. */ void vunref(struct vnode *vp) { vputx(vp, VPUTX_VUNREF); } /* * Somebody doesn't want the vnode recycled. */ void vhold(struct vnode *vp) { VI_LOCK(vp); vholdl(vp); VI_UNLOCK(vp); } /* * Increase the hold count and activate if this is the first reference. */ void vholdl(struct vnode *vp) { struct mount *mp; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); #ifdef INVARIANTS /* getnewvnode() calls v_incr_usecount() without holding interlock. */ if (vp->v_type != VNON || vp->v_data != NULL) { ASSERT_VI_LOCKED(vp, "vholdl"); VNASSERT(vp->v_holdcnt > 0 || (vp->v_iflag & VI_FREE) != 0, vp, ("vholdl: free vnode is held")); } #endif vp->v_holdcnt++; if ((vp->v_iflag & VI_FREE) == 0) return; VNASSERT(vp->v_holdcnt == 1, vp, ("vholdl: wrong hold count")); VNASSERT(vp->v_op != NULL, vp, ("vholdl: vnode already reclaimed.")); /* * Remove a vnode from the free list, mark it as in use, * and put it on the active list. */ mtx_lock(&vnode_free_list_mtx); TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist); freevnodes--; vp->v_iflag &= ~(VI_FREE|VI_AGE); KASSERT((vp->v_iflag & VI_ACTIVE) == 0, ("Activating already active vnode")); vp->v_iflag |= VI_ACTIVE; mp = vp->v_mount; TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); mp->mnt_activevnodelistsize++; mtx_unlock(&vnode_free_list_mtx); } /* * Note that there is one less holder who cares about this vnode. * vdrop() is the opposite of vhold(). */ void vdrop(struct vnode *vp) { VI_LOCK(vp); vdropl(vp); } /* * Drop the hold count of the vnode. If this is the last reference to * the vnode we place it on the free list unless it has been vgone'd * (marked VI_DOOMED) in which case we will free it. */ void vdropl(struct vnode *vp) { struct bufobj *bo; struct mount *mp; int active; ASSERT_VI_LOCKED(vp, "vdropl"); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (vp->v_holdcnt <= 0) panic("vdrop: holdcnt %d", vp->v_holdcnt); vp->v_holdcnt--; VNASSERT(vp->v_holdcnt >= vp->v_usecount, vp, ("hold count less than use count")); if (vp->v_holdcnt > 0) { VI_UNLOCK(vp); return; } if ((vp->v_iflag & VI_DOOMED) == 0) { /* * Mark a vnode as free: remove it from its active list * and put it up for recycling on the freelist.
*/ VNASSERT(vp->v_op != NULL, vp, ("vdropl: vnode already reclaimed.")); VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("vnode already free")); VNASSERT(vp->v_holdcnt == 0, vp, ("vdropl: freeing when we shouldn't")); active = vp->v_iflag & VI_ACTIVE; vp->v_iflag &= ~VI_ACTIVE; mp = vp->v_mount; mtx_lock(&vnode_free_list_mtx); if (active) { TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist); mp->mnt_activevnodelistsize--; } if (vp->v_iflag & VI_AGE) { TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_actfreelist); } else { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist); } freevnodes++; vp->v_iflag &= ~VI_AGE; vp->v_iflag |= VI_FREE; mtx_unlock(&vnode_free_list_mtx); VI_UNLOCK(vp); return; } /* * The vnode has been marked for destruction, so free it. */ CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); atomic_subtract_long(&numvnodes, 1); bo = &vp->v_bufobj; VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("cleaned vnode still on the free list.")); VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count")); VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, ("clean blk trie not empty")); VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, ("dirty blk trie not empty")); VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for ..")); VI_UNLOCK(vp); #ifdef MAC mac_vnode_destroy(vp); #endif if (vp->v_pollinfo != NULL) destroy_vpollinfo(vp->v_pollinfo); #ifdef INVARIANTS /* XXX Elsewhere we detect an already freed vnode via NULL v_op. */ vp->v_op = NULL; #endif rangelock_destroy(&vp->v_rl); lockdestroy(vp->v_vnlock); mtx_destroy(&vp->v_interlock); rw_destroy(BO_LOCKPTR(bo)); uma_zfree(vnode_zone, vp); } /* * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT * flags. DOINGINACT prevents us from recursing in calls to vinactive. * OWEINACT tracks whether a vnode missed a call to inactive due to a * failed lock upgrade. */ void vinactive(struct vnode *vp, struct thread *td) { struct vm_object *obj; ASSERT_VOP_ELOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, ("vinactive: recursed on VI_DOINGINACT")); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vp->v_iflag |= VI_DOINGINACT; vp->v_iflag &= ~VI_OWEINACT; VI_UNLOCK(vp); /* * Before moving off the active list, we must be sure that any * modified pages are on the vnode's dirty list since these will * no longer be checked once the vnode is on the inactive list. * Because the vnode vm object keeps a hold reference on the vnode * if there is at least one resident non-cached page, the vnode * cannot leave the active list without the page cleanup done. */ obj = vp->v_object; if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) { VM_OBJECT_WLOCK(obj); vm_object_page_clean(obj, 0, 0, OBJPC_NOSYNC); VM_OBJECT_WUNLOCK(obj); } VOP_INACTIVE(vp, td); VI_LOCK(vp); VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, ("vinactive: lost VI_DOINGINACT")); vp->v_iflag &= ~VI_DOINGINACT; } /* * Remove any vnodes in the vnode table belonging to mount point mp. 
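 * * Callers are normally the per-filesystem unmount routines; as a purely * illustrative example, a filesystem discarding every vnode on a forced * unmount might use vflush(mp, 0, FORCECLOSE, td).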
* * If FORCECLOSE is not specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If FORCECLOSE is specified, detach any active vnodes * that are found. * * If WRITECLOSE is set, only flush out regular file vnodes open for * writing. * * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. * * `rootrefs' specifies the base reference count for the root vnode * of this filesystem. The root vnode is considered busy if its * v_usecount exceeds this value. On a successful return, vflush(, td) * will call vrele() on the root vnode exactly rootrefs times. * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must * be zero. */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); #endif int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) { struct vnode *vp, *mvp, *rootvp = NULL; struct vattr vattr; int busy = 0, error; CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, rootrefs, flags); if (rootrefs > 0) { KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, ("vflush: bad args")); /* * Get the filesystem root vnode. We can vput() it * immediately, since with rootrefs > 0, it won't go away. */ if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", __func__, error); return (error); } vput(rootvp); } loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { vholdl(vp); error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); if (error) { vdrop(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } /* * Skip over a vnodes marked VV_SYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { VOP_UNLOCK(vp, 0); vdrop(vp); continue; } /* * If WRITECLOSE is set, flush out unlinked but still open * files (even if open only for reading) and regular file * vnodes open for writing. */ if (flags & WRITECLOSE) { if (vp->v_object != NULL) { VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } error = VOP_FSYNC(vp, MNT_WAIT, td); if (error != 0) { VOP_UNLOCK(vp, 0); vdrop(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } error = VOP_GETATTR(vp, &vattr, td->td_ucred); VI_LOCK(vp); if ((vp->v_type == VNON || (error == 0 && vattr.va_nlink > 0)) && (vp->v_writecount == 0 || vp->v_type != VREG)) { VOP_UNLOCK(vp, 0); vdropl(vp); continue; } } else VI_LOCK(vp); /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. * * If FORCECLOSE is set, forcibly close the vnode. */ if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { VNASSERT(vp->v_usecount == 0 || (vp->v_type != VCHR && vp->v_type != VBLK), vp, ("device VNODE %p is FORCECLOSED", vp)); vgonel(vp); } else { busy++; #ifdef DIAGNOSTIC if (busyprt) vprint("vflush: busy vnode", vp); #endif } VOP_UNLOCK(vp, 0); vdropl(vp); } if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { /* * If just the root vnode is busy, and if its refcount * is equal to `rootrefs', then go ahead and kill it. 
*/ VI_LOCK(rootvp); KASSERT(busy > 0, ("vflush: not busy")); VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, ("vflush: usecount %d < rootrefs %d", rootvp->v_usecount, rootrefs)); if (busy == 1 && rootvp->v_usecount == rootrefs) { VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); vgone(rootvp); VOP_UNLOCK(rootvp, 0); busy = 0; } else VI_UNLOCK(rootvp); } if (busy) { CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, busy); return (EBUSY); } for (; rootrefs > 0; rootrefs--) vrele(rootvp); return (0); } /* * Recycle an unused vnode to the front of the free list. */ int vrecycle(struct vnode *vp) { int recycled; ASSERT_VOP_ELOCKED(vp, "vrecycle"); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); recycled = 0; VI_LOCK(vp); if (vp->v_usecount == 0) { recycled = 1; vgonel(vp); } VI_UNLOCK(vp); return (recycled); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(struct vnode *vp) { VI_LOCK(vp); vgonel(vp); VI_UNLOCK(vp); } static void notify_lowervp_vfs_dummy(struct mount *mp __unused, struct vnode *lowervp __unused) { } /* * Notify upper mounts about reclaimed or unlinked vnode. */ void vfs_notify_upper(struct vnode *vp, int event) { static struct vfsops vgonel_vfsops = { .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy, .vfs_unlink_lowervp = notify_lowervp_vfs_dummy, }; struct mount *mp, *ump, *mmp; mp = vp->v_mount; if (mp == NULL) return; MNT_ILOCK(mp); if (TAILQ_EMPTY(&mp->mnt_uppers)) goto unlock; MNT_IUNLOCK(mp); mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO); mmp->mnt_op = &vgonel_vfsops; mmp->mnt_kern_flag |= MNTK_MARKER; MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_VGONE_UPPER; for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) { if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) { ump = TAILQ_NEXT(ump, mnt_upper_link); continue; } TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link); MNT_IUNLOCK(mp); switch (event) { case VFS_NOTIFY_UPPER_RECLAIM: VFS_RECLAIM_LOWERVP(ump, vp); break; case VFS_NOTIFY_UPPER_UNLINK: VFS_UNLINK_LOWERVP(ump, vp); break; default: KASSERT(0, ("invalid event %d", event)); break; } MNT_ILOCK(mp); ump = TAILQ_NEXT(mmp, mnt_upper_link); TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link); } free(mmp, M_TEMP); mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER; if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) { mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER; wakeup(&mp->mnt_uppers); } unlock: MNT_IUNLOCK(mp); } /* * vgone, with the vp interlock held. */ void vgonel(struct vnode *vp) { struct thread *td; int oweinact; int active; struct mount *mp; ASSERT_VOP_ELOCKED(vp, "vgonel"); ASSERT_VI_LOCKED(vp, "vgonel"); VNASSERT(vp->v_holdcnt, vp, ("vgonel: vp %p has no reference.", vp)); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); td = curthread; /* * Don't vgonel if we're already doomed. */ if (vp->v_iflag & VI_DOOMED) return; vp->v_iflag |= VI_DOOMED; /* * Check to see if the vnode is in use. If so, we have to call * VOP_CLOSE() and VOP_INACTIVE(). */ active = vp->v_usecount; oweinact = (vp->v_iflag & VI_OWEINACT); VI_UNLOCK(vp); vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); /* * If purging an active vnode, it must be closed and * deactivated before being reclaimed. */ if (active) VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); if (oweinact || active) { VI_LOCK(vp); if ((vp->v_iflag & VI_DOINGINACT) == 0) vinactive(vp, td); VI_UNLOCK(vp); } if (vp->v_type == VSOCK) vfs_unp_reclaim(vp); /* * Clean out any buffers associated with the vnode. * If the flush fails, just toss the buffers. 
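 * The first vinvalbuf() call below uses V_SAVE to try to write dirty buffers * back; if that fails, the retry loop passes 0 and simply discards them.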
*/ mp = NULL; if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) (void) vn_start_secondary_write(vp, &mp, V_WAIT); if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { while (vinvalbuf(vp, 0, 0, 0) != 0) ; } #ifdef INVARIANTS BO_LOCK(&vp->v_bufobj); KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && vp->v_bufobj.bo_dirty.bv_cnt == 0 && TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && vp->v_bufobj.bo_clean.bv_cnt == 0, ("vp %p bufobj not invalidated", vp)); vp->v_bufobj.bo_flag |= BO_DEAD; BO_UNLOCK(&vp->v_bufobj); #endif /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp, td)) panic("vgone: cannot reclaim"); if (mp != NULL) vn_finished_secondary_write(mp); VNASSERT(vp->v_object == NULL, vp, ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag)); /* * Clear the advisory locks and wake up waiting threads. */ (void)VOP_ADVLOCKPURGE(vp); /* * Delete from old mount point vnode list. */ delmntque(vp); cache_purge(vp); /* * Done with purge, reset to the standard lock and invalidate * the vnode. */ VI_LOCK(vp); vp->v_vnlock = &vp->v_lock; vp->v_op = &dead_vnodeops; vp->v_tag = "none"; vp->v_type = VBAD; } /* * Calculate the total number of references to a special device. */ int vcount(struct vnode *vp) { int count; dev_lock(); count = vp->v_rdev->si_usecount; dev_unlock(); return (count); } /* * Same as above, but using the struct cdev *as argument */ int count_dev(struct cdev *dev) { int count; dev_lock(); count = dev->si_usecount; dev_unlock(); return(count); } /* * Print out a description of a vnode. */ static char *typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", "VMARKER"}; void vn_printf(struct vnode *vp, const char *fmt, ...) { va_list ap; char buf[256], buf2[16]; u_long flags; va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("%p: ", (void *)vp); printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]); printf(" usecount %d, writecount %d, refcount %d mountedhere %p\n", vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere); buf[0] = '\0'; buf[1] = '\0'; if (vp->v_vflag & VV_ROOT) strlcat(buf, "|VV_ROOT", sizeof(buf)); if (vp->v_vflag & VV_ISTTY) strlcat(buf, "|VV_ISTTY", sizeof(buf)); if (vp->v_vflag & VV_NOSYNC) strlcat(buf, "|VV_NOSYNC", sizeof(buf)); if (vp->v_vflag & VV_ETERNALDEV) strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); if (vp->v_vflag & VV_CACHEDLABEL) strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); if (vp->v_vflag & VV_TEXT) strlcat(buf, "|VV_TEXT", sizeof(buf)); if (vp->v_vflag & VV_COPYONWRITE) strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); if (vp->v_vflag & VV_SYSTEM) strlcat(buf, "|VV_SYSTEM", sizeof(buf)); if (vp->v_vflag & VV_PROCDEP) strlcat(buf, "|VV_PROCDEP", sizeof(buf)); if (vp->v_vflag & VV_NOKNOTE) strlcat(buf, "|VV_NOKNOTE", sizeof(buf)); if (vp->v_vflag & VV_DELETED) strlcat(buf, "|VV_DELETED", sizeof(buf)); if (vp->v_vflag & VV_MD) strlcat(buf, "|VV_MD", sizeof(buf)); if (vp->v_vflag & VV_FORCEINSMQ) strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_iflag & VI_MOUNT) strlcat(buf, "|VI_MOUNT", sizeof(buf)); if (vp->v_iflag & VI_AGE) strlcat(buf, "|VI_AGE", sizeof(buf)); if (vp->v_iflag & VI_DOOMED) strlcat(buf, "|VI_DOOMED", sizeof(buf)); if (vp->v_iflag & VI_FREE) strlcat(buf, "|VI_FREE", sizeof(buf)); if (vp->v_iflag & 
VI_ACTIVE) strlcat(buf, "|VI_ACTIVE", sizeof(buf)); if (vp->v_iflag & VI_DOINGINACT) strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); if (vp->v_iflag & VI_OWEINACT) strlcat(buf, "|VI_OWEINACT", sizeof(buf)); flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE | VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } printf(" flags (%s)\n", buf + 1); if (mtx_owned(VI_MTX(vp))) printf(" VI_LOCKed"); if (vp->v_object != NULL) printf(" v_object %p ref %d pages %d " "cleanbuf %d dirtybuf %d\n", vp->v_object, vp->v_object->ref_count, vp->v_object->resident_page_count, vp->v_bufobj.bo_dirty.bv_cnt, vp->v_bufobj.bo_clean.bv_cnt); printf(" "); lockmgr_printinfo(vp->v_vnlock); if (vp->v_data != NULL) VOP_PRINT(vp); } #ifdef DDB /* * List all of the locked vnodes in the system. * Called when debugging the kernel. */ DB_SHOW_COMMAND(lockedvnods, lockedvnodes) { struct mount *mp; struct vnode *vp; /* * Note: because this is DDB, we can't obey the locking semantics * for these structures, which means we could catch an inconsistent * state and dereference a nasty pointer. Not much to be done * about that. */ db_printf("Locked vnodes\n"); TAILQ_FOREACH(mp, &mountlist, mnt_list) { TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) vprint("", vp); } } } /* * Show details about the given vnode. */ DB_SHOW_COMMAND(vnode, db_show_vnode) { struct vnode *vp; if (!have_addr) return; vp = (struct vnode *)addr; vn_printf(vp, "vnode "); } /* * Show details about the given mount point. */ DB_SHOW_COMMAND(mount, db_show_mount) { struct mount *mp; struct vfsopt *opt; struct statfs *sp; struct vnode *vp; char buf[512]; uint64_t mflags; u_int flags; if (!have_addr) { /* No address given, print short info about all mount points. 
*/ TAILQ_FOREACH(mp, &mountlist, mnt_list) { db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); if (db_pager_quit) break; } db_printf("\nMore info: show mount \n"); return; } mp = (struct mount *)addr; db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); buf[0] = '\0'; mflags = mp->mnt_flag; #define MNT_FLAG(flag) do { \ if (mflags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 4, sizeof(buf)); \ mflags &= ~(flag); \ } \ } while (0) MNT_FLAG(MNT_RDONLY); MNT_FLAG(MNT_SYNCHRONOUS); MNT_FLAG(MNT_NOEXEC); MNT_FLAG(MNT_NOSUID); MNT_FLAG(MNT_NFS4ACLS); MNT_FLAG(MNT_UNION); MNT_FLAG(MNT_ASYNC); MNT_FLAG(MNT_SUIDDIR); MNT_FLAG(MNT_SOFTDEP); MNT_FLAG(MNT_NOSYMFOLLOW); MNT_FLAG(MNT_GJOURNAL); MNT_FLAG(MNT_MULTILABEL); MNT_FLAG(MNT_ACLS); MNT_FLAG(MNT_NOATIME); MNT_FLAG(MNT_NOCLUSTERR); MNT_FLAG(MNT_NOCLUSTERW); MNT_FLAG(MNT_SUJ); MNT_FLAG(MNT_EXRDONLY); MNT_FLAG(MNT_EXPORTED); MNT_FLAG(MNT_DEFEXPORTED); MNT_FLAG(MNT_EXPORTANON); MNT_FLAG(MNT_EXKERB); MNT_FLAG(MNT_EXPUBLIC); MNT_FLAG(MNT_LOCAL); MNT_FLAG(MNT_QUOTA); MNT_FLAG(MNT_ROOTFS); MNT_FLAG(MNT_USER); MNT_FLAG(MNT_IGNORE); MNT_FLAG(MNT_UPDATE); MNT_FLAG(MNT_DELEXPORT); MNT_FLAG(MNT_RELOAD); MNT_FLAG(MNT_FORCE); MNT_FLAG(MNT_SNAPSHOT); MNT_FLAG(MNT_BYFSID); #undef MNT_FLAG if (mflags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%016jx", mflags); } db_printf(" mnt_flag = %s\n", buf); buf[0] = '\0'; flags = mp->mnt_kern_flag; #define MNT_KERN_FLAG(flag) do { \ if (flags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 5, sizeof(buf)); \ flags &= ~(flag); \ } \ } while (0) MNT_KERN_FLAG(MNTK_UNMOUNTF); MNT_KERN_FLAG(MNTK_ASYNC); MNT_KERN_FLAG(MNTK_SOFTDEP); MNT_KERN_FLAG(MNTK_NOINSMNTQ); MNT_KERN_FLAG(MNTK_DRAINING); MNT_KERN_FLAG(MNTK_REFEXPIRE); MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); MNT_KERN_FLAG(MNTK_SHARED_WRITES); MNT_KERN_FLAG(MNTK_NO_IOPF); MNT_KERN_FLAG(MNTK_VGONE_UPPER); MNT_KERN_FLAG(MNTK_VGONE_WAITER); MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT); MNT_KERN_FLAG(MNTK_MARKER); MNT_KERN_FLAG(MNTK_NOASYNC); MNT_KERN_FLAG(MNTK_UNMOUNT); MNT_KERN_FLAG(MNTK_MWAIT); MNT_KERN_FLAG(MNTK_SUSPEND); MNT_KERN_FLAG(MNTK_SUSPEND2); MNT_KERN_FLAG(MNTK_SUSPENDED); MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); MNT_KERN_FLAG(MNTK_NOKNOTE); #undef MNT_KERN_FLAG if (flags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%08x", flags); } db_printf(" mnt_kern_flag = %s\n", buf); db_printf(" mnt_opt = "); opt = TAILQ_FIRST(mp->mnt_opt); if (opt != NULL) { db_printf("%s", opt->name); opt = TAILQ_NEXT(opt, link); while (opt != NULL) { db_printf(", %s", opt->name); opt = TAILQ_NEXT(opt, link); } } db_printf("\n"); sp = &mp->mnt_stat; db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, (uintmax_t)sp->f_asyncreads, 
(u_int)sp->f_namemax, (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); db_printf(" mnt_cred = { uid=%u ruid=%u", (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); if (jailed(mp->mnt_cred)) db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); db_printf(" }\n"); db_printf(" mnt_ref = %d\n", mp->mnt_ref); db_printf(" mnt_gen = %d\n", mp->mnt_gen); db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); db_printf(" mnt_activevnodelistsize = %d\n", mp->mnt_activevnodelistsize); db_printf(" mnt_writeopcount = %d\n", mp->mnt_writeopcount); db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen); db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); db_printf(" mnt_secondary_accwrites = %d\n", mp->mnt_secondary_accwrites); db_printf(" mnt_gjprovider = %s\n", mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); db_printf("\n\nList of active vnodes\n"); TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) { if (vp->v_type != VMARKER) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } db_printf("\n\nList of inactive vnodes\n"); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } } #endif /* DDB */ /* * Fill in a struct xvfsconf based on a struct vfsconf. */ static int vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) { struct xvfsconf xvfsp; bzero(&xvfsp, sizeof(xvfsp)); strcpy(xvfsp.vfc_name, vfsp->vfc_name); xvfsp.vfc_typenum = vfsp->vfc_typenum; xvfsp.vfc_refcount = vfsp->vfc_refcount; xvfsp.vfc_flags = vfsp->vfc_flags; /* * These are unused in userland, we keep them * to not break binary compatibility. */ xvfsp.vfc_vfsops = NULL; xvfsp.vfc_next = NULL; return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } #ifdef COMPAT_FREEBSD32 struct xvfsconf32 { uint32_t vfc_vfsops; char vfc_name[MFSNAMELEN]; int32_t vfc_typenum; int32_t vfc_refcount; int32_t vfc_flags; uint32_t vfc_next; }; static int vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) { struct xvfsconf32 xvfsp; strcpy(xvfsp.vfc_name, vfsp->vfc_name); xvfsp.vfc_typenum = vfsp->vfc_typenum; xvfsp.vfc_refcount = vfsp->vfc_refcount; xvfsp.vfc_flags = vfsp->vfc_flags; xvfsp.vfc_vfsops = 0; xvfsp.vfc_next = 0; return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } #endif /* * Top level filesystem related information gathering. */ static int sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) { struct vfsconf *vfsp; int error; error = 0; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) error = vfsconf2x32(req, vfsp); else #endif error = vfsconf2x(req, vfsp); if (error) break; } vfsconf_sunlock(); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, "S,xvfsconf", "List of all configured filesystems"); #ifndef BURN_BRIDGES static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); static int vfs_sysctl(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; log(LOG_WARNING, "userland calling deprecated sysctl, " "please rebuild world\n"); #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
*/ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { if (vfsp->vfc_typenum == name[2]) break; } vfsconf_sunlock(); if (vfsp == NULL) return (EOPNOTSUPP); #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) return (vfsconf2x32(req, vfsp)); else #endif return (vfsconf2x(req, vfsp)); } return (EOPNOTSUPP); } static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&ovfs, sizeof(ovfs)); ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error != 0) { vfsconf_sunlock(); return (error); } } vfsconf_sunlock(); return (0); } #endif /* 1 || COMPAT_PRELITE2 */ #endif /* !BURN_BRIDGES */ #define KINFO_VNODESLOP 10 #ifdef notyet /* * Dump vnode list (via sysctl). */ /* ARGSUSED */ static int sysctl_vnode(SYSCTL_HANDLER_ARGS) { struct xvnode *xvn; struct mount *mp; struct vnode *vp; int error, len, n; /* * Stale numvnodes access is not fatal here. */ req->lock = 0; len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, len)); error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); n = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) continue; MNT_ILOCK(mp); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (n == len) break; vref(vp); xvn[n].xv_size = sizeof *xvn; xvn[n].xv_vnode = vp; xvn[n].xv_id = 0; /* XXX compat */ #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field XV_COPY(usecount); XV_COPY(writecount); XV_COPY(holdcnt); XV_COPY(mount); XV_COPY(numoutput); XV_COPY(type); #undef XV_COPY xvn[n].xv_flag = vp->v_vflag; switch (vp->v_type) { case VREG: case VDIR: case VLNK: break; case VBLK: case VCHR: if (vp->v_rdev == NULL) { vrele(vp); continue; } xvn[n].xv_dev = dev2udev(vp->v_rdev); break; case VSOCK: xvn[n].xv_socket = vp->v_socket; break; case VFIFO: xvn[n].xv_fifo = vp->v_fifoinfo; break; case VNON: case VBAD: default: /* shouldn't happen? */ vrele(vp); continue; } vrele(vp); ++n; } MNT_IUNLOCK(mp); mtx_lock(&mountlist_mtx); vfs_unbusy(mp); if (n == len) break; } mtx_unlock(&mountlist_mtx); error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); free(xvn, M_TEMP); return (error); } SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode", ""); #endif /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall(void) { struct mount *mp; struct thread *td; int error; CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); td = curthread; /* * Since this only runs when rebooting, it is not interlocked. 
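 * TAILQ_LAST() is used below so that mounts are removed most recently * mounted first.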
*/ while(!TAILQ_EMPTY(&mountlist)) { mp = TAILQ_LAST(&mountlist, mntlist); error = dounmount(mp, MNT_FORCE, td); if (error) { TAILQ_REMOVE(&mountlist, mp, mnt_list); /* * XXX: Due to the way in which we mount the root * file system off of devfs, devfs will generate a * "busy" warning when we try to unmount it before * the root. Don't print a warning as a result in * order to avoid false positive errors that may * cause needless upset. */ if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) { printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } } else { /* The unmount has removed mp from the mountlist */ } } } /* * Perform msync on all vnodes under a mount point. * The mount point must be locked. */ void vfs_msync(struct mount *mp, int flags) { struct vnode *vp, *mvp; struct vm_object *obj; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) { obj = vp->v_object; if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 && (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) { if (!vget(vp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, curthread)) { if (vp->v_vflag & VV_NOSYNC) { /* unlinked */ vput(vp); continue; } obj = vp->v_object; if (obj != NULL) { VM_OBJECT_WLOCK(obj); vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC); VM_OBJECT_WUNLOCK(obj); } vput(vp); } } else VI_UNLOCK(vp); } } static void destroy_vpollinfo_free(struct vpollinfo *vi) { knlist_destroy(&vi->vpi_selinfo.si_note); mtx_destroy(&vi->vpi_lock); uma_zfree(vnodepoll_zone, vi); } static void destroy_vpollinfo(struct vpollinfo *vi) { knlist_clear(&vi->vpi_selinfo.si_note, 1); seldrain(&vi->vpi_selinfo); destroy_vpollinfo_free(vi); } /* * Initialize per-vnode helper structure to hold poll-related state. */ void v_addpollinfo(struct vnode *vp) { struct vpollinfo *vi; if (vp->v_pollinfo != NULL) return; vi = uma_zalloc(vnodepoll_zone, M_WAITOK); mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked); VI_LOCK(vp); if (vp->v_pollinfo != NULL) { VI_UNLOCK(vp); destroy_vpollinfo_free(vi); return; } vp->v_pollinfo = vi; VI_UNLOCK(vp); } /* * Record a process's interest in events which might happen to * a vnode. Because poll uses the historic select-style interface * internally, this routine serves as both the ``check for any * pending events'' and the ``record my interest in future events'' * functions. (These are done together, while the lock is held, * to avoid race conditions.) */ int vn_pollrecord(struct vnode *vp, struct thread *td, int events) { v_addpollinfo(vp); mtx_lock(&vp->v_pollinfo->vpi_lock); if (vp->v_pollinfo->vpi_revents & events) { /* * This leaves events we are not interested * in available for the other process which * presumably had requested them * (otherwise they would never have been * recorded). */ events &= vp->v_pollinfo->vpi_revents; vp->v_pollinfo->vpi_revents &= ~events; mtx_unlock(&vp->v_pollinfo->vpi_lock); return (events); } vp->v_pollinfo->vpi_events |= events; selrecord(td, &vp->v_pollinfo->vpi_selinfo); mtx_unlock(&vp->v_pollinfo->vpi_lock); return (0); } /* * Routine to create and manage a filesystem syncer vnode.
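 * Each mounted filesystem normally gets one such pseudo-vnode; sched_sync() * fsyncs it with MNT_LAZY, which sync_fsync() below turns into a vfs_msync() * and a VFS_SYNC(mp, MNT_LAZY) pass over the whole mount.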
*/ #define sync_close ((int (*)(struct vop_close_args *))nullop) static int sync_fsync(struct vop_fsync_args *); static int sync_inactive(struct vop_inactive_args *); static int sync_reclaim(struct vop_reclaim_args *); static struct vop_vector sync_vnodeops = { .vop_bypass = VOP_EOPNOTSUPP, .vop_close = sync_close, /* close */ .vop_fsync = sync_fsync, /* fsync */ .vop_inactive = sync_inactive, /* inactive */ .vop_reclaim = sync_reclaim, /* reclaim */ .vop_lock1 = vop_stdlock, /* lock */ .vop_unlock = vop_stdunlock, /* unlock */ .vop_islocked = vop_stdislocked, /* islocked */ }; /* * Create a new filesystem syncer vnode for the specified mount point. */ void vfs_allocate_syncvnode(struct mount *mp) { struct vnode *vp; struct bufobj *bo; static long start, incr, next; int error; /* Allocate a new vnode */ error = getnewvnode("syncer", mp, &sync_vnodeops, &vp); if (error != 0) panic("vfs_allocate_syncvnode: getnewvnode() failed"); vp->v_type = VNON; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_vflag |= VV_FORCEINSMQ; error = insmntque(vp, mp); if (error != 0) panic("vfs_allocate_syncvnode: insmntque() failed"); vp->v_vflag &= ~VV_FORCEINSMQ; VOP_UNLOCK(vp, 0); /* * Place the vnode onto the syncer worklist. We attempt to * scatter them about on the list so that they will go off * at evenly distributed times even if all the filesystems * are mounted at once. */ next += incr; if (next == 0 || next > syncer_maxdelay) { start /= 2; incr /= 2; if (start == 0) { start = syncer_maxdelay / 2; incr = syncer_maxdelay; } next = start; } bo = &vp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ mtx_lock(&sync_mtx); sync_vnode_count++; if (mp->mnt_syncer == NULL) { mp->mnt_syncer = vp; vp = NULL; } mtx_unlock(&sync_mtx); BO_UNLOCK(bo); if (vp != NULL) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vgone(vp); vput(vp); } } void vfs_deallocate_syncvnode(struct mount *mp) { struct vnode *vp; mtx_lock(&sync_mtx); vp = mp->mnt_syncer; if (vp != NULL) mp->mnt_syncer = NULL; mtx_unlock(&sync_mtx); if (vp != NULL) vrele(vp); } /* * Do a lazy sync of the filesystem. */ static int sync_fsync(struct vop_fsync_args *ap) { struct vnode *syncvp = ap->a_vp; struct mount *mp = syncvp->v_mount; int error, save; struct bufobj *bo; /* * We only need to do something if this is a lazy evaluation. */ if (ap->a_waitfor != MNT_LAZY) return (0); /* * Move ourselves to the back of the sync list. */ bo = &syncvp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay); BO_UNLOCK(bo); /* * Walk the list of vnodes pushing all that are dirty and * not already on the sync list. */ if (vfs_busy(mp, MBF_NOWAIT) != 0) return (0); if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { vfs_unbusy(mp); return (0); } save = curthread_pflags_set(TDP_SYNCIO); vfs_msync(mp, MNT_NOWAIT); error = VFS_SYNC(mp, MNT_LAZY); curthread_pflags_restore(save); vn_finished_write(mp); vfs_unbusy(mp); return (error); } /* * The syncer vnode is no referenced. */ static int sync_inactive(struct vop_inactive_args *ap) { vgone(ap->a_vp); return (0); } /* * The syncer vnode is no longer needed and is being decommissioned. * * Modifications to the worklist must be protected by sync_mtx. 
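/*
 * Illustrative standalone sketch (not part of this revision) of the
 * start/incr/next arithmetic in vfs_allocate_syncvnode() above: each
 * new syncer vnode gets a different worklist slot so filesystems
 * mounted back-to-back do not all sync in the same tick.  The two
 * tunables below are stand-ins for the kernel's syncer_maxdelay and
 * syncdelay.
 */
#include <stdio.h>

static long syncer_maxdelay = 32;    /* illustrative value */
static long syncdelay = 30;          /* illustrative value */

static long
next_sync_slot(void)
{
    static long start, incr, next;

    next += incr;
    if (next == 0 || next > syncer_maxdelay) {
        start /= 2;
        incr /= 2;
        if (start == 0) {
            start = syncer_maxdelay / 2;
            incr = syncer_maxdelay;
        }
        next = start;
    }
    return (syncdelay > 0 ? next % syncdelay : 0);
}

int
main(void)
{
    for (int i = 0; i < 8; i++)
        printf("mount %d -> slot %ld\n", i, next_sync_slot());
    return (0);
}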
*/ static int sync_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct bufobj *bo; bo = &vp->v_bufobj; BO_LOCK(bo); mtx_lock(&sync_mtx); if (vp->v_mount->mnt_syncer == vp) vp->v_mount->mnt_syncer = NULL; if (bo->bo_flag & BO_ONWORKLST) { LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; sync_vnode_count--; bo->bo_flag &= ~BO_ONWORKLST; } mtx_unlock(&sync_mtx); BO_UNLOCK(bo); return (0); } /* * Check if vnode represents a disk device */ int vn_isdisk(struct vnode *vp, int *errp) { int error; if (vp->v_type != VCHR) { error = ENOTBLK; goto out; } error = 0; dev_lock(); if (vp->v_rdev == NULL) error = ENXIO; else if (vp->v_rdev->si_devsw == NULL) error = ENXIO; else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) error = ENOTBLK; dev_unlock(); out: if (errp != NULL) *errp = error; return (error == 0); } /* * Common filesystem object access control check routine. Accepts a * vnode's type, "mode", uid and gid, requested access mode, credentials, * and optional call-by-reference privused argument allowing vaccess() * to indicate to the caller whether privilege was used to satisfy the * request (obsoleted). Returns 0 on success, or an errno on failure. */ int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, accmode_t accmode, struct ucred *cred, int *privused) { accmode_t dac_granted; accmode_t priv_granted; KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, ("invalid bit in accmode")); KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), ("VAPPEND without VWRITE")); /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. */ if (privused != NULL) *privused = 0; dac_granted = 0; /* Check the owner. */ if (cred->cr_uid == file_uid) { dac_granted |= VADMIN; if (file_mode & S_IXUSR) dac_granted |= VEXEC; if (file_mode & S_IRUSR) dac_granted |= VREAD; if (file_mode & S_IWUSR) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) dac_granted |= VEXEC; if (file_mode & S_IRGRP) dac_granted |= VREAD; if (file_mode & S_IWGRP) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) dac_granted |= VEXEC; if (file_mode & S_IROTH) dac_granted |= VREAD; if (file_mode & S_IWOTH) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); privcheck: /* * Build a privilege mask to determine if the set of privileges * satisfies the requirements when combined with the granted mask * from above. For each privilege, if the privilege is required, * bitwise or the request type onto the priv_granted mask. */ priv_granted = 0; if (type == VDIR) { /* * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC * requests, instead of PRIV_VFS_EXEC. */ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0)) priv_granted |= VEXEC; } else { /* * Ensure that at least one execute bit is on. Otherwise, * a privileged user will always succeed, and we don't want * this to happen unless the file really is executable. 
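/*
 * Illustrative userland sketch (not part of this revision): a
 * simplified re-derivation of the owner/group/other logic in vaccess()
 * above, using only the classic mode bits.  It deliberately omits the
 * privilege fallback and the VADMIN/VAPPEND refinements; like
 * vaccess(), only the first matching class is consulted.
 */
#include <sys/types.h>
#include <stdbool.h>
#include <stdio.h>

static bool
dac_allows(mode_t file_mode, uid_t file_uid, gid_t file_gid,
    uid_t uid, gid_t gid, bool want_read, bool want_write, bool want_exec)
{
    mode_t granted;

    if (uid == file_uid)
        granted = (file_mode >> 6) & 0x7;   /* owner class */
    else if (gid == file_gid)
        granted = (file_mode >> 3) & 0x7;   /* group class */
    else
        granted = file_mode & 0x7;          /* other class */

    if (want_read && (granted & 0x4) == 0)
        return (false);
    if (want_write && (granted & 0x2) == 0)
        return (false);
    if (want_exec && (granted & 0x1) == 0)
        return (false);
    return (true);
}

int
main(void)
{
    /* 0640 file owned by uid 0/gid 0; can uid 1001 in gid 0 read it? */
    printf("%s\n", dac_allows(0640, 0, 0, 1001, 0, true, false, false) ?
        "allowed" : "denied");
    return (0);
}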
*/ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && !priv_check_cred(cred, PRIV_VFS_EXEC, 0)) priv_granted |= VEXEC; } if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && !priv_check_cred(cred, PRIV_VFS_READ, 0)) priv_granted |= VREAD; if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && !priv_check_cred(cred, PRIV_VFS_WRITE, 0)) priv_granted |= (VWRITE | VAPPEND); if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && !priv_check_cred(cred, PRIV_VFS_ADMIN, 0)) priv_granted |= VADMIN; if ((accmode & (priv_granted | dac_granted)) == accmode) { /* XXX audit: privilege used */ if (privused != NULL) *privused = 1; return (0); } return ((accmode & VADMIN) ? EPERM : EACCES); } /* * Credential check based on process requesting service, and per-attribute * permissions. */ int extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, struct thread *td, accmode_t accmode) { /* * Kernel-invoked always succeeds. */ if (cred == NOCRED) return (0); /* * Do not allow privileged processes in jail to directly manipulate * system attributes. */ switch (attrnamespace) { case EXTATTR_NAMESPACE_SYSTEM: /* Potentially should be: return (EPERM); */ return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0)); case EXTATTR_NAMESPACE_USER: return (VOP_ACCESS(vp, accmode, cred, td)); default: return (EPERM); } } #ifdef DEBUG_VFS_LOCKS /* * This only exists to supress warnings from unlocked specfs accesses. It is * no longer ok to have an unlocked VFS. */ #define IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL || \ (vp)->v_type == VCHR || (vp)->v_type == VBAD) int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, "Drop into debugger on lock violation"); int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, "Check for interlock across VOPs"); int vfs_badlock_print = 1; /* Print lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, "Print lock violations"); #ifdef KDB int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. 
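/*
 * Illustrative userland sketch (not part of this revision): the
 * EXTATTR_NAMESPACE_USER branch that extattr_check_cred() above
 * authorizes with a plain VOP_ACCESS() check.  File name and
 * attribute name are purely examples.
 */
#include <sys/types.h>
#include <sys/extattr.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
    const char *path = "/tmp/example";
    const char *value = "demo";
    char buf[64];
    ssize_t n;

    if (extattr_set_file(path, EXTATTR_NAMESPACE_USER, "comment",
        value, strlen(value)) < 0) {
        perror("extattr_set_file");
        return (1);
    }
    n = extattr_get_file(path, EXTATTR_NAMESPACE_USER, "comment",
        buf, sizeof(buf) - 1);
    if (n < 0) {
        perror("extattr_get_file");
        return (1);
    }
    buf[n] = '\0';
    printf("user.comment = %s\n", buf);
    return (0);
}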
*/ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); #endif static void vfs_badlock(const char *msg, const char *str, struct vnode *vp) { #ifdef KDB if (vfs_badlock_backtrace) kdb_backtrace(); #endif if (vfs_badlock_print) printf("%s: %p %s\n", str, (void *)vp, msg); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } void assert_vi_locked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is not locked but should be", str, vp); } void assert_vi_unlocked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is locked but should not be", str, vp); } void assert_vop_locked(struct vnode *vp, const char *str) { int locked; if (!IGNORE_LOCK(vp)) { locked = VOP_ISLOCKED(vp); if (locked == 0 || locked == LK_EXCLOTHER) vfs_badlock("is not locked but should be", str, vp); } } void assert_vop_unlocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) vfs_badlock("is locked but should not be", str, vp); } void assert_vop_elocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) vfs_badlock("is not exclusive locked but should be", str, vp); } #if 0 void assert_vop_elocked_other(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER) vfs_badlock("is not exclusive locked by another thread", str, vp); } void assert_vop_slocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED) vfs_badlock("is not locked shared but should be", str, vp); } #endif /* 0 */ #endif /* DEBUG_VFS_LOCKS */ void vop_rename_fail(struct vop_rename_args *ap) { if (ap->a_tvp != NULL) vput(ap->a_tvp); if (ap->a_tdvp == ap->a_tvp) vrele(ap->a_tdvp); else vput(ap->a_tdvp); vrele(ap->a_fdvp); vrele(ap->a_fvp); } void vop_rename_pre(void *ap) { struct vop_rename_args *a = ap; #ifdef DEBUG_VFS_LOCKS if (a->a_tvp) ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); /* Check the source (from). */ if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); /* Check the target. */ if (a->a_tvp) ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); #endif if (a->a_tdvp != a->a_fdvp) vhold(a->a_fdvp); if (a->a_tvp != a->a_fvp) vhold(a->a_fvp); vhold(a->a_tdvp); if (a->a_tvp) vhold(a->a_tvp); } void vop_strategy_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_strategy_args *a; struct buf *bp; a = ap; bp = a->a_bp; /* * Cluster ops lock their component buffers but not the IO container. 
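/*
 * Illustrative userland sketch (not part of this revision): the
 * DEBUG_VFS_LOCKS knobs above are exported under the debug sysctl
 * tree, e.g. debug.vfs_badlock_print.  This reads the current value;
 * the OID only exists on kernels built with DEBUG_VFS_LOCKS.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
    int val;
    size_t len = sizeof(val);

    if (sysctlbyname("debug.vfs_badlock_print", &val, &len,
        NULL, 0) == -1) {
        perror("sysctlbyname");
        return (1);
    }
    printf("debug.vfs_badlock_print = %d\n", val);
    return (0);
}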
*/ if ((bp->b_flags & B_CLUSTER) != 0) return; if (panicstr == NULL && !BUF_ISLOCKED(bp)) { if (vfs_badlock_print) printf( "VOP_STRATEGY: bp is not locked but should be\n"); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } #endif } void vop_lock_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_lock1_args *a = ap; if ((a->a_flags & LK_INTERLOCK) == 0) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); else ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); #endif } void vop_lock_post(void *ap, int rc) { #ifdef DEBUG_VFS_LOCKS struct vop_lock1_args *a = ap; ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); #endif } void vop_unlock_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_unlock_args *a = ap; if (a->a_flags & LK_INTERLOCK) ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK"); ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); #endif } void vop_unlock_post(void *ap, int rc) { #ifdef DEBUG_VFS_LOCKS struct vop_unlock_args *a = ap; if (a->a_flags & LK_INTERLOCK) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK"); #endif } void vop_create_post(void *ap, int rc) { struct vop_create_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } void vop_deleteextattr_post(void *ap, int rc) { struct vop_deleteextattr_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_link_post(void *ap, int rc) { struct vop_link_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK); VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE); } } void vop_mkdir_post(void *ap, int rc) { struct vop_mkdir_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); } void vop_mknod_post(void *ap, int rc) { struct vop_mknod_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } void vop_remove_post(void *ap, int rc) { struct vop_remove_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); } } void vop_rename_post(void *ap, int rc) { struct vop_rename_args *a = ap; if (!rc) { VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE); VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE); VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); if (a->a_tvp) VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); } if (a->a_tdvp != a->a_fdvp) vdrop(a->a_fdvp); if (a->a_tvp != a->a_fvp) vdrop(a->a_fvp); vdrop(a->a_tdvp); if (a->a_tvp) vdrop(a->a_tvp); } void vop_rmdir_post(void *ap, int rc) { struct vop_rmdir_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); } } void vop_setattr_post(void *ap, int rc) { struct vop_setattr_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_setextattr_post(void *ap, int rc) { struct vop_setextattr_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_symlink_post(void *ap, int rc) { struct vop_symlink_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } static struct knlist fs_knlist; static void vfs_event_init(void *arg) { knlist_init_mtx(&fs_knlist, NULL); } /* XXX - correct order? 
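/*
 * Illustrative userland sketch (not part of this revision): the
 * vop_*_post() hooks above deliver NOTE_WRITE, NOTE_DELETE,
 * NOTE_ATTRIB, NOTE_RENAME, ... to EVFILT_VNODE knotes.  Watch a file
 * for those notes; the path is only an example.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    struct kevent ev;
    int kq, fd;

    fd = open("/tmp/example", O_RDONLY);
    kq = kqueue();
    if (fd < 0 || kq < 0) {
        perror("open/kqueue");
        return (1);
    }
    EV_SET(&ev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
        NOTE_WRITE | NOTE_DELETE | NOTE_ATTRIB | NOTE_RENAME, 0, NULL);
    if (kevent(kq, &ev, 1, NULL, 0, NULL) < 0) {
        perror("kevent register");
        return (1);
    }
    /* Block until another process touches the file. */
    if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1)
        printf("vnode event fflags=%#x\n", (unsigned)ev.fflags);
    close(fd);
    close(kq);
    return (0);
}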
*/ SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); void vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) { KNOTE_UNLOCKED(&fs_knlist, event); } static int filt_fsattach(struct knote *kn); static void filt_fsdetach(struct knote *kn); static int filt_fsevent(struct knote *kn, long hint); struct filterops fs_filtops = { .f_isfd = 0, .f_attach = filt_fsattach, .f_detach = filt_fsdetach, .f_event = filt_fsevent }; static int filt_fsattach(struct knote *kn) { kn->kn_flags |= EV_CLEAR; knlist_add(&fs_knlist, kn, 0); return (0); } static void filt_fsdetach(struct knote *kn) { knlist_remove(&fs_knlist, kn, 0); } static int filt_fsevent(struct knote *kn, long hint) { kn->kn_fflags |= hint; return (kn->kn_fflags != 0); } static int sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) { struct vfsidctl vc; int error; struct mount *mp; error = SYSCTL_IN(req, &vc, sizeof(vc)); if (error) return (error); if (vc.vc_vers != VFS_CTL_VERS1) return (EINVAL); mp = vfs_getvfs(&vc.vc_fsid); if (mp == NULL) return (ENOENT); /* ensure that a specific sysctl goes to the right filesystem. */ if (strcmp(vc.vc_fstypename, "*") != 0 && strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { vfs_rel(mp); return (EINVAL); } VCTLTOREQ(&vc, req); error = VFS_SYSCTL(mp, vc.vc_op, req); vfs_rel(mp); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR, NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid"); /* * Function to initialize a va_filerev field sensibly. * XXX: Wouldn't a random number make a lot more sense ?? */ u_quad_t init_va_filerev(void) { struct bintime bt; getbinuptime(&bt); return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); } static int filt_vfsread(struct knote *kn, long hint); static int filt_vfswrite(struct knote *kn, long hint); static int filt_vfsvnode(struct knote *kn, long hint); static void filt_vfsdetach(struct knote *kn); static struct filterops vfsread_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsread }; static struct filterops vfswrite_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfswrite }; static struct filterops vfsvnode_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsvnode }; static void vfs_knllock(void *arg) { struct vnode *vp = arg; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } static void vfs_knlunlock(void *arg) { struct vnode *vp = arg; VOP_UNLOCK(vp, 0); } static void vfs_knl_assert_locked(void *arg) { #ifdef DEBUG_VFS_LOCKS struct vnode *vp = arg; ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); #endif } static void vfs_knl_assert_unlocked(void *arg) { #ifdef DEBUG_VFS_LOCKS struct vnode *vp = arg; ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); #endif } int vfs_kqfilter(struct vop_kqfilter_args *ap) { struct vnode *vp = ap->a_vp; struct knote *kn = ap->a_kn; struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &vfsread_filtops; break; case EVFILT_WRITE: kn->kn_fop = &vfswrite_filtops; break; case EVFILT_VNODE: kn->kn_fop = &vfsvnode_filtops; break; default: return (EINVAL); } kn->kn_hook = (caddr_t)vp; v_addpollinfo(vp); if (vp->v_pollinfo == NULL) return (ENOMEM); knl = &vp->v_pollinfo->vpi_selinfo.si_note; vhold(vp); knlist_add(knl, kn, 0); return (0); } /* * Detach knote from vnode */ static void filt_vfsdetach(struct knote *kn) { struct vnode *vp = (struct vnode *)kn->kn_hook; KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); vdrop(vp); } /*ARGSUSED*/ static int 
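/*
 * Illustrative userland sketch (not part of this revision):
 * vfs_event_signal() above posts events to the global fs_knlist, which
 * userland can receive through an EVFILT_FS knote.  The ident is
 * unused for this filter (f_isfd is 0); the VQ_* bits come from
 * sys/mount.h.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/mount.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    struct kevent ev;
    int kq;

    kq = kqueue();
    if (kq < 0) {
        perror("kqueue");
        return (1);
    }
    EV_SET(&ev, 0, EVFILT_FS, EV_ADD | EV_CLEAR, 0, 0, NULL);
    if (kevent(kq, &ev, 1, NULL, 0, NULL) < 0) {
        perror("kevent register");
        return (1);
    }
    /* Block until a filesystem event (e.g. mount or unmount) arrives. */
    if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1)
        printf("fs event fflags=%#x%s%s\n", (unsigned)ev.fflags,
            (ev.fflags & VQ_MOUNT) ? " (mount)" : "",
            (ev.fflags & VQ_UNMOUNT) ? " (unmount)" : "");
    close(kq);
    return (0);
}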
filt_vfsread(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; struct vattr va; int res; /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE) { VI_LOCK(vp); kn->kn_flags |= (EV_EOF | EV_ONESHOT); VI_UNLOCK(vp); return (1); } if (VOP_GETATTR(vp, &va, curthread->td_ucred)) return (0); VI_LOCK(vp); kn->kn_data = va.va_size - kn->kn_fp->f_offset; res = (kn->kn_data != 0); VI_UNLOCK(vp); return (res); } /*ARGSUSED*/ static int filt_vfswrite(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; VI_LOCK(vp); /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE) kn->kn_flags |= (EV_EOF | EV_ONESHOT); kn->kn_data = 0; VI_UNLOCK(vp); return (1); } static int filt_vfsvnode(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; int res; VI_LOCK(vp); if (kn->kn_sfflags & hint) kn->kn_fflags |= hint; if (hint == NOTE_REVOKE) { kn->kn_flags |= EV_EOF; VI_UNLOCK(vp); return (1); } res = (kn->kn_fflags != 0); VI_UNLOCK(vp); return (res); } int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) { int error; if (dp->d_reclen > ap->a_uio->uio_resid) return (ENAMETOOLONG); error = uiomove(dp, dp->d_reclen, ap->a_uio); if (error) { if (ap->a_ncookies != NULL) { if (ap->a_cookies != NULL) free(ap->a_cookies, M_TEMP); ap->a_cookies = NULL; *ap->a_ncookies = 0; } return (error); } if (ap->a_ncookies == NULL) return (0); KASSERT(ap->a_cookies, ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); *ap->a_cookies = realloc(*ap->a_cookies, (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO); (*ap->a_cookies)[*ap->a_ncookies] = off; return (0); } /* * Mark for update the access time of the file if the filesystem * supports VOP_MARKATIME. This functionality is used by execve and * mmap, so we want to avoid the I/O implied by directly setting * va_atime for the sake of efficiency. */ void vfs_mark_atime(struct vnode *vp, struct ucred *cred) { struct mount *mp; mp = vp->v_mount; ASSERT_VOP_LOCKED(vp, "vfs_mark_atime"); if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) (void)VOP_MARKATIME(vp); } /* * The purpose of this routine is to remove granularity from accmode_t, * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, * VADMIN and VAPPEND. * * If it returns 0, the caller is supposed to continue with the usual * access checks using 'accmode' as modified by this routine. If it * returns nonzero value, the caller is supposed to return that value * as errno. * * Note that after this routine runs, accmode may be zero. */ int vfs_unixify_accmode(accmode_t *accmode) { /* * There is no way to specify explicit "deny" rule using * file mode or POSIX.1e ACLs. */ if (*accmode & VEXPLICIT_DENY) { *accmode = 0; return (0); } /* * None of these can be translated into usual access bits. * Also, the common case for NFSv4 ACLs is to not contain * either of these bits. Caller should check for VWRITE * on the containing directory instead. */ if (*accmode & (VDELETE_CHILD | VDELETE)) return (EPERM); if (*accmode & VADMIN_PERMS) { *accmode &= ~VADMIN_PERMS; *accmode |= VADMIN; } /* * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL * or VSYNCHRONIZE using file mode or POSIX.1e ACL. */ *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); return (0); } /* * These are helper functions for filesystems to traverse all * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. 
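/*
 * Illustrative userland sketch (not part of this revision) of the
 * filt_vfsread() contract above: for a regular file the returned
 * kevent data field is va_size - f_offset, i.e. the bytes remaining
 * past the descriptor's current offset.  /etc/hosts is only an
 * example file.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    struct kevent ev;
    int kq, fd;

    fd = open("/etc/hosts", O_RDONLY);
    kq = kqueue();
    if (fd < 0 || kq < 0) {
        perror("open/kqueue");
        return (1);
    }
    lseek(fd, 10, SEEK_SET);    /* skip the first 10 bytes */
    EV_SET(&ev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
    if (kevent(kq, &ev, 1, &ev, 1, NULL) == 1)    /* register and poll */
        printf("bytes past current offset: %jd\n", (intmax_t)ev.data);
    close(fd);
    close(kq);
    return (0);
}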
* * This interface replaces MNT_VNODE_FOREACH. */ MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); struct vnode * __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) { struct vnode *vp; if (should_yield()) kern_yield(PRI_USER); MNT_ILOCK(mp); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); vp = TAILQ_NEXT(*mvp, v_nmntvnodes); while (vp != NULL && (vp->v_type == VMARKER || (vp->v_iflag & VI_DOOMED) != 0)) vp = TAILQ_NEXT(vp, v_nmntvnodes); /* Check if we are done */ if (vp == NULL) { __mnt_vnode_markerfree_all(mvp, mp); /* MNT_IUNLOCK(mp); -- done in above function */ mtx_assert(MNT_MTX(mp), MA_NOTOWNED); return (NULL); } TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); VI_LOCK(vp); MNT_IUNLOCK(mp); return (vp); } struct vnode * __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) { struct vnode *vp; *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); MNT_ILOCK(mp); MNT_REF(mp); (*mvp)->v_type = VMARKER; vp = TAILQ_FIRST(&mp->mnt_nvnodelist); while (vp != NULL && (vp->v_type == VMARKER || (vp->v_iflag & VI_DOOMED) != 0)) vp = TAILQ_NEXT(vp, v_nmntvnodes); /* Check if we are done */ if (vp == NULL) { MNT_REL(mp); MNT_IUNLOCK(mp); free(*mvp, M_VNODE_MARKER); *mvp = NULL; return (NULL); } (*mvp)->v_mount = mp; TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); VI_LOCK(vp); MNT_IUNLOCK(mp); return (vp); } void __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) { MNT_IUNLOCK(mp); return; } mtx_assert(MNT_MTX(mp), MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); MNT_REL(mp); MNT_IUNLOCK(mp); free(*mvp, M_VNODE_MARKER); *mvp = NULL; } /* * These are helper functions for filesystems to traverse their * active vnodes. 
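/*
 * Illustrative userland sketch (not part of this revision) of the
 * marker pattern used by the __mnt_vnode_*_all() iterators above: a
 * dummy marker node stays in the list to remember the iteration
 * position, so the list lock can be dropped between steps without
 * losing the cursor.  Types and names here are made up for the
 * example, not the kernel's.
 */
#include <sys/queue.h>
#include <stdio.h>

struct node {
    int val;                    /* payload; < 0 marks a marker node */
    TAILQ_ENTRY(node) link;
};
TAILQ_HEAD(nodelist, node);

static struct node *
next_after_marker(struct nodelist *list, struct node *marker)
{
    struct node *n;

    n = TAILQ_NEXT(marker, link);
    while (n != NULL && n->val < 0)     /* skip other markers */
        n = TAILQ_NEXT(n, link);
    TAILQ_REMOVE(list, marker, link);
    if (n != NULL)                      /* re-park the marker after n */
        TAILQ_INSERT_AFTER(list, n, marker, link);
    return (n);
}

int
main(void)
{
    struct nodelist list = TAILQ_HEAD_INITIALIZER(list);
    struct node nodes[4], marker = { .val = -1 };
    struct node *n;

    for (int i = 0; i < 4; i++) {
        nodes[i].val = i;
        TAILQ_INSERT_TAIL(&list, &nodes[i], link);
    }
    TAILQ_INSERT_HEAD(&list, &marker, link);
    while ((n = next_after_marker(&list, &marker)) != NULL)
        printf("visited %d\n", n->val);     /* prints 0 1 2 3 */
    return (0);
}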
See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h */ static void mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp) { KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); MNT_ILOCK(mp); MNT_REL(mp); MNT_IUNLOCK(mp); free(*mvp, M_VNODE_MARKER); *mvp = NULL; } static struct vnode * mnt_vnode_next_active(struct vnode **mvp, struct mount *mp) { struct vnode *vp, *nvp; mtx_assert(&vnode_free_list_mtx, MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); restart: vp = TAILQ_NEXT(*mvp, v_actfreelist); TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); while (vp != NULL) { if (vp->v_type == VMARKER) { vp = TAILQ_NEXT(vp, v_actfreelist); continue; } if (!VI_TRYLOCK(vp)) { if (mp_ncpus == 1 || should_yield()) { TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist); mtx_unlock(&vnode_free_list_mtx); pause("vnacti", 1); mtx_lock(&vnode_free_list_mtx); goto restart; } continue; } KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); KASSERT(vp->v_mount == mp || vp->v_mount == NULL, ("alien vnode on the active list %p %p", vp, mp)); if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0) break; nvp = TAILQ_NEXT(vp, v_actfreelist); VI_UNLOCK(vp); vp = nvp; } /* Check if we are done */ if (vp == NULL) { mtx_unlock(&vnode_free_list_mtx); mnt_vnode_markerfree_active(mvp, mp); return (NULL); } TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist); mtx_unlock(&vnode_free_list_mtx); ASSERT_VI_LOCKED(vp, "active iter"); KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp)); return (vp); } struct vnode * __mnt_vnode_next_active(struct vnode **mvp, struct mount *mp) { if (should_yield()) kern_yield(PRI_USER); mtx_lock(&vnode_free_list_mtx); return (mnt_vnode_next_active(mvp, mp)); } struct vnode * __mnt_vnode_first_active(struct vnode **mvp, struct mount *mp) { struct vnode *vp; *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); MNT_ILOCK(mp); MNT_REF(mp); MNT_IUNLOCK(mp); (*mvp)->v_type = VMARKER; (*mvp)->v_mount = mp; mtx_lock(&vnode_free_list_mtx); vp = TAILQ_FIRST(&mp->mnt_activevnodelist); if (vp == NULL) { mtx_unlock(&vnode_free_list_mtx); mnt_vnode_markerfree_active(mvp, mp); return (NULL); } TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist); return (mnt_vnode_next_active(mvp, mp)); } void __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) return; mtx_lock(&vnode_free_list_mtx); TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); mtx_unlock(&vnode_free_list_mtx); mnt_vnode_markerfree_active(mvp, mp); } Index: projects/building-blocks/sys =================================================================== --- projects/building-blocks/sys (revision 277723) +++ projects/building-blocks/sys (revision 277724) Property changes on: projects/building-blocks/sys ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys:r277711-277723 Index: projects/building-blocks =================================================================== --- projects/building-blocks (revision 277723) +++ projects/building-blocks (revision 277724) Property changes on: projects/building-blocks ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r277711-277723