Index: head/sys/amd64/amd64/db_disasm.c
===================================================================
--- head/sys/amd64/amd64/db_disasm.c	(revision 314067)
+++ head/sys/amd64/amd64/db_disasm.c	(revision 314068)
@@ -1,1727 +1,1727 @@
/*-
 * Mach Operating System
 * Copyright (c) 1991,1990 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

#include
__FBSDID("$FreeBSD$");

/*
 * Instruction disassembler.
 */
#include
#include
#include
#include
#include

/*
 * Size attributes
 */
#define BYTE    0
#define WORD    1
#define LONG    2
#define QUAD    3
#define SNGL    4
#define DBLR    5
#define EXTR    6
#define SDEP    7
#define ADEP    8
#define ESC     9
#define NONE    10

/*
 * REX prefix and bits
 */
#define REX_B   1
#define REX_X   2
#define REX_R   4
#define REX_W   8
#define REX     0x40

/*
 * Addressing modes
 */
#define E       1               /* general effective address */
#define Eind    2               /* indirect address (jump, call) */
#define Ew      3               /* address, word size */
#define Eb      4               /* address, byte size */
#define R       5               /* register, in 'reg' field */
#define Rw      6               /* word register, in 'reg' field */
#define Rq      39              /* quad register, in 'reg' field */
#define Rv      40              /* register in 'r/m' field */
#define Ri      7               /* register in instruction */
#define S       8               /* segment reg, in 'reg' field */
#define Si      9               /* segment reg, in instruction */
#define A       10              /* accumulator */
#define BX      11              /* (bx) */
#define CL      12              /* cl, for shifts */
#define DX      13              /* dx, for IO */
#define SI      14              /* si */
#define DI      15              /* di */
#define CR      16              /* control register */
#define DR      17              /* debug register */
#define TR      18              /* test register */
#define I       19              /* immediate, unsigned */
#define Is      20              /* immediate, signed */
#define Ib      21              /* byte immediate, unsigned */
#define Ibs     22              /* byte immediate, signed */
#define Iw      23              /* word immediate, unsigned */
#define Ilq     24              /* long/quad immediate, unsigned */
#define O       25              /* direct address */
#define Db      26              /* byte displacement from EIP */
#define Dl      27              /* long displacement from EIP */
#define o1      28              /* constant 1 */
#define o3      29              /* constant 3 */
#define OS      30              /* immediate offset/segment */
#define ST      31              /* FP stack top */
#define STI     32              /* FP stack */
#define X       33              /* extended FP op */
#define XA      34              /* for 'fstcw %ax' */
#define El      35              /* address, long/quad size */
#define Ril     36              /* long register in instruction */
#define Iba     37              /* byte immediate, don't print if 0xa */
#define EL      38              /* address, explicitly long size */

struct inst {
    const char *    i_name;         /* name */
    short           i_has_modrm;    /* has regmodrm byte */
    short           i_size;         /* operand size */
    int             i_mode;         /* addressing modes */
    const void *    i_extra;        /* pointer to extra opcode table */
};

#define op1(x)          (x)
#define op2(x,y)        ((x)|((y)<<8))
#define op3(x,y,z)
((x)|((y)<<8)|((z)<<16)) struct finst { const char * f_name; /* name for memory instruction */ int f_size; /* size for memory instruction */ int f_rrmode; /* mode for rr instruction */ const void * f_rrname; /* name for rr instruction (or pointer to table) */ }; static const struct inst db_inst_0f388x[] = { /*80*/ { "", TRUE, SDEP, op2(E, Rq), "invept" }, /*81*/ { "", TRUE, SDEP, op2(E, Rq), "invvpid" }, /*82*/ { "", TRUE, SDEP, op2(E, Rq), "invpcid" }, /*83*/ { "", FALSE, NONE, 0, 0 }, /*84*/ { "", FALSE, NONE, 0, 0 }, /*85*/ { "", FALSE, NONE, 0, 0 }, /*86*/ { "", FALSE, NONE, 0, 0 }, /*87*/ { "", FALSE, NONE, 0, 0 }, /*88*/ { "", FALSE, NONE, 0, 0 }, /*89*/ { "", FALSE, NONE, 0, 0 }, /*8a*/ { "", FALSE, NONE, 0, 0 }, /*8b*/ { "", FALSE, NONE, 0, 0 }, /*8c*/ { "", FALSE, NONE, 0, 0 }, /*8d*/ { "", FALSE, NONE, 0, 0 }, /*8e*/ { "", FALSE, NONE, 0, 0 }, /*8f*/ { "", FALSE, NONE, 0, 0 }, }; static const struct inst * const db_inst_0f38[] = { 0, 0, 0, 0, 0, 0, 0, 0, db_inst_0f388x, 0, 0, 0, 0, 0, 0, 0 }; static const char * const db_Grp6[] = { "sldt", "str", "lldt", "ltr", "verr", "verw", "", "" }; static const char * const db_Grp7[] = { "sgdt", "sidt", "lgdt", "lidt", "smsw", "", "lmsw", "invlpg" }; static const char * const db_Grp8[] = { "", "", "", "", "bt", "bts", "btr", "btc" }; static const char * const db_Grp9[] = { "", "cmpxchg8b", "", "", "", "", "vmptrld", "vmptrst" }; static const char * const db_Grp15[] = { "fxsave", "fxrstor", "ldmxcsr", "stmxcsr", "xsave", "xrstor", "xsaveopt", "clflush" }; static const char * const db_Grp15b[] = { "", "", "", "", "", "lfence", "mfence", "sfence" }; static const struct inst db_inst_0f0x[] = { /*00*/ { "", TRUE, NONE, op1(Ew), db_Grp6 }, /*01*/ { "", TRUE, NONE, op1(Ew), db_Grp7 }, /*02*/ { "lar", TRUE, LONG, op2(E,R), 0 }, /*03*/ { "lsl", TRUE, LONG, op2(E,R), 0 }, /*04*/ { "", FALSE, NONE, 0, 0 }, /*05*/ { "syscall",FALSE,NONE, 0, 0 }, /*06*/ { "clts", FALSE, NONE, 0, 0 }, /*07*/ { "sysret",FALSE, NONE, 0, 0 }, /*08*/ { "invd", FALSE, NONE, 0, 0 }, /*09*/ { "wbinvd",FALSE, NONE, 0, 0 }, /*0a*/ { "", FALSE, NONE, 0, 0 }, /*0b*/ { "", FALSE, NONE, 0, 0 }, /*0c*/ { "", FALSE, NONE, 0, 0 }, /*0d*/ { "", FALSE, NONE, 0, 0 }, /*0e*/ { "", FALSE, NONE, 0, 0 }, /*0f*/ { "", FALSE, NONE, 0, 0 }, }; static const struct inst db_inst_0f1x[] = { /*10*/ { "", FALSE, NONE, 0, 0 }, /*11*/ { "", FALSE, NONE, 0, 0 }, /*12*/ { "", FALSE, NONE, 0, 0 }, /*13*/ { "", FALSE, NONE, 0, 0 }, /*14*/ { "", FALSE, NONE, 0, 0 }, /*15*/ { "", FALSE, NONE, 0, 0 }, /*16*/ { "", FALSE, NONE, 0, 0 }, /*17*/ { "", FALSE, NONE, 0, 0 }, /*18*/ { "", FALSE, NONE, 0, 0 }, /*19*/ { "", FALSE, NONE, 0, 0 }, /*1a*/ { "", FALSE, NONE, 0, 0 }, /*1b*/ { "", FALSE, NONE, 0, 0 }, /*1c*/ { "", FALSE, NONE, 0, 0 }, /*1d*/ { "", FALSE, NONE, 0, 0 }, /*1e*/ { "", FALSE, NONE, 0, 0 }, /*1f*/ { "nopl", TRUE, SDEP, 0, "nopw" }, }; static const struct inst db_inst_0f2x[] = { /*20*/ { "mov", TRUE, LONG, op2(CR,El), 0 }, /*21*/ { "mov", TRUE, LONG, op2(DR,El), 0 }, /*22*/ { "mov", TRUE, LONG, op2(El,CR), 0 }, /*23*/ { "mov", TRUE, LONG, op2(El,DR), 0 }, /*24*/ { "mov", TRUE, LONG, op2(TR,El), 0 }, /*25*/ { "", FALSE, NONE, 0, 0 }, /*26*/ { "mov", TRUE, LONG, op2(El,TR), 0 }, /*27*/ { "", FALSE, NONE, 0, 0 }, /*28*/ { "", FALSE, NONE, 0, 0 }, /*29*/ { "", FALSE, NONE, 0, 0 }, /*2a*/ { "", FALSE, NONE, 0, 0 }, /*2b*/ { "", FALSE, NONE, 0, 0 }, /*2c*/ { "", FALSE, NONE, 0, 0 }, /*2d*/ { "", FALSE, NONE, 0, 0 }, /*2e*/ { "", FALSE, NONE, 0, 0 }, /*2f*/ { "", FALSE, NONE, 0, 0 }, }; static const struct 
inst db_inst_0f3x[] = { /*30*/ { "wrmsr", FALSE, NONE, 0, 0 }, /*31*/ { "rdtsc", FALSE, NONE, 0, 0 }, /*32*/ { "rdmsr", FALSE, NONE, 0, 0 }, /*33*/ { "rdpmc", FALSE, NONE, 0, 0 }, /*34*/ { "sysenter",FALSE,NONE, 0, 0 }, /*35*/ { "sysexit",FALSE,NONE, 0, 0 }, /*36*/ { "", FALSE, NONE, 0, 0 }, /*37*/ { "getsec",FALSE, NONE, 0, 0 }, /*38*/ { "", FALSE, ESC, 0, db_inst_0f38 }, /*39*/ { "", FALSE, NONE, 0, 0 }, /*3a*/ { "", FALSE, NONE, 0, 0 }, /*3b*/ { "", FALSE, NONE, 0, 0 }, /*3c*/ { "", FALSE, NONE, 0, 0 }, /*3d*/ { "", FALSE, NONE, 0, 0 }, /*3e*/ { "", FALSE, NONE, 0, 0 }, /*3f*/ { "", FALSE, NONE, 0, 0 }, }; static const struct inst db_inst_0f4x[] = { /*40*/ { "cmovo", TRUE, NONE, op2(E, R), 0 }, /*41*/ { "cmovno", TRUE, NONE, op2(E, R), 0 }, /*42*/ { "cmovb", TRUE, NONE, op2(E, R), 0 }, /*43*/ { "cmovnb", TRUE, NONE, op2(E, R), 0 }, /*44*/ { "cmovz", TRUE, NONE, op2(E, R), 0 }, /*45*/ { "cmovnz", TRUE, NONE, op2(E, R), 0 }, /*46*/ { "cmovbe", TRUE, NONE, op2(E, R), 0 }, /*47*/ { "cmovnbe",TRUE, NONE, op2(E, R), 0 }, /*48*/ { "cmovs", TRUE, NONE, op2(E, R), 0 }, /*49*/ { "cmovns", TRUE, NONE, op2(E, R), 0 }, /*4a*/ { "cmovp", TRUE, NONE, op2(E, R), 0 }, /*4b*/ { "cmovnp", TRUE, NONE, op2(E, R), 0 }, /*4c*/ { "cmovl", TRUE, NONE, op2(E, R), 0 }, /*4d*/ { "cmovnl", TRUE, NONE, op2(E, R), 0 }, /*4e*/ { "cmovle", TRUE, NONE, op2(E, R), 0 }, /*4f*/ { "cmovnle",TRUE, NONE, op2(E, R), 0 }, }; static const struct inst db_inst_0f7x[] = { /*70*/ { "", FALSE, NONE, 0, 0 }, /*71*/ { "", FALSE, NONE, 0, 0 }, /*72*/ { "", FALSE, NONE, 0, 0 }, /*73*/ { "", FALSE, NONE, 0, 0 }, /*74*/ { "", FALSE, NONE, 0, 0 }, /*75*/ { "", FALSE, NONE, 0, 0 }, /*76*/ { "", FALSE, NONE, 0, 0 }, /*77*/ { "", FALSE, NONE, 0, 0 }, /*78*/ { "vmread", TRUE, NONE, op2(Rq, E), 0 }, /*79*/ { "vmwrite",TRUE, NONE, op2(E, Rq), 0 }, /*7a*/ { "", FALSE, NONE, 0, 0 }, /*7b*/ { "", FALSE, NONE, 0, 0 }, /*7c*/ { "", FALSE, NONE, 0, 0 }, /*7d*/ { "", FALSE, NONE, 0, 0 }, /*7e*/ { "", FALSE, NONE, 0, 0 }, /*7f*/ { "", FALSE, NONE, 0, 0 }, }; static const struct inst db_inst_0f8x[] = { /*80*/ { "jo", FALSE, NONE, op1(Dl), 0 }, /*81*/ { "jno", FALSE, NONE, op1(Dl), 0 }, /*82*/ { "jb", FALSE, NONE, op1(Dl), 0 }, /*83*/ { "jnb", FALSE, NONE, op1(Dl), 0 }, /*84*/ { "jz", FALSE, NONE, op1(Dl), 0 }, /*85*/ { "jnz", FALSE, NONE, op1(Dl), 0 }, /*86*/ { "jbe", FALSE, NONE, op1(Dl), 0 }, /*87*/ { "jnbe", FALSE, NONE, op1(Dl), 0 }, /*88*/ { "js", FALSE, NONE, op1(Dl), 0 }, /*89*/ { "jns", FALSE, NONE, op1(Dl), 0 }, /*8a*/ { "jp", FALSE, NONE, op1(Dl), 0 }, /*8b*/ { "jnp", FALSE, NONE, op1(Dl), 0 }, /*8c*/ { "jl", FALSE, NONE, op1(Dl), 0 }, /*8d*/ { "jnl", FALSE, NONE, op1(Dl), 0 }, /*8e*/ { "jle", FALSE, NONE, op1(Dl), 0 }, /*8f*/ { "jnle", FALSE, NONE, op1(Dl), 0 }, }; static const struct inst db_inst_0f9x[] = { /*90*/ { "seto", TRUE, NONE, op1(Eb), 0 }, /*91*/ { "setno", TRUE, NONE, op1(Eb), 0 }, /*92*/ { "setb", TRUE, NONE, op1(Eb), 0 }, /*93*/ { "setnb", TRUE, NONE, op1(Eb), 0 }, /*94*/ { "setz", TRUE, NONE, op1(Eb), 0 }, /*95*/ { "setnz", TRUE, NONE, op1(Eb), 0 }, /*96*/ { "setbe", TRUE, NONE, op1(Eb), 0 }, /*97*/ { "setnbe",TRUE, NONE, op1(Eb), 0 }, /*98*/ { "sets", TRUE, NONE, op1(Eb), 0 }, /*99*/ { "setns", TRUE, NONE, op1(Eb), 0 }, /*9a*/ { "setp", TRUE, NONE, op1(Eb), 0 }, /*9b*/ { "setnp", TRUE, NONE, op1(Eb), 0 }, /*9c*/ { "setl", TRUE, NONE, op1(Eb), 0 }, /*9d*/ { "setnl", TRUE, NONE, op1(Eb), 0 }, /*9e*/ { "setle", TRUE, NONE, op1(Eb), 0 }, /*9f*/ { "setnle",TRUE, NONE, op1(Eb), 0 }, }; static const struct inst db_inst_0fax[] = { 
/*a0*/ { "push", FALSE, NONE, op1(Si), 0 }, /*a1*/ { "pop", FALSE, NONE, op1(Si), 0 }, /*a2*/ { "cpuid", FALSE, NONE, 0, 0 }, /*a3*/ { "bt", TRUE, LONG, op2(R,E), 0 }, /*a4*/ { "shld", TRUE, LONG, op3(Ib,R,E), 0 }, /*a5*/ { "shld", TRUE, LONG, op3(CL,R,E), 0 }, /*a6*/ { "", FALSE, NONE, 0, 0 }, /*a7*/ { "", FALSE, NONE, 0, 0 }, /*a8*/ { "push", FALSE, NONE, op1(Si), 0 }, /*a9*/ { "pop", FALSE, NONE, op1(Si), 0 }, /*aa*/ { "rsm", FALSE, NONE, 0, 0 }, /*ab*/ { "bts", TRUE, LONG, op2(R,E), 0 }, /*ac*/ { "shrd", TRUE, LONG, op3(Ib,R,E), 0 }, /*ad*/ { "shrd", TRUE, LONG, op3(CL,R,E), 0 }, /*ae*/ { "", TRUE, LONG, op1(E), db_Grp15 }, /*af*/ { "imul", TRUE, LONG, op2(E,R), 0 }, }; static const struct inst db_inst_0fbx[] = { /*b0*/ { "cmpxchg",TRUE, BYTE, op2(R, E), 0 }, /*b0*/ { "cmpxchg",TRUE, LONG, op2(R, E), 0 }, /*b2*/ { "lss", TRUE, LONG, op2(E, R), 0 }, /*b3*/ { "btr", TRUE, LONG, op2(R, E), 0 }, /*b4*/ { "lfs", TRUE, LONG, op2(E, R), 0 }, /*b5*/ { "lgs", TRUE, LONG, op2(E, R), 0 }, /*b6*/ { "movzb", TRUE, LONG, op2(Eb, R), 0 }, /*b7*/ { "movzw", TRUE, LONG, op2(Ew, R), 0 }, /*b8*/ { "", FALSE, NONE, 0, 0 }, /*b9*/ { "", FALSE, NONE, 0, 0 }, /*ba*/ { "", TRUE, LONG, op2(Ib, E), db_Grp8 }, /*bb*/ { "btc", TRUE, LONG, op2(R, E), 0 }, /*bc*/ { "bsf", TRUE, LONG, op2(E, R), 0 }, /*bd*/ { "bsr", TRUE, LONG, op2(E, R), 0 }, /*be*/ { "movsb", TRUE, LONG, op2(Eb, R), 0 }, /*bf*/ { "movsw", TRUE, LONG, op2(Ew, R), 0 }, }; static const struct inst db_inst_0fcx[] = { /*c0*/ { "xadd", TRUE, BYTE, op2(R, E), 0 }, /*c1*/ { "xadd", TRUE, LONG, op2(R, E), 0 }, /*c2*/ { "", FALSE, NONE, 0, 0 }, /*c3*/ { "", FALSE, NONE, 0, 0 }, /*c4*/ { "", FALSE, NONE, 0, 0 }, /*c5*/ { "", FALSE, NONE, 0, 0 }, /*c6*/ { "", FALSE, NONE, 0, 0 }, /*c7*/ { "", TRUE, NONE, op1(E), db_Grp9 }, /*c8*/ { "bswap", FALSE, LONG, op1(Ril), 0 }, /*c9*/ { "bswap", FALSE, LONG, op1(Ril), 0 }, /*ca*/ { "bswap", FALSE, LONG, op1(Ril), 0 }, /*cb*/ { "bswap", FALSE, LONG, op1(Ril), 0 }, /*cc*/ { "bswap", FALSE, LONG, op1(Ril), 0 }, /*cd*/ { "bswap", FALSE, LONG, op1(Ril), 0 }, /*ce*/ { "bswap", FALSE, LONG, op1(Ril), 0 }, /*cf*/ { "bswap", FALSE, LONG, op1(Ril), 0 }, }; static const struct inst * const db_inst_0f[] = { db_inst_0f0x, db_inst_0f1x, db_inst_0f2x, db_inst_0f3x, db_inst_0f4x, 0, 0, db_inst_0f7x, db_inst_0f8x, db_inst_0f9x, db_inst_0fax, db_inst_0fbx, db_inst_0fcx, 0, 0, 0 }; static const char * const db_Esc92[] = { "fnop", "", "", "", "", "", "", "" }; static const char * const db_Esc94[] = { "fchs", "fabs", "", "", "ftst", "fxam", "", "" }; static const char * const db_Esc95[] = { "fld1", "fldl2t","fldl2e","fldpi","fldlg2","fldln2","fldz","" }; static const char * const db_Esc96[] = { "f2xm1","fyl2x","fptan","fpatan","fxtract","fprem1","fdecstp", "fincstp" }; static const char * const db_Esc97[] = { "fprem","fyl2xp1","fsqrt","fsincos","frndint","fscale","fsin","fcos" }; static const char * const db_Esca5[] = { "", "fucompp","", "", "", "", "", "" }; static const char * const db_Escb4[] = { "fneni","fndisi", "fnclex","fninit","fsetpm", "", "", "" }; static const char * const db_Esce3[] = { "", "fcompp","", "", "", "", "", "" }; static const char * const db_Escf4[] = { "fnstsw","", "", "", "", "", "", "" }; static const struct finst db_Esc8[] = { /*0*/ { "fadd", SNGL, op2(STI,ST), 0 }, /*1*/ { "fmul", SNGL, op2(STI,ST), 0 }, /*2*/ { "fcom", SNGL, op2(STI,ST), 0 }, /*3*/ { "fcomp", SNGL, op2(STI,ST), 0 }, /*4*/ { "fsub", SNGL, op2(STI,ST), 0 }, /*5*/ { "fsubr", SNGL, op2(STI,ST), 0 }, /*6*/ { "fdiv", SNGL, op2(STI,ST), 0 }, /*7*/ { 
"fdivr", SNGL, op2(STI,ST), 0 }, }; static const struct finst db_Esc9[] = { /*0*/ { "fld", SNGL, op1(STI), 0 }, /*1*/ { "", NONE, op1(STI), "fxch" }, /*2*/ { "fst", SNGL, op1(X), db_Esc92 }, /*3*/ { "fstp", SNGL, 0, 0 }, /*4*/ { "fldenv", NONE, op1(X), db_Esc94 }, /*5*/ { "fldcw", NONE, op1(X), db_Esc95 }, /*6*/ { "fnstenv",NONE, op1(X), db_Esc96 }, /*7*/ { "fnstcw", NONE, op1(X), db_Esc97 }, }; static const struct finst db_Esca[] = { /*0*/ { "fiadd", LONG, 0, 0 }, /*1*/ { "fimul", LONG, 0, 0 }, /*2*/ { "ficom", LONG, 0, 0 }, /*3*/ { "ficomp", LONG, 0, 0 }, /*4*/ { "fisub", LONG, 0, 0 }, /*5*/ { "fisubr", LONG, op1(X), db_Esca5 }, /*6*/ { "fidiv", LONG, 0, 0 }, /*7*/ { "fidivr", LONG, 0, 0 } }; static const struct finst db_Escb[] = { /*0*/ { "fild", LONG, 0, 0 }, /*1*/ { "", NONE, 0, 0 }, /*2*/ { "fist", LONG, 0, 0 }, /*3*/ { "fistp", LONG, 0, 0 }, /*4*/ { "", WORD, op1(X), db_Escb4 }, /*5*/ { "fld", EXTR, 0, 0 }, /*6*/ { "", WORD, 0, 0 }, /*7*/ { "fstp", EXTR, 0, 0 }, }; static const struct finst db_Escc[] = { /*0*/ { "fadd", DBLR, op2(ST,STI), 0 }, /*1*/ { "fmul", DBLR, op2(ST,STI), 0 }, /*2*/ { "fcom", DBLR, 0, 0 }, /*3*/ { "fcomp", DBLR, 0, 0 }, /*4*/ { "fsub", DBLR, op2(ST,STI), "fsubr" }, /*5*/ { "fsubr", DBLR, op2(ST,STI), "fsub" }, /*6*/ { "fdiv", DBLR, op2(ST,STI), "fdivr" }, /*7*/ { "fdivr", DBLR, op2(ST,STI), "fdiv" }, }; static const struct finst db_Escd[] = { /*0*/ { "fld", DBLR, op1(STI), "ffree" }, /*1*/ { "", NONE, 0, 0 }, /*2*/ { "fst", DBLR, op1(STI), 0 }, /*3*/ { "fstp", DBLR, op1(STI), 0 }, /*4*/ { "frstor", NONE, op1(STI), "fucom" }, /*5*/ { "", NONE, op1(STI), "fucomp" }, /*6*/ { "fnsave", NONE, 0, 0 }, /*7*/ { "fnstsw", NONE, 0, 0 }, }; static const struct finst db_Esce[] = { /*0*/ { "fiadd", WORD, op2(ST,STI), "faddp" }, /*1*/ { "fimul", WORD, op2(ST,STI), "fmulp" }, /*2*/ { "ficom", WORD, 0, 0 }, /*3*/ { "ficomp", WORD, op1(X), db_Esce3 }, /*4*/ { "fisub", WORD, op2(ST,STI), "fsubrp" }, /*5*/ { "fisubr", WORD, op2(ST,STI), "fsubp" }, /*6*/ { "fidiv", WORD, op2(ST,STI), "fdivrp" }, /*7*/ { "fidivr", WORD, op2(ST,STI), "fdivp" }, }; static const struct finst db_Escf[] = { /*0*/ { "fild", WORD, 0, 0 }, /*1*/ { "", NONE, 0, 0 }, /*2*/ { "fist", WORD, 0, 0 }, /*3*/ { "fistp", WORD, 0, 0 }, /*4*/ { "fbld", NONE, op1(XA), db_Escf4 }, /*5*/ { "fild", QUAD, 0, 0 }, /*6*/ { "fbstp", NONE, 0, 0 }, /*7*/ { "fistp", QUAD, 0, 0 }, }; static const struct finst * const db_Esc_inst[] = { db_Esc8, db_Esc9, db_Esca, db_Escb, db_Escc, db_Escd, db_Esce, db_Escf }; static const char * const db_Grp1[] = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" }; static const char * const db_Grp2[] = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" }; static const struct inst db_Grp3[] = { { "test", TRUE, NONE, op2(I,E), 0 }, { "test", TRUE, NONE, op2(I,E), 0 }, { "not", TRUE, NONE, op1(E), 0 }, { "neg", TRUE, NONE, op1(E), 0 }, { "mul", TRUE, NONE, op2(E,A), 0 }, { "imul", TRUE, NONE, op2(E,A), 0 }, { "div", TRUE, NONE, op2(E,A), 0 }, { "idiv", TRUE, NONE, op2(E,A), 0 }, }; static const struct inst db_Grp4[] = { { "inc", TRUE, BYTE, op1(E), 0 }, { "dec", TRUE, BYTE, op1(E), 0 }, { "", TRUE, NONE, 0, 0 }, { "", TRUE, NONE, 0, 0 }, { "", TRUE, NONE, 0, 0 }, { "", TRUE, NONE, 0, 0 }, { "", TRUE, NONE, 0, 0 }, { "", TRUE, NONE, 0, 0 } }; static const struct inst db_Grp5[] = { { "inc", TRUE, LONG, op1(E), 0 }, { "dec", TRUE, LONG, op1(E), 0 }, { "call", TRUE, LONG, op1(Eind),0 }, { "lcall", TRUE, LONG, op1(Eind),0 }, { "jmp", TRUE, LONG, op1(Eind),0 }, { "ljmp", TRUE, LONG, op1(Eind),0 
}, { "push", TRUE, LONG, op1(E), 0 }, { "", TRUE, NONE, 0, 0 } }; static const struct inst db_Grp9b[] = { { "", TRUE, NONE, 0, 0 }, { "", TRUE, NONE, 0, 0 }, { "", TRUE, NONE, 0, 0 }, { "", TRUE, NONE, 0, 0 }, { "", TRUE, NONE, 0, 0 }, { "", TRUE, NONE, 0, 0 }, { "rdrand",TRUE, LONG, op1(Rv), 0 }, { "rdseed",TRUE, LONG, op1(Rv), 0 } }; static const struct inst db_inst_table[256] = { /*00*/ { "add", TRUE, BYTE, op2(R, E), 0 }, /*01*/ { "add", TRUE, LONG, op2(R, E), 0 }, /*02*/ { "add", TRUE, BYTE, op2(E, R), 0 }, /*03*/ { "add", TRUE, LONG, op2(E, R), 0 }, /*04*/ { "add", FALSE, BYTE, op2(I, A), 0 }, /*05*/ { "add", FALSE, LONG, op2(Is, A), 0 }, /*06*/ { "push", FALSE, NONE, op1(Si), 0 }, /*07*/ { "pop", FALSE, NONE, op1(Si), 0 }, /*08*/ { "or", TRUE, BYTE, op2(R, E), 0 }, /*09*/ { "or", TRUE, LONG, op2(R, E), 0 }, /*0a*/ { "or", TRUE, BYTE, op2(E, R), 0 }, /*0b*/ { "or", TRUE, LONG, op2(E, R), 0 }, /*0c*/ { "or", FALSE, BYTE, op2(I, A), 0 }, /*0d*/ { "or", FALSE, LONG, op2(I, A), 0 }, /*0e*/ { "push", FALSE, NONE, op1(Si), 0 }, /*0f*/ { "", FALSE, ESC, 0, db_inst_0f }, /*10*/ { "adc", TRUE, BYTE, op2(R, E), 0 }, /*11*/ { "adc", TRUE, LONG, op2(R, E), 0 }, /*12*/ { "adc", TRUE, BYTE, op2(E, R), 0 }, /*13*/ { "adc", TRUE, LONG, op2(E, R), 0 }, /*14*/ { "adc", FALSE, BYTE, op2(I, A), 0 }, /*15*/ { "adc", FALSE, LONG, op2(Is, A), 0 }, /*16*/ { "push", FALSE, NONE, op1(Si), 0 }, /*17*/ { "pop", FALSE, NONE, op1(Si), 0 }, /*18*/ { "sbb", TRUE, BYTE, op2(R, E), 0 }, /*19*/ { "sbb", TRUE, LONG, op2(R, E), 0 }, /*1a*/ { "sbb", TRUE, BYTE, op2(E, R), 0 }, /*1b*/ { "sbb", TRUE, LONG, op2(E, R), 0 }, /*1c*/ { "sbb", FALSE, BYTE, op2(I, A), 0 }, /*1d*/ { "sbb", FALSE, LONG, op2(Is, A), 0 }, /*1e*/ { "push", FALSE, NONE, op1(Si), 0 }, /*1f*/ { "pop", FALSE, NONE, op1(Si), 0 }, /*20*/ { "and", TRUE, BYTE, op2(R, E), 0 }, /*21*/ { "and", TRUE, LONG, op2(R, E), 0 }, /*22*/ { "and", TRUE, BYTE, op2(E, R), 0 }, /*23*/ { "and", TRUE, LONG, op2(E, R), 0 }, /*24*/ { "and", FALSE, BYTE, op2(I, A), 0 }, /*25*/ { "and", FALSE, LONG, op2(I, A), 0 }, /*26*/ { "", FALSE, NONE, 0, 0 }, /*27*/ { "daa", FALSE, NONE, 0, 0 }, /*28*/ { "sub", TRUE, BYTE, op2(R, E), 0 }, /*29*/ { "sub", TRUE, LONG, op2(R, E), 0 }, /*2a*/ { "sub", TRUE, BYTE, op2(E, R), 0 }, /*2b*/ { "sub", TRUE, LONG, op2(E, R), 0 }, /*2c*/ { "sub", FALSE, BYTE, op2(I, A), 0 }, /*2d*/ { "sub", FALSE, LONG, op2(Is, A), 0 }, /*2e*/ { "", FALSE, NONE, 0, 0 }, /*2f*/ { "das", FALSE, NONE, 0, 0 }, /*30*/ { "xor", TRUE, BYTE, op2(R, E), 0 }, /*31*/ { "xor", TRUE, LONG, op2(R, E), 0 }, /*32*/ { "xor", TRUE, BYTE, op2(E, R), 0 }, /*33*/ { "xor", TRUE, LONG, op2(E, R), 0 }, /*34*/ { "xor", FALSE, BYTE, op2(I, A), 0 }, /*35*/ { "xor", FALSE, LONG, op2(I, A), 0 }, /*36*/ { "", FALSE, NONE, 0, 0 }, /*37*/ { "aaa", FALSE, NONE, 0, 0 }, /*38*/ { "cmp", TRUE, BYTE, op2(R, E), 0 }, /*39*/ { "cmp", TRUE, LONG, op2(R, E), 0 }, /*3a*/ { "cmp", TRUE, BYTE, op2(E, R), 0 }, /*3b*/ { "cmp", TRUE, LONG, op2(E, R), 0 }, /*3c*/ { "cmp", FALSE, BYTE, op2(I, A), 0 }, /*3d*/ { "cmp", FALSE, LONG, op2(Is, A), 0 }, /*3e*/ { "", FALSE, NONE, 0, 0 }, /*3f*/ { "aas", FALSE, NONE, 0, 0 }, /*40*/ { "rex", FALSE, NONE, 0, 0 }, /*41*/ { "rex.b", FALSE, NONE, 0, 0 }, /*42*/ { "rex.x", FALSE, NONE, 0, 0 }, /*43*/ { "rex.xb", FALSE, NONE, 0, 0 }, /*44*/ { "rex.r", FALSE, NONE, 0, 0 }, /*45*/ { "rex.rb", FALSE, NONE, 0, 0 }, /*46*/ { "rex.rx", FALSE, NONE, 0, 0 }, /*47*/ { "rex.rxb", FALSE, NONE, 0, 0 }, /*48*/ { "rex.w", FALSE, NONE, 0, 0 }, /*49*/ { "rex.wb", FALSE, NONE, 0, 0 }, /*4a*/ { 
"rex.wx", FALSE, NONE, 0, 0 }, /*4b*/ { "rex.wxb", FALSE, NONE, 0, 0 }, /*4c*/ { "rex.wr", FALSE, NONE, 0, 0 }, /*4d*/ { "rex.wrb", FALSE, NONE, 0, 0 }, /*4e*/ { "rex.wrx", FALSE, NONE, 0, 0 }, /*4f*/ { "rex.wrxb", FALSE, NONE, 0, 0 }, /*50*/ { "push", FALSE, LONG, op1(Ri), 0 }, /*51*/ { "push", FALSE, LONG, op1(Ri), 0 }, /*52*/ { "push", FALSE, LONG, op1(Ri), 0 }, /*53*/ { "push", FALSE, LONG, op1(Ri), 0 }, /*54*/ { "push", FALSE, LONG, op1(Ri), 0 }, /*55*/ { "push", FALSE, LONG, op1(Ri), 0 }, /*56*/ { "push", FALSE, LONG, op1(Ri), 0 }, /*57*/ { "push", FALSE, LONG, op1(Ri), 0 }, /*58*/ { "pop", FALSE, LONG, op1(Ri), 0 }, /*59*/ { "pop", FALSE, LONG, op1(Ri), 0 }, /*5a*/ { "pop", FALSE, LONG, op1(Ri), 0 }, /*5b*/ { "pop", FALSE, LONG, op1(Ri), 0 }, /*5c*/ { "pop", FALSE, LONG, op1(Ri), 0 }, /*5d*/ { "pop", FALSE, LONG, op1(Ri), 0 }, /*5e*/ { "pop", FALSE, LONG, op1(Ri), 0 }, /*5f*/ { "pop", FALSE, LONG, op1(Ri), 0 }, /*60*/ { "pusha", FALSE, LONG, 0, 0 }, /*61*/ { "popa", FALSE, LONG, 0, 0 }, /*62*/ { "bound", TRUE, LONG, op2(E, R), 0 }, /*63*/ { "movslq", TRUE, NONE, op2(EL,R), 0 }, /*64*/ { "", FALSE, NONE, 0, 0 }, /*65*/ { "", FALSE, NONE, 0, 0 }, /*66*/ { "", FALSE, NONE, 0, 0 }, /*67*/ { "", FALSE, NONE, 0, 0 }, /*68*/ { "push", FALSE, LONG, op1(I), 0 }, /*69*/ { "imul", TRUE, LONG, op3(I,E,R), 0 }, /*6a*/ { "push", FALSE, LONG, op1(Ibs), 0 }, /*6b*/ { "imul", TRUE, LONG, op3(Ibs,E,R),0 }, /*6c*/ { "ins", FALSE, BYTE, op2(DX, DI), 0 }, /*6d*/ { "ins", FALSE, LONG, op2(DX, DI), 0 }, /*6e*/ { "outs", FALSE, BYTE, op2(SI, DX), 0 }, /*6f*/ { "outs", FALSE, LONG, op2(SI, DX), 0 }, /*70*/ { "jo", FALSE, NONE, op1(Db), 0 }, /*71*/ { "jno", FALSE, NONE, op1(Db), 0 }, /*72*/ { "jb", FALSE, NONE, op1(Db), 0 }, /*73*/ { "jnb", FALSE, NONE, op1(Db), 0 }, /*74*/ { "jz", FALSE, NONE, op1(Db), 0 }, /*75*/ { "jnz", FALSE, NONE, op1(Db), 0 }, /*76*/ { "jbe", FALSE, NONE, op1(Db), 0 }, /*77*/ { "jnbe", FALSE, NONE, op1(Db), 0 }, /*78*/ { "js", FALSE, NONE, op1(Db), 0 }, /*79*/ { "jns", FALSE, NONE, op1(Db), 0 }, /*7a*/ { "jp", FALSE, NONE, op1(Db), 0 }, /*7b*/ { "jnp", FALSE, NONE, op1(Db), 0 }, /*7c*/ { "jl", FALSE, NONE, op1(Db), 0 }, /*7d*/ { "jnl", FALSE, NONE, op1(Db), 0 }, /*7e*/ { "jle", FALSE, NONE, op1(Db), 0 }, /*7f*/ { "jnle", FALSE, NONE, op1(Db), 0 }, /*80*/ { "", TRUE, BYTE, op2(I, E), db_Grp1 }, /*81*/ { "", TRUE, LONG, op2(I, E), db_Grp1 }, /*82*/ { "", TRUE, BYTE, op2(I, E), db_Grp1 }, /*83*/ { "", TRUE, LONG, op2(Ibs,E), db_Grp1 }, /*84*/ { "test", TRUE, BYTE, op2(R, E), 0 }, /*85*/ { "test", TRUE, LONG, op2(R, E), 0 }, /*86*/ { "xchg", TRUE, BYTE, op2(R, E), 0 }, /*87*/ { "xchg", TRUE, LONG, op2(R, E), 0 }, /*88*/ { "mov", TRUE, BYTE, op2(R, E), 0 }, /*89*/ { "mov", TRUE, LONG, op2(R, E), 0 }, /*8a*/ { "mov", TRUE, BYTE, op2(E, R), 0 }, /*8b*/ { "mov", TRUE, LONG, op2(E, R), 0 }, /*8c*/ { "mov", TRUE, NONE, op2(S, Ew), 0 }, /*8d*/ { "lea", TRUE, LONG, op2(E, R), 0 }, /*8e*/ { "mov", TRUE, NONE, op2(Ew, S), 0 }, /*8f*/ { "pop", TRUE, LONG, op1(E), 0 }, /*90*/ { "nop", FALSE, NONE, 0, 0 }, /*91*/ { "xchg", FALSE, LONG, op2(A, Ri), 0 }, /*92*/ { "xchg", FALSE, LONG, op2(A, Ri), 0 }, /*93*/ { "xchg", FALSE, LONG, op2(A, Ri), 0 }, /*94*/ { "xchg", FALSE, LONG, op2(A, Ri), 0 }, /*95*/ { "xchg", FALSE, LONG, op2(A, Ri), 0 }, /*96*/ { "xchg", FALSE, LONG, op2(A, Ri), 0 }, /*97*/ { "xchg", FALSE, LONG, op2(A, Ri), 0 }, /*98*/ { "cwde", FALSE, SDEP, 0, "cbw" }, /*99*/ { "cdq", FALSE, SDEP, 0, "cwd" }, /*9a*/ { "lcall", FALSE, NONE, op1(OS), 0 }, /*9b*/ { "wait", FALSE, NONE, 0, 0 }, /*9c*/ { 
"pushf", FALSE, LONG, 0, 0 }, /*9d*/ { "popf", FALSE, LONG, 0, 0 }, /*9e*/ { "sahf", FALSE, NONE, 0, 0 }, /*9f*/ { "lahf", FALSE, NONE, 0, 0 }, /*a0*/ { "mov", FALSE, BYTE, op2(O, A), 0 }, /*a1*/ { "mov", FALSE, LONG, op2(O, A), 0 }, /*a2*/ { "mov", FALSE, BYTE, op2(A, O), 0 }, /*a3*/ { "mov", FALSE, LONG, op2(A, O), 0 }, /*a4*/ { "movs", FALSE, BYTE, op2(SI,DI), 0 }, /*a5*/ { "movs", FALSE, LONG, op2(SI,DI), 0 }, /*a6*/ { "cmps", FALSE, BYTE, op2(SI,DI), 0 }, /*a7*/ { "cmps", FALSE, LONG, op2(SI,DI), 0 }, /*a8*/ { "test", FALSE, BYTE, op2(I, A), 0 }, /*a9*/ { "test", FALSE, LONG, op2(I, A), 0 }, /*aa*/ { "stos", FALSE, BYTE, op1(DI), 0 }, /*ab*/ { "stos", FALSE, LONG, op1(DI), 0 }, /*ac*/ { "lods", FALSE, BYTE, op1(SI), 0 }, /*ad*/ { "lods", FALSE, LONG, op1(SI), 0 }, /*ae*/ { "scas", FALSE, BYTE, op1(SI), 0 }, /*af*/ { "scas", FALSE, LONG, op1(SI), 0 }, /*b0*/ { "mov", FALSE, BYTE, op2(I, Ri), 0 }, /*b1*/ { "mov", FALSE, BYTE, op2(I, Ri), 0 }, /*b2*/ { "mov", FALSE, BYTE, op2(I, Ri), 0 }, /*b3*/ { "mov", FALSE, BYTE, op2(I, Ri), 0 }, /*b4*/ { "mov", FALSE, BYTE, op2(I, Ri), 0 }, /*b5*/ { "mov", FALSE, BYTE, op2(I, Ri), 0 }, /*b6*/ { "mov", FALSE, BYTE, op2(I, Ri), 0 }, /*b7*/ { "mov", FALSE, BYTE, op2(I, Ri), 0 }, /*b8*/ { "mov", FALSE, LONG, op2(Ilq, Ri), 0 }, /*b9*/ { "mov", FALSE, LONG, op2(Ilq, Ri), 0 }, /*ba*/ { "mov", FALSE, LONG, op2(Ilq, Ri), 0 }, /*bb*/ { "mov", FALSE, LONG, op2(Ilq, Ri), 0 }, /*bc*/ { "mov", FALSE, LONG, op2(Ilq, Ri), 0 }, /*bd*/ { "mov", FALSE, LONG, op2(Ilq, Ri), 0 }, /*be*/ { "mov", FALSE, LONG, op2(Ilq, Ri), 0 }, /*bf*/ { "mov", FALSE, LONG, op2(Ilq, Ri), 0 }, /*c0*/ { "", TRUE, BYTE, op2(Ib, E), db_Grp2 }, /*c1*/ { "", TRUE, LONG, op2(Ib, E), db_Grp2 }, /*c2*/ { "ret", FALSE, NONE, op1(Iw), 0 }, /*c3*/ { "ret", FALSE, NONE, 0, 0 }, /*c4*/ { "les", TRUE, LONG, op2(E, R), 0 }, /*c5*/ { "lds", TRUE, LONG, op2(E, R), 0 }, /*c6*/ { "mov", TRUE, BYTE, op2(I, E), 0 }, /*c7*/ { "mov", TRUE, LONG, op2(I, E), 0 }, /*c8*/ { "enter", FALSE, NONE, op2(Iw, Ib), 0 }, /*c9*/ { "leave", FALSE, NONE, 0, 0 }, /*ca*/ { "lret", FALSE, NONE, op1(Iw), 0 }, /*cb*/ { "lret", FALSE, NONE, 0, 0 }, /*cc*/ { "int", FALSE, NONE, op1(o3), 0 }, /*cd*/ { "int", FALSE, NONE, op1(Ib), 0 }, /*ce*/ { "into", FALSE, NONE, 0, 0 }, /*cf*/ { "iret", FALSE, NONE, 0, 0 }, /*d0*/ { "", TRUE, BYTE, op2(o1, E), db_Grp2 }, /*d1*/ { "", TRUE, LONG, op2(o1, E), db_Grp2 }, /*d2*/ { "", TRUE, BYTE, op2(CL, E), db_Grp2 }, /*d3*/ { "", TRUE, LONG, op2(CL, E), db_Grp2 }, /*d4*/ { "aam", FALSE, NONE, op1(Iba), 0 }, /*d5*/ { "aad", FALSE, NONE, op1(Iba), 0 }, /*d6*/ { ".byte\t0xd6", FALSE, NONE, 0, 0 }, /*d7*/ { "xlat", FALSE, BYTE, op1(BX), 0 }, /*d8*/ { "", TRUE, NONE, 0, db_Esc8 }, /*d9*/ { "", TRUE, NONE, 0, db_Esc9 }, /*da*/ { "", TRUE, NONE, 0, db_Esca }, /*db*/ { "", TRUE, NONE, 0, db_Escb }, /*dc*/ { "", TRUE, NONE, 0, db_Escc }, /*dd*/ { "", TRUE, NONE, 0, db_Escd }, /*de*/ { "", TRUE, NONE, 0, db_Esce }, /*df*/ { "", TRUE, NONE, 0, db_Escf }, /*e0*/ { "loopne",FALSE, NONE, op1(Db), 0 }, /*e1*/ { "loope", FALSE, NONE, op1(Db), 0 }, /*e2*/ { "loop", FALSE, NONE, op1(Db), 0 }, /*e3*/ { "jrcxz", FALSE, ADEP, op1(Db), "jecxz" }, /*e4*/ { "in", FALSE, BYTE, op2(Ib, A), 0 }, /*e5*/ { "in", FALSE, LONG, op2(Ib, A) , 0 }, /*e6*/ { "out", FALSE, BYTE, op2(A, Ib), 0 }, /*e7*/ { "out", FALSE, LONG, op2(A, Ib) , 0 }, /*e8*/ { "call", FALSE, NONE, op1(Dl), 0 }, /*e9*/ { "jmp", FALSE, NONE, op1(Dl), 0 }, /*ea*/ { "ljmp", FALSE, NONE, op1(OS), 0 }, /*eb*/ { "jmp", FALSE, NONE, op1(Db), 0 }, /*ec*/ { "in", FALSE, 
BYTE, op2(DX, A), 0 }, /*ed*/ { "in", FALSE, LONG, op2(DX, A) , 0 }, /*ee*/ { "out", FALSE, BYTE, op2(A, DX), 0 }, /*ef*/ { "out", FALSE, LONG, op2(A, DX) , 0 }, /*f0*/ { "", FALSE, NONE, 0, 0 }, /*f1*/ { ".byte\t0xf1", FALSE, NONE, 0, 0 }, /*f2*/ { "", FALSE, NONE, 0, 0 }, /*f3*/ { "", FALSE, NONE, 0, 0 }, /*f4*/ { "hlt", FALSE, NONE, 0, 0 }, /*f5*/ { "cmc", FALSE, NONE, 0, 0 }, /*f6*/ { "", TRUE, BYTE, 0, db_Grp3 }, /*f7*/ { "", TRUE, LONG, 0, db_Grp3 }, /*f8*/ { "clc", FALSE, NONE, 0, 0 }, /*f9*/ { "stc", FALSE, NONE, 0, 0 }, /*fa*/ { "cli", FALSE, NONE, 0, 0 }, /*fb*/ { "sti", FALSE, NONE, 0, 0 }, /*fc*/ { "cld", FALSE, NONE, 0, 0 }, /*fd*/ { "std", FALSE, NONE, 0, 0 }, /*fe*/ { "", TRUE, NONE, 0, db_Grp4 }, /*ff*/ { "", TRUE, NONE, 0, db_Grp5 }, }; static const struct inst db_bad_inst = { "???", FALSE, NONE, 0, 0 } ; #define f_mod(rex, byte) ((byte)>>6) #define f_reg(rex, byte) ((((byte)>>3)&0x7) | (rex & REX_R ? 0x8 : 0x0)) #define f_rm(rex, byte) (((byte)&0x7) | (rex & REX_B ? 0x8 : 0x0)) #define sib_ss(rex, byte) ((byte)>>6) #define sib_index(rex, byte) ((((byte)>>3)&0x7) | (rex & REX_X ? 0x8 : 0x0)) #define sib_base(rex, byte) (((byte)&0x7) | (rex & REX_B ? 0x8 : 0x0)) struct i_addr { int is_reg; /* if reg, reg number is in 'disp' */ int disp; const char * base; const char * index; int ss; }; static const char * const db_reg[2][4][16] = { {{"%al", "%cl", "%dl", "%bl", "%ah", "%ch", "%dh", "%bh", "%r8b", "%r9b", "%r10b", "%r11b", "%r12b", "%r13b", "%r14b", "%r15b" }, { "%ax", "%cx", "%dx", "%bx", "%sp", "%bp", "%si", "%di", "%r8w", "%r9w", "%r10w", "%r11w", "%r12w", "%r13w", "%r14w", "%r15w" }, { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi", "%r8d", "%r9d", "%r10d", "%r11d", "%r12d", "%r13d", "%r14d", "%r15d" }, { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" }}, {{"%al", "%cl", "%dl", "%bl", "%spl", "%bpl", "%sil", "%dil", "%r8b", "%r9b", "%r10b", "%r11b", "%r12b", "%r13b", "%r14b", "%r15b" }, { "%ax", "%cx", "%dx", "%bx", "%sp", "%bp", "%si", "%di", "%r8w", "%r9w", "%r10w", "%r11w", "%r12w", "%r13w", "%r14w", "%r15w" }, { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi", "%r8d", "%r9d", "%r10d", "%r11d", "%r12d", "%r13d", "%r14d", "%r15d" }, { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" }} }; static const char * const db_seg_reg[8] = { "%es", "%cs", "%ss", "%ds", "%fs", "%gs", "", "" }; /* * lengths for size attributes */ static const int db_lengths[] = { 1, /* BYTE */ 2, /* WORD */ 4, /* LONG */ 8, /* QUAD */ 4, /* SNGL */ 8, /* DBLR */ 10, /* EXTR */ }; #define get_value_inc(result, loc, size, is_signed) \ result = db_get_value((loc), (size), (is_signed)); \ (loc) += (size); static db_addr_t db_disasm_esc(db_addr_t loc, int inst, int rex, int short_addr, int size, const char *seg); static void db_print_address(const char *seg, int size, int rex, struct i_addr *addrp); static db_addr_t db_read_address(db_addr_t loc, int short_addr, int rex, int regmodrm, struct i_addr *addrp); /* * Read address at location and return updated location. 
 */
static db_addr_t
db_read_address(loc, short_addr, rex, regmodrm, addrp)
    db_addr_t       loc;
    int             short_addr;
    int             rex;
    int             regmodrm;
    struct i_addr * addrp;          /* out */
{
    int mod, rm, sib, index, disp, size, have_sib;

    mod = f_mod(rex, regmodrm);
    rm  = f_rm(rex, regmodrm);

    if (mod == 3) {
        addrp->is_reg = TRUE;
        addrp->disp = rm;
        return (loc);
    }
    addrp->is_reg = FALSE;
-   addrp->index = 0;
+   addrp->index = NULL;

    if (short_addr)
        size = LONG;
    else
        size = QUAD;

    if ((rm & 0x7) == 4) {
        get_value_inc(sib, loc, 1, FALSE);
        rm = sib_base(rex, sib);
        index = sib_index(rex, sib);
        if (index != 4)
            addrp->index = db_reg[1][size][index];
        addrp->ss = sib_ss(rex, sib);
        have_sib = 1;
    } else
        have_sib = 0;

    switch (mod) {
        case 0:
            if (rm == 5) {
                get_value_inc(addrp->disp, loc, 4, FALSE);
                if (have_sib)
-                   addrp->base = 0;
+                   addrp->base = NULL;
                else if (short_addr)
                    addrp->base = "%eip";
                else
                    addrp->base = "%rip";
            } else {
                addrp->disp = 0;
                addrp->base = db_reg[1][size][rm];
            }
            break;

        case 1:
            get_value_inc(disp, loc, 1, TRUE);
            addrp->disp = disp;
            addrp->base = db_reg[1][size][rm];
            break;

        case 2:
            get_value_inc(disp, loc, 4, FALSE);
            addrp->disp = disp;
            addrp->base = db_reg[1][size][rm];
            break;
    }
    return (loc);
}

static void
db_print_address(seg, size, rex, addrp)
    const char *    seg;
    int             size;
    int             rex;
    struct i_addr * addrp;
{
    if (addrp->is_reg) {
        db_printf("%s", db_reg[rex != 0 ? 1 : 0]
            [(size == LONG && (rex & REX_W)) ? QUAD : size][addrp->disp]);
        return;
    }

    if (seg) {
        db_printf("%s:", seg);
    }

-   if (addrp->disp != 0 || (addrp->base == 0 && addrp->index == 0))
+   if (addrp->disp != 0 || (addrp->base == NULL && addrp->index == NULL))
        db_printsym((db_addr_t)addrp->disp, DB_STGY_ANY);
-   if (addrp->base != 0 || addrp->index != 0) {
+   if (addrp->base != NULL || addrp->index != NULL) {
        db_printf("(");
        if (addrp->base)
            db_printf("%s", addrp->base);
        if (addrp->index)
            db_printf(",%s,%d", addrp->index, 1 << addrp->ss);
        db_printf(")");
    }
}

/*
 * Disassemble floating-point ("escape") instruction
 * and return updated location.
 */
static db_addr_t
db_disasm_esc(loc, inst, rex, short_addr, size, seg)
    db_addr_t       loc;
    int             inst;
    int             rex;
    int             short_addr;
    int             size;
    const char *    seg;
{
    int             regmodrm;
    const struct finst *    fp;
    int             mod;
    struct i_addr   address;
    const char *    name;

    get_value_inc(regmodrm, loc, 1, FALSE);
    fp = &db_Esc_inst[inst - 0xd8][f_reg(rex, regmodrm)];
    mod = f_mod(rex, regmodrm);
    if (mod != 3) {
        if (*fp->f_name == '\0') {
            db_printf("");
            return (loc);
        }
        /*
         * Normal address modes.
         */
        loc = db_read_address(loc, short_addr, rex, regmodrm, &address);
        db_printf("%s", fp->f_name);
        switch(fp->f_size) {
            case SNGL:
                db_printf("s");
                break;
            case DBLR:
                db_printf("l");
                break;
            case EXTR:
                db_printf("t");
                break;
            case WORD:
                db_printf("s");
                break;
            case LONG:
                db_printf("l");
                break;
            case QUAD:
                db_printf("q");
                break;
            default:
                break;
        }
        db_printf("\t");
        db_print_address(seg, BYTE, rex, &address);
    }
    else {
        /*
         * 'reg-reg' - special formats
         */
        switch (fp->f_rrmode) {
            case op2(ST,STI):
                name = (fp->f_rrname) ? fp->f_rrname : fp->f_name;
                db_printf("%s\t%%st,%%st(%d)",name,f_rm(rex, regmodrm));
                break;
            case op2(STI,ST):
                name = (fp->f_rrname) ? fp->f_rrname : fp->f_name;
                db_printf("%s\t%%st(%d),%%st",name, f_rm(rex, regmodrm));
                break;
            case op1(STI):
                name = (fp->f_rrname) ?
fp->f_rrname : fp->f_name; db_printf("%s\t%%st(%d)",name, f_rm(rex, regmodrm)); break; case op1(X): name = ((const char * const *)fp->f_rrname)[f_rm(rex, regmodrm)]; if (*name == '\0') goto bad; db_printf("%s", name); break; case op1(XA): name = ((const char * const *)fp->f_rrname)[f_rm(rex, regmodrm)]; if (*name == '\0') goto bad; db_printf("%s\t%%ax", name); break; default: bad: db_printf(""); break; } } return (loc); } /* * Disassemble instruction at 'loc'. 'altfmt' specifies an * (optional) alternate format. Return address of start of * next instruction. */ db_addr_t db_disasm(db_addr_t loc, bool altfmt) { int inst; int size; int short_addr; const char * seg; const struct inst * ip; const char * i_name; int i_size; int i_mode; int rex = 0; int regmodrm = 0; boolean_t first; int displ; int prefix; int rep; int imm; int imm2; long imm64; int len; struct i_addr address; get_value_inc(inst, loc, 1, FALSE); short_addr = FALSE; size = LONG; - seg = 0; + seg = NULL; /* * Get prefixes */ rep = FALSE; prefix = TRUE; do { switch (inst) { case 0x66: /* data16 */ size = WORD; break; case 0x67: short_addr = TRUE; break; case 0x26: seg = "%es"; break; case 0x36: seg = "%ss"; break; case 0x2e: seg = "%cs"; break; case 0x3e: seg = "%ds"; break; case 0x64: seg = "%fs"; break; case 0x65: seg = "%gs"; break; case 0xf0: db_printf("lock "); break; case 0xf2: db_printf("repne "); break; case 0xf3: rep = TRUE; break; default: prefix = FALSE; break; } if (inst >= 0x40 && inst < 0x50) { rex = inst; prefix = TRUE; } if (prefix) { get_value_inc(inst, loc, 1, FALSE); } } while (prefix); if (inst >= 0xd8 && inst <= 0xdf) { loc = db_disasm_esc(loc, inst, rex, short_addr, size, seg); db_printf("\n"); return (loc); } ip = &db_inst_table[inst]; while (ip->i_size == ESC) { get_value_inc(inst, loc, 1, FALSE); ip = ((const struct inst * const *)ip->i_extra)[inst>>4]; - if (ip == 0) { + if (ip == NULL) { ip = &db_bad_inst; } else { ip = &ip[inst&0xf]; } } if (ip->i_has_modrm) { get_value_inc(regmodrm, loc, 1, FALSE); loc = db_read_address(loc, short_addr, rex, regmodrm, &address); } i_name = ip->i_name; i_size = ip->i_size; i_mode = ip->i_mode; if (ip->i_extra == db_Grp9 && f_mod(rex, regmodrm) == 3) { ip = &db_Grp9b[f_reg(rex, regmodrm)]; i_name = ip->i_name; i_size = ip->i_size; i_mode = ip->i_mode; } else if (ip->i_extra == db_Grp1 || ip->i_extra == db_Grp2 || ip->i_extra == db_Grp6 || ip->i_extra == db_Grp7 || ip->i_extra == db_Grp8 || ip->i_extra == db_Grp9 || ip->i_extra == db_Grp15) { i_name = ((const char * const *)ip->i_extra)[f_reg(rex, regmodrm)]; } else if (ip->i_extra == db_Grp3) { ip = ip->i_extra; ip = &ip[f_reg(rex, regmodrm)]; i_name = ip->i_name; i_mode = ip->i_mode; } else if (ip->i_extra == db_Grp4 || ip->i_extra == db_Grp5) { ip = ip->i_extra; ip = &ip[f_reg(rex, regmodrm)]; i_name = ip->i_name; i_mode = ip->i_mode; i_size = ip->i_size; } /* Special cases that don't fit well in the tables. 
*/ if (ip->i_extra == db_Grp7 && f_mod(rex, regmodrm) == 3) { switch (regmodrm) { case 0xc1: i_name = "vmcall"; i_size = NONE; i_mode = 0; break; case 0xc2: i_name = "vmlaunch"; i_size = NONE; i_mode = 0; break; case 0xc3: i_name = "vmresume"; i_size = NONE; i_mode = 0; break; case 0xc4: i_name = "vmxoff"; i_size = NONE; i_mode = 0; break; case 0xc8: i_name = "monitor"; i_size = NONE; i_mode = 0; break; case 0xc9: i_name = "mwait"; i_size = NONE; i_mode = 0; break; case 0xca: i_name = "clac"; i_size = NONE; i_mode = 0; break; case 0xcb: i_name = "stac"; i_size = NONE; i_mode = 0; break; case 0xd0: i_name = "xgetbv"; i_size = NONE; i_mode = 0; break; case 0xd1: i_name = "xsetbv"; i_size = NONE; i_mode = 0; break; case 0xd8: i_name = "vmrun"; i_size = NONE; i_mode = 0; break; case 0xd9: i_name = "vmmcall"; i_size = NONE; i_mode = 0; break; case 0xda: i_name = "vmload"; i_size = NONE; i_mode = 0; break; case 0xdb: i_name = "vmsave"; i_size = NONE; i_mode = 0; break; case 0xdc: i_name = "stgi"; i_size = NONE; i_mode = 0; break; case 0xdd: i_name = "clgi"; i_size = NONE; i_mode = 0; break; case 0xde: i_name = "skinit"; i_size = NONE; i_mode = 0; break; case 0xdf: i_name = "invlpga"; i_size = NONE; i_mode = 0; break; case 0xf8: i_name = "swapgs"; i_size = NONE; i_mode = 0; break; case 0xf9: i_name = "rdtscp"; i_size = NONE; i_mode = 0; break; } } if (ip->i_extra == db_Grp15 && f_mod(rex, regmodrm) == 3) { i_name = db_Grp15b[f_reg(rex, regmodrm)]; i_size = NONE; i_mode = 0; } /* Handle instructions identified by mandatory prefixes. */ if (rep == TRUE) { if (inst == 0x90) { i_name = "pause"; i_size = NONE; i_mode = 0; rep = FALSE; } else if (ip->i_extra == db_Grp9 && f_mod(rex, regmodrm) != 3 && f_reg(rex, regmodrm) == 0x6) { i_name = "vmxon"; rep = FALSE; } } if (size == WORD) { if (ip->i_extra == db_Grp9 && f_mod(rex, regmodrm) != 3 && f_reg(rex, regmodrm) == 0x6) { i_name = "vmclear"; } } if (rex & REX_W) { if (strcmp(i_name, "cwde") == 0) i_name = "cdqe"; else if (strcmp(i_name, "cmpxchg8b") == 0) i_name = "cmpxchg16b"; } if (rep == TRUE) db_printf("repe "); /* XXX repe VS rep */ if (i_size == SDEP) { if (size == LONG) db_printf("%s", i_name); else db_printf("%s", (const char *)ip->i_extra); } else if (i_size == ADEP) { if (short_addr == FALSE) db_printf("%s", i_name); else db_printf("%s", (const char *)ip->i_extra); } else { db_printf("%s", i_name); if ((inst >= 0x50 && inst <= 0x5f) || inst == 0x68 || inst == 0x6a) { i_size = NONE; db_printf("q"); } if (i_size != NONE) { if (i_size == BYTE) { db_printf("b"); size = BYTE; } else if (i_size == WORD) { db_printf("w"); size = WORD; } else if (size == WORD) db_printf("w"); else { if (rex & REX_W) db_printf("q"); else db_printf("l"); } } } db_printf("\t"); for (first = TRUE; i_mode != 0; i_mode >>= 8, first = FALSE) { if (!first) db_printf(","); switch (i_mode & 0xFF) { case E: db_print_address(seg, size, rex, &address); break; case Eind: db_printf("*"); db_print_address(seg, size, rex, &address); break; case El: db_print_address(seg, (rex & REX_W) ? QUAD : LONG, rex, &address); break; case EL: db_print_address(seg, LONG, 0, &address); break; case Ew: db_print_address(seg, WORD, rex, &address); break; case Eb: db_print_address(seg, BYTE, rex, &address); break; case R: db_printf("%s", db_reg[rex != 0 ? 1 : 0][(size == LONG && (rex & REX_W)) ? QUAD : size][f_reg(rex, regmodrm)]); break; case Rw: db_printf("%s", db_reg[rex != 0 ? 1 : 0][WORD][f_reg(rex, regmodrm)]); break; case Rq: db_printf("%s", db_reg[rex != 0 ? 
1 : 0][QUAD][f_reg(rex, regmodrm)]); break; case Ri: db_printf("%s", db_reg[0][QUAD][f_rm(rex, inst)]); break; case Ril: db_printf("%s", db_reg[rex != 0 ? 1 : 0][(rex & REX_R) ? QUAD : LONG][f_rm(rex, inst)]); break; case Rv: db_printf("%s", db_reg[rex != 0 ? 1 : 0][(size == LONG && (rex & REX_W)) ? QUAD : size][f_rm(rex, regmodrm)]); break; case S: db_printf("%s", db_seg_reg[f_reg(rex, regmodrm)]); break; case Si: db_printf("%s", db_seg_reg[f_reg(rex, inst)]); break; case A: db_printf("%s", db_reg[rex != 0 ? 1 : 0][size][0]); /* acc */ break; case BX: if (seg) db_printf("%s:", seg); db_printf("(%s)", short_addr ? "%bx" : "%ebx"); break; case CL: db_printf("%%cl"); break; case DX: db_printf("%%dx"); break; case SI: if (seg) db_printf("%s:", seg); db_printf("(%s)", short_addr ? "%si" : "%rsi"); break; case DI: db_printf("%%es:(%s)", short_addr ? "%di" : "%rdi"); break; case CR: db_printf("%%cr%d", f_reg(rex, regmodrm)); break; case DR: db_printf("%%dr%d", f_reg(rex, regmodrm)); break; case TR: db_printf("%%tr%d", f_reg(rex, regmodrm)); break; case I: len = db_lengths[size]; get_value_inc(imm, loc, len, FALSE); db_printf("$%#r", imm); break; case Is: len = db_lengths[(size == LONG && (rex & REX_W)) ? QUAD : size]; get_value_inc(imm, loc, len, FALSE); db_printf("$%+#r", imm); break; case Ib: get_value_inc(imm, loc, 1, FALSE); db_printf("$%#r", imm); break; case Iba: get_value_inc(imm, loc, 1, FALSE); if (imm != 0x0a) db_printf("$%#r", imm); break; case Ibs: get_value_inc(imm, loc, 1, TRUE); if (size == WORD) imm &= 0xFFFF; db_printf("$%+#r", imm); break; case Iw: get_value_inc(imm, loc, 2, FALSE); db_printf("$%#r", imm); break; case Ilq: len = db_lengths[rex & REX_W ? QUAD : LONG]; get_value_inc(imm64, loc, len, FALSE); db_printf("$%#lr", imm64); break; case O: len = (short_addr ? 2 : 4); get_value_inc(displ, loc, len, FALSE); if (seg) db_printf("%s:%+#r",seg, displ); else db_printsym((db_addr_t)displ, DB_STGY_ANY); break; case Db: get_value_inc(displ, loc, 1, TRUE); displ += loc; if (size == WORD) displ &= 0xFFFF; db_printsym((db_addr_t)displ, DB_STGY_XTRN); break; case Dl: len = db_lengths[(size == LONG && (rex & REX_W)) ? QUAD : size]; get_value_inc(displ, loc, len, FALSE); displ += loc; if (size == WORD) displ &= 0xFFFF; db_printsym((db_addr_t)displ, DB_STGY_XTRN); break; case o1: db_printf("$1"); break; case o3: db_printf("$3"); break; case OS: len = db_lengths[size]; get_value_inc(imm, loc, len, FALSE); /* offset */ get_value_inc(imm2, loc, 2, FALSE); /* segment */ db_printf("$%#r,%#r", imm2, imm); break; } } db_printf("\n"); return (loc); } Index: head/sys/amd64/amd64/pmap.c =================================================================== --- head/sys/amd64/amd64/pmap.c (revision 314067) +++ head/sys/amd64/amd64/pmap.c (revision 314068) @@ -1,7299 +1,7299 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #define AMD64_NPT_AWARE #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. 
However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_pmap.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif static __inline boolean_t pmap_type_guest(pmap_t pmap) { return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI)); } static __inline boolean_t pmap_emulate_ad_bits(pmap_t pmap) { return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0); } static __inline pt_entry_t pmap_valid_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_V; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_EMUL_V; else mask = EPT_PG_READ; break; default: panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_rw_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_RW; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_EMUL_RW; else mask = EPT_PG_WRITE; break; default: panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_global_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: mask = X86_PG_G; break; case PT_RVI: case PT_EPT: mask = 0; break; default: panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_accessed_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_A; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_READ; else mask = EPT_PG_A; break; default: panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_modified_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_M; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_WRITE; else mask = EPT_PG_M; break; default: panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } extern struct pcpu __pcpu[]; #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pa_index(pa) ((pa) >> PDRSHIFT) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) #define NPV_LIST_LOCKS MAXCPU #define PHYS_TO_PV_LIST_LOCK(pa) \ (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ struct rwlock **_lockp = (lockp); \ struct rwlock *_new_lock; \ \ _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ if (_new_lock != *_lockp) { \ if (*_lockp != NULL) \ rw_wunlock(*_lockp); \ *_lockp = _new_lock; \ rw_wlock(*_lockp); \ } \ } 
while (0) #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) #define RELEASE_PV_LIST_LOCK(lockp) do { \ struct rwlock **_lockp = (lockp); \ \ if (*_lockp != NULL) { \ rw_wunlock(*_lockp); \ *_lockp = NULL; \ } \ } while (0) #define VM_PAGE_TO_PV_LIST_LOCK(m) \ PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) struct pmap kernel_pmap_store; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ int nkpt; SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0, "Number of kernel page table pages allocated on bootup"); static int ndmpdp; vm_paddr_t dmaplimit; vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; pt_entry_t pg_nx; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int pat_works = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1, "Is page attribute table fully functional?"); static int pg_ps_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pg_ps_enabled, 0, "Are large page mappings enabled?"); #define PAT_INDEX_SIZE 8 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ static u_int64_t KPTphys; /* phys addr of kernel level 1 */ static u_int64_t KPDphys; /* phys addr of kernel level 2 */ u_int64_t KPDPphys; /* phys addr of kernel level 3 */ u_int64_t KPML4phys; /* phys addr of kernel level 4 */ static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ static int ndmpdpphys; /* number of DMPDPphys pages */ /* * pmap_mapdev support pre initialization (i.e. console) */ #define PMAP_PREINIT_MAPPING_COUNT 8 static struct pmap_preinit_mapping { vm_paddr_t pa; vm_offset_t va; vm_size_t sz; int mode; } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; static int pmap_initialized; /* * Data for the pv entry allocation mechanism. * Updates to pv_invl_gen are protected by the pv_list_locks[] * elements, but reads are not. */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static struct mtx pv_chunks_mutex; static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; static u_long pv_invl_gen[NPV_LIST_LOCKS]; static struct md_page *pv_table; static struct md_page pv_dummy; /* * All those kernel PT submaps that BSD is so fond of */ -pt_entry_t *CMAP1 = 0; +pt_entry_t *CMAP1 = NULL; caddr_t CADDR1 = 0; static vm_offset_t qframe = 0; static struct mtx qframe_mtx; static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */ int pmap_pcid_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?"); int invpcid_works = 0; SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0, "Is the invpcid instruction available ?"); static int pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS) { int i; uint64_t res; res = 0; CPU_FOREACH(i) { res += cpuid_to_pcpu[i]->pc_pm_save_cnt; } return (sysctl_handle_64(oidp, &res, 0, req)); } SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU", "Count of saved TLB context on switch"); static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker = LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker); static struct mtx invl_gen_mtx; static u_long pmap_invl_gen = 0; /* Fake lock object to satisfy turnstiles interface. 
*/ static struct lock_object invl_gen_ts = { .lo_name = "invlts", }; #define PMAP_ASSERT_NOT_IN_DI() \ KASSERT(curthread->td_md.md_invl_gen.gen == 0, ("DI already started")) /* * Start a new Delayed Invalidation (DI) block of code, executed by * the current thread. Within a DI block, the current thread may * destroy both the page table and PV list entries for a mapping and * then release the corresponding PV list lock before ensuring that * the mapping is flushed from the TLBs of any processors with the * pmap active. */ static void pmap_delayed_invl_started(void) { struct pmap_invl_gen *invl_gen; u_long currgen; invl_gen = &curthread->td_md.md_invl_gen; PMAP_ASSERT_NOT_IN_DI(); mtx_lock(&invl_gen_mtx); if (LIST_EMPTY(&pmap_invl_gen_tracker)) currgen = pmap_invl_gen; else currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen; invl_gen->gen = currgen + 1; LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link); mtx_unlock(&invl_gen_mtx); } /* * Finish the DI block, previously started by the current thread. All * required TLB flushes for the pages marked by * pmap_delayed_invl_page() must be finished before this function is * called. * * This function works by bumping the global DI generation number to * the generation number of the current thread's DI, unless there is a * pending DI that started earlier. In the latter case, bumping the * global DI generation number would incorrectly signal that the * earlier DI had finished. Instead, this function bumps the earlier * DI's generation number to match the generation number of the * current thread's DI. */ static void pmap_delayed_invl_finished(void) { struct pmap_invl_gen *invl_gen, *next; struct turnstile *ts; invl_gen = &curthread->td_md.md_invl_gen; KASSERT(invl_gen->gen != 0, ("missed invl_started")); mtx_lock(&invl_gen_mtx); next = LIST_NEXT(invl_gen, link); if (next == NULL) { turnstile_chain_lock(&invl_gen_ts); ts = turnstile_lookup(&invl_gen_ts); pmap_invl_gen = invl_gen->gen; if (ts != NULL) { turnstile_broadcast(ts, TS_SHARED_QUEUE); turnstile_unpend(ts, TS_SHARED_LOCK); } turnstile_chain_unlock(&invl_gen_ts); } else { next->gen = invl_gen->gen; } LIST_REMOVE(invl_gen, link); mtx_unlock(&invl_gen_mtx); invl_gen->gen = 0; } #ifdef PV_STATS static long invl_wait; SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0, "Number of times DI invalidation blocked pmap_remove_all/write"); #endif static u_long * pmap_delayed_invl_genp(vm_page_t m) { return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]); } /* * Ensure that all currently executing DI blocks, that need to flush * TLB for the given page m, actually flushed the TLB at the time the * function returned. If the page m has an empty PV list and we call * pmap_delayed_invl_wait(), upon its return we know that no CPU has a * valid mapping for the page m in either its page table or TLB. * * This function works by blocking until the global DI generation * number catches up with the generation number associated with the * given page m and its PV list. Since this function's callers * typically own an object lock and sometimes own a page lock, it * cannot sleep. Instead, it blocks on a turnstile to relinquish the * processor. 
*/ static void pmap_delayed_invl_wait(vm_page_t m) { struct thread *td; struct turnstile *ts; u_long *m_gen; #ifdef PV_STATS bool accounted = false; #endif td = curthread; m_gen = pmap_delayed_invl_genp(m); while (*m_gen > pmap_invl_gen) { #ifdef PV_STATS if (!accounted) { atomic_add_long(&invl_wait, 1); accounted = true; } #endif ts = turnstile_trywait(&invl_gen_ts); if (*m_gen > pmap_invl_gen) turnstile_wait(ts, NULL, TS_SHARED_QUEUE); else turnstile_cancel(ts); } } /* * Mark the page m's PV list as participating in the current thread's * DI block. Any threads concurrently using m's PV list to remove or * restrict all mappings to m will wait for the current thread's DI * block to complete before proceeding. * * The function works by setting the DI generation number for m's PV * list to at least the DI generation number of the current thread. * This forces a caller of pmap_delayed_invl_wait() to block until * current thread calls pmap_delayed_invl_finished(). */ static void pmap_delayed_invl_page(vm_page_t m) { u_long gen, *m_gen; rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED); gen = curthread->td_md.md_invl_gen.gen; if (gen == 0) return; m_gen = pmap_delayed_invl_genp(m); if (*m_gen < gen) *m_gen = gen; } /* * Crashdump maps. */ static caddr_t crashdumpmap; static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); static int popcnt_pc_map_pq(uint64_t *map); static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp); static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp); static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp); static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode); static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp); static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va); static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask); static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp); static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot); static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask); static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free, struct rwlock **lockp); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); static vm_page_t 
pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, struct spglist *free); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp); static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde); static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde); static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp); static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); static vm_offset_t pmap_kmem_choose(vm_offset_t addr); /* * Move the kernel virtual free pointer to the next * 2MB. This is used to help improve performance * by using a large (2MB) page for much of the kernel * (.text, .data, .bss) */ static vm_offset_t pmap_kmem_choose(vm_offset_t addr) { vm_offset_t newaddr = addr; newaddr = roundup2(addr, NBPDR); return (newaddr); } /********************/ /* Inline functions */ /********************/ /* Return a non-clipped PD index for a given VA */ static __inline vm_pindex_t pmap_pde_pindex(vm_offset_t va) { return (va >> PDRSHIFT); } /* Return a pointer to the PML4 slot that corresponds to a VA */ static __inline pml4_entry_t * pmap_pml4e(pmap_t pmap, vm_offset_t va) { return (&pmap->pm_pml4[pmap_pml4e_index(va)]); } /* Return a pointer to the PDP slot that corresponds to a VA */ static __inline pdp_entry_t * pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va) { pdp_entry_t *pdpe; pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME); return (&pdpe[pmap_pdpe_index(va)]); } /* Return a pointer to the PDP slot that corresponds to a VA */ static __inline pdp_entry_t * pmap_pdpe(pmap_t pmap, vm_offset_t va) { pml4_entry_t *pml4e; pt_entry_t PG_V; PG_V = pmap_valid_bit(pmap); pml4e = pmap_pml4e(pmap, va); if ((*pml4e & PG_V) == 0) return (NULL); return (pmap_pml4e_to_pdpe(pml4e, va)); } /* Return a pointer to the PD slot that corresponds to a VA */ static __inline pd_entry_t * pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va) { pd_entry_t *pde; pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME); return (&pde[pmap_pde_index(va)]); } /* Return a pointer to the PD slot that corresponds to a VA */ static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va) { pdp_entry_t *pdpe; pt_entry_t PG_V; PG_V = pmap_valid_bit(pmap); pdpe = pmap_pdpe(pmap, va); if (pdpe == NULL || (*pdpe & PG_V) == 0) return (NULL); return (pmap_pdpe_to_pde(pdpe, va)); } /* Return a pointer to the PT slot that corresponds to a VA */ static __inline pt_entry_t * pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) { pt_entry_t *pte; pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); return (&pte[pmap_pte_index(va)]); } /* Return a pointer to the PT slot that corresponds to a VA */ static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t *pde; pt_entry_t PG_V; PG_V = pmap_valid_bit(pmap); pde = pmap_pde(pmap, va); if (pde == NULL || (*pde & PG_V) == 0) return (NULL); if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */ return ((pt_entry_t *)pde); return (pmap_pde_to_pte(pde, va)); } static __inline void pmap_resident_count_inc(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); 
pmap->pm_stats.resident_count += count; } static __inline void pmap_resident_count_dec(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(pmap->pm_stats.resident_count >= count, ("pmap %p resident count underflow %ld %d", pmap, pmap->pm_stats.resident_count, count)); pmap->pm_stats.resident_count -= count; } PMAP_INLINE pt_entry_t * vtopte(vm_offset_t va) { u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va)); return (PTmap + ((va >> PAGE_SHIFT) & mask)); } static __inline pd_entry_t * vtopde(vm_offset_t va) { u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va)); return (PDmap + ((va >> PDRSHIFT) & mask)); } static u_int64_t allocpages(vm_paddr_t *firstaddr, int n) { u_int64_t ret; ret = *firstaddr; bzero((void *)ret, n * PAGE_SIZE); *firstaddr += n * PAGE_SIZE; return (ret); } CTASSERT(powerof2(NDMPML4E)); /* number of kernel PDP slots */ #define NKPDPE(ptpgs) howmany(ptpgs, NPDEPG) static void nkpt_init(vm_paddr_t addr) { int pt_pages; #ifdef NKPT pt_pages = NKPT; #else pt_pages = howmany(addr, 1 << PDRSHIFT); pt_pages += NKPDPE(pt_pages); /* * Add some slop beyond the bare minimum required for bootstrapping * the kernel. * * This is quite important when allocating KVA for kernel modules. * The modules are required to be linked in the negative 2GB of * the address space. If we run out of KVA in this region then * pmap_growkernel() will need to allocate page table pages to map * the entire 512GB of KVA space which is an unnecessary tax on * physical memory. * * Secondly, device memory mapped as part of setting up the low- * level console(s) is taken from KVA, starting at virtual_avail. * This is because cninit() is called after pmap_bootstrap() but * before vm_init() and pmap_init(). 20MB for a frame buffer is * not uncommon. */ pt_pages += 32; /* 64MB additional slop. */ #endif nkpt = pt_pages; } static void create_pagetables(vm_paddr_t *firstaddr) { int i, j, ndm1g, nkpdpe; pt_entry_t *pt_p; pd_entry_t *pd_p; pdp_entry_t *pdp_p; pml4_entry_t *p4_p; /* Allocate page table pages for the direct map */ ndmpdp = howmany(ptoa(Maxmem), NBPDP); if (ndmpdp < 4) /* Minimum 4GB of dirmap */ ndmpdp = 4; ndmpdpphys = howmany(ndmpdp, NPDPEPG); if (ndmpdpphys > NDMPML4E) { /* * Each NDMPML4E allows 512 GB, so limit to that, * and then readjust ndmpdp and ndmpdpphys. */ printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512); Maxmem = atop(NDMPML4E * NBPML4); ndmpdpphys = NDMPML4E; ndmpdp = NDMPML4E * NPDEPG; } DMPDPphys = allocpages(firstaddr, ndmpdpphys); ndm1g = 0; if ((amd_feature & AMDID_PAGE1GB) != 0) ndm1g = ptoa(Maxmem) >> PDPSHIFT; if (ndm1g < ndmpdp) DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g); dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; /* Allocate pages */ KPML4phys = allocpages(firstaddr, 1); KPDPphys = allocpages(firstaddr, NKPML4E); /* * Allocate the initial number of kernel page table pages required to * bootstrap. We defer this until after all memory-size dependent * allocations are done (e.g. direct map), so that we don't have to * build in too much slop in our estimate. * * Note that when NKPML4E > 1, we have an empty page underneath * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed) * pages. (pmap_enter requires a PD page to exist for each KPML4E.) 
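 *
 * A tiny userspace model of the nkpt estimate made by nkpt_init() above:
 * enough page-table pages to map the early allocations, the page-directory
 * pages needed to hold them, and the fixed slop. The EX_-prefixed
 * constants are assumed values for the example, not kernel definitions.
 */
#if 0	/* illustrative sketch; not compiled */
#include <stdio.h>

#define EX_PDRSHIFT	21		/* each PT page maps 2MB */
#define EX_NPDEPG	512		/* PDEs per page-directory page */
#define ex_howmany(x, y)	(((x) + (y) - 1) / (y))

static unsigned long
ex_nkpt(unsigned long physfree)
{
	unsigned long pt_pages;

	pt_pages = ex_howmany(physfree, 1UL << EX_PDRSHIFT);
	pt_pages += ex_howmany(pt_pages, EX_NPDEPG);	/* their PD pages */
	pt_pages += 32;			/* 64MB of slop, as described above */
	return (pt_pages);
}

int
main(void)
{
	/* 1GB of early allocations: 512 PT pages + 1 PD page + 32 slop. */
	printf("nkpt estimate: %lu\n", ex_nkpt(1UL << 30));
	return (0);
}
#endif
/*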
*/ nkpt_init(*firstaddr); nkpdpe = NKPDPE(nkpt); KPTphys = allocpages(firstaddr, nkpt); KPDphys = allocpages(firstaddr, nkpdpe); /* Fill in the underlying page table pages */ /* Nominally read-only (but really R/W) from zero to physfree */ /* XXX not fully used, underneath 2M pages */ pt_p = (pt_entry_t *)KPTphys; for (i = 0; ptoa(i) < *firstaddr; i++) pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G; /* Now map the page tables at their location within PTmap */ pd_p = (pd_entry_t *)KPDphys; for (i = 0; i < nkpt; i++) pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V; /* Map from zero to end of allocations under 2M pages */ /* This replaces some of the KPTphys entries above */ for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G; /* And connect up the PD to the PDP (leaving room for L4 pages) */ pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE)); for (i = 0; i < nkpdpe; i++) pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V | PG_U; /* * Now, set up the direct map region using 2MB and/or 1GB pages. If * the end of physical memory is not aligned to a 1GB page boundary, * then the residual physical memory is mapped with 2MB pages. Later, * if pmap_mapdev{_attr}() uses the direct map for non-write-back * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings * that are partially used. */ pd_p = (pd_entry_t *)DMPDphys; for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) { pd_p[j] = (vm_paddr_t)i << PDRSHIFT; /* Preset PG_M and PG_A because demotion expects it. */ pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G | X86_PG_M | X86_PG_A; } pdp_p = (pdp_entry_t *)DMPDPphys; for (i = 0; i < ndm1g; i++) { pdp_p[i] = (vm_paddr_t)i << PDPSHIFT; /* Preset PG_M and PG_A because demotion expects it. */ pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G | X86_PG_M | X86_PG_A; } for (j = 0; i < ndmpdp; i++, j++) { pdp_p[i] = DMPDphys + ptoa(j); pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U; } /* And recursively map PML4 to itself in order to get PTmap */ p4_p = (pml4_entry_t *)KPML4phys; p4_p[PML4PML4I] = KPML4phys; p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | PG_U; /* Connect the Direct Map slot(s) up to the PML4. */ for (i = 0; i < ndmpdpphys; i++) { p4_p[DMPML4I + i] = DMPDPphys + ptoa(i); p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | PG_U; } /* Connect the KVA slots up to the PML4 */ for (i = 0; i < NKPML4E; i++) { p4_p[KPML4BASE + i] = KPDPphys + ptoa(i); p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V | PG_U; } } /* * Bootstrap the system enough to run with virtual memory. * * On amd64 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(vm_paddr_t *firstaddr) { vm_offset_t va; pt_entry_t *pte; int i; /* * Create an initial set of page tables to run the kernel in. */ create_pagetables(firstaddr); /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated kernel page table pages so that vm_page structures * representing these pages will be created. The vm_page structures * are required for promotion of the corresponding kernel virtual * addresses to superpage mappings. 
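 *
 * A small userspace model of the direct-map sizing just performed by
 * create_pagetables(): 1GB pages cover as much of physical memory as the
 * CPU supports, and the remainder of the (at least 4GB) direct map falls
 * back to 2MB pages. The EX_-prefixed constants are assumed values, not
 * kernel definitions.
 */
#if 0	/* illustrative sketch; not compiled */
#include <stdint.h>
#include <stdio.h>

#define EX_PDPSHIFT	30		/* 1GB */
#define EX_NPDEPG	512		/* 2MB entries per PD page */
#define ex_howmany(x, y)	(((x) + (y) - 1) / (y))

static void
ex_dmap_plan(uint64_t maxphys, int has_1g_pages)
{
	uint64_t ndmpdp, ndm1g, n2m;

	ndmpdp = ex_howmany(maxphys, 1ULL << EX_PDPSHIFT);
	if (ndmpdp < 4)
		ndmpdp = 4;		/* minimum 4GB of direct map */
	ndm1g = has_1g_pages ? maxphys >> EX_PDPSHIFT : 0;
	n2m = (ndmpdp - ndm1g) * EX_NPDEPG;
	printf("%ju PDP slots: %ju 1GB pages, %ju 2MB pages\n",
	    (uintmax_t)ndmpdp, (uintmax_t)ndm1g, (uintmax_t)n2m);
}

int
main(void)
{
	/* 6.5GB of RAM with 1GB pages: 6 1GB pages plus 512 2MB pages. */
	ex_dmap_plan((6ULL << 30) + (512ULL << 20), 1);
	return (0);
}
#endif
/*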
*/ vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); virtual_avail = (vm_offset_t) KERNBASE + *firstaddr; virtual_avail = pmap_kmem_choose(virtual_avail); virtual_end = VM_MAX_KERNEL_ADDRESS; /* XXX do %cr0 as well */ load_cr4(rcr4() | CR4_PGE); load_cr3(KPML4phys); if (cpu_stdext_feature & CPUID_STDEXT_SMEP) load_cr4(rcr4() | CR4_SMEP); /* * Initialize the kernel pmap (which is statically allocated). */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys); kernel_pmap->pm_cr3 = KPML4phys; CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); kernel_pmap->pm_flags = pmap_flags; /* * Initialize the TLB invalidations generation number lock. */ mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF); /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * Crashdump maps. The first page is reused as CMAP1 for the * memory test. */ SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS) CADDR1 = crashdumpmap; virtual_avail = va; /* * Initialize the PAT MSR. * pmap_init_pat() clears and sets CR4_PGE, which, as a * side-effect, invalidates stale PG_G TLB entries that might * have been created in our pre-boot environment. */ pmap_init_pat(); /* Initialize TLB Context Id. */ TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled); if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) { /* Check for INVPCID support */ invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID) != 0; for (i = 0; i < MAXCPU; i++) { kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN; kernel_pmap->pm_pcids[i].pm_gen = 1; } __pcpu[0].pc_pcid_next = PMAP_PCID_KERN + 1; __pcpu[0].pc_pcid_gen = 1; /* * pcpu area for APs is zeroed during AP startup. * pc_pcid_next and pc_pcid_gen are initialized by AP * during pcpu setup. */ load_cr4(rcr4() | CR4_PCIDE); } else { pmap_pcid_enabled = 0; } } /* * Setup the PAT MSR. */ void pmap_init_pat(void) { int pat_table[PAT_INDEX_SIZE]; uint64_t pat_msr; u_long cr0, cr4; int i; /* Bail if this CPU doesn't implement PAT. */ if ((cpu_feature & CPUID_PAT) == 0) panic("no PAT??"); /* Set default PAT index table. */ for (i = 0; i < PAT_INDEX_SIZE; i++) pat_table[i] = -1; pat_table[PAT_WRITE_BACK] = 0; pat_table[PAT_WRITE_THROUGH] = 1; pat_table[PAT_UNCACHEABLE] = 3; pat_table[PAT_WRITE_COMBINING] = 3; pat_table[PAT_WRITE_PROTECTED] = 3; pat_table[PAT_UNCACHED] = 3; /* Initialize default PAT entries. */ pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | PAT_VALUE(2, PAT_UNCACHED) | PAT_VALUE(3, PAT_UNCACHEABLE) | PAT_VALUE(4, PAT_WRITE_BACK) | PAT_VALUE(5, PAT_WRITE_THROUGH) | PAT_VALUE(6, PAT_UNCACHED) | PAT_VALUE(7, PAT_UNCACHEABLE); if (pat_works) { /* * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. * Program 5 and 6 as WP and WC. * Leave 4 and 7 as WB and UC. */ pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6)); pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) | PAT_VALUE(6, PAT_WRITE_COMBINING); pat_table[PAT_UNCACHED] = 2; pat_table[PAT_WRITE_PROTECTED] = 5; pat_table[PAT_WRITE_COMBINING] = 6; } else { /* * Just replace PAT Index 2 with WC instead of UC-. */ pat_msr &= ~PAT_MASK(2); pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); pat_table[PAT_WRITE_COMBINING] = 2; } /* Disable PGE. */ cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); /* Disable caches (CD = 1, NW = 0). 
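 *
 * A userspace sketch of how the PAT MSR value programmed above is
 * assembled: eight one-byte memory-type fields, laid out as described in
 * the "pat_works" branch (WB, WT, UC-, UC, WB, WP, WC, UC). The
 * EX_-prefixed macros are local stand-ins; the memory-type encodings are
 * the architectural x86 values.
 */
#if 0	/* illustrative sketch; not compiled */
#include <stdint.h>
#include <stdio.h>

#define EX_PAT_UC	0x00
#define EX_PAT_WC	0x01
#define EX_PAT_WT	0x04
#define EX_PAT_WP	0x05
#define EX_PAT_WB	0x06
#define EX_PAT_UCM	0x07			/* UC- */

/* Entry i occupies bits 8*i .. 8*i+2 of the MSR. */
#define EX_PAT_VALUE(i, m)	((uint64_t)(m) << (8 * (i)))

int
main(void)
{
	uint64_t pat_msr;

	pat_msr = EX_PAT_VALUE(0, EX_PAT_WB) | EX_PAT_VALUE(1, EX_PAT_WT) |
	    EX_PAT_VALUE(2, EX_PAT_UCM) | EX_PAT_VALUE(3, EX_PAT_UC) |
	    EX_PAT_VALUE(4, EX_PAT_WB) | EX_PAT_VALUE(5, EX_PAT_WP) |
	    EX_PAT_VALUE(6, EX_PAT_WC) | EX_PAT_VALUE(7, EX_PAT_UC);
	printf("PAT MSR image: %#018jx\n", (uintmax_t)pat_msr);
	return (0);
}
#endif
/*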
*/ cr0 = rcr0(); load_cr0((cr0 & ~CR0_NW) | CR0_CD); /* Flushes caches and TLBs. */ wbinvd(); invltlb(); /* Update PAT and index table. */ wrmsr(MSR_PAT, pat_msr); for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = pat_table[i]; /* Flush caches and TLBs again. */ wbinvd(); invltlb(); /* Restore caches and PGE. */ load_cr0(cr0); load_cr4(cr4); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pat_mode = PAT_WRITE_BACK; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { struct pmap_preinit_mapping *ppim; vm_page_t mpte; vm_size_t s; int error, i, pv_npg; /* * Initialize the vm page array entries for the kernel pmap's * page table pages. */ for (i = 0; i < nkpt; i++) { mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_init: page table page is out of range")); mpte->pindex = pmap_pde_pindex(KERNBASE) + i; mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); } /* * If the kernel is running on a virtual machine, then it must assume * that MCA is enabled by the hypervisor. Moreover, the kernel must * be prepared for the hypervisor changing the vendor and family that * are reported by CPUID. Consequently, the workaround for AMD Family * 10h Erratum 383 is enabled if the processor's feature set does not * include at least one feature that is only supported by older Intel * or newer AMD processors. */ if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | AMDID2_FMA4)) == 0) workaround_erratum383 = 1; /* * Are large page mappings enabled? */ TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); if (pg_ps_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = NBPDR; } /* * Initialize the pv chunk list mutex. */ mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); /* * Initialize the pool of pv list locks. */ for (i = 0; i < NPV_LIST_LOCKS; i++) rw_init(&pv_list_locks[i], "pmap pv list"); /* * Calculate the size of the pv head table for superpages. */ pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR); /* * Allocate memory for the pv head table for superpages. 
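 *
 * A tiny userspace model of the sizing that follows: one pv head per
 * potential 2MB superpage frame up to the end of the last physical
 * segment, rounded up to whole pages. The value used below for
 * sizeof(struct md_page) is an assumption for the example.
 */
#if 0	/* illustrative sketch; not compiled */
#include <stdio.h>

#define EX_NBPDR	(2UL * 1024 * 1024)
#define EX_PAGE_SIZE	4096UL
#define ex_howmany(x, y)	(((x) + (y) - 1) / (y))
#define ex_round_page(x)	(((x) + EX_PAGE_SIZE - 1) & ~(EX_PAGE_SIZE - 1))

int
main(void)
{
	unsigned long end_of_last_seg = 8UL << 30;	/* e.g. 8GB of RAM */
	unsigned long md_page_size = 32;	/* assumed sizeof(struct md_page) */
	unsigned long pv_npg, bytes;

	pv_npg = ex_howmany(end_of_last_seg, EX_NBPDR);
	bytes = ex_round_page(pv_npg * md_page_size);
	printf("%lu pv heads, %lu bytes of pv_table\n", pv_npg, bytes);
	return (0);
}
#endif
/*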
*/ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(kernel_arena, s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); TAILQ_INIT(&pv_dummy.pv_list); pmap_initialized = 1; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) continue; /* Make the direct map consistent */ if (ppim->pa < dmaplimit && ppim->pa + ppim->sz < dmaplimit) { (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa), ppim->sz, ppim->mode); } if (!bootverbose) continue; printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i, ppim->pa, ppim->va, ppim->sz, ppim->mode); } mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN); error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, (vmem_addr_t *)&qframe); if (error != 0) panic("qframe allocation failed"); } static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, "2MB page mapping counters"); static u_long pmap_pde_demotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pde_demotions, 0, "2MB page demotions"); static u_long pmap_pde_mappings; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, &pmap_pde_mappings, 0, "2MB page mappings"); static u_long pmap_pde_p_failures; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_pde_p_failures, 0, "2MB page promotion failures"); static u_long pmap_pde_promotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, &pmap_pde_promotions, 0, "2MB page promotions"); static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0, "1GB page mapping counters"); static u_long pmap_pdpe_demotions; SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pdpe_demotions, 0, "1GB page demotions"); /*************************************************** * Low level helper routines..... ***************************************************/ static pt_entry_t pmap_swap_pat(pmap_t pmap, pt_entry_t entry) { int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT; switch (pmap->pm_type) { case PT_X86: case PT_RVI: /* Verify that both PAT bits are not set at the same time */ KASSERT((entry & x86_pat_bits) != x86_pat_bits, ("Invalid PAT bits in entry %#lx", entry)); /* Swap the PAT bits if one of them is set */ if ((entry & x86_pat_bits) != 0) entry ^= x86_pat_bits; break; case PT_EPT: /* * Nothing to do - the memory attributes are represented * the same way for regular pages and superpages. */ break; default: panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type); } return (entry); } /* * Determine the appropriate bits to set in a PTE or PDE for a specified * caching mode. */ int pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde) { int cache_bits, pat_flag, pat_idx; if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0) panic("Unknown caching mode %d\n", mode); switch (pmap->pm_type) { case PT_X86: case PT_RVI: /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; /* Map the caching mode to a PAT index. */ pat_idx = pat_index[mode]; /* Map the 3-bit index value into the PAT, PCD, and PWT bits. 
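 *
 * A short userspace sketch of the mapping that follows: the low three
 * bits of the PAT index become the PWT, PCD and PAT bits of a 4KB PTE
 * (the PAT bit sits elsewhere in a PDE). The EX_-prefixed values are the
 * standard x86 PTE bit positions, restated here for the example.
 */
#if 0	/* illustrative sketch; not compiled */
#include <stdio.h>

#define EX_PG_PWT	0x008
#define EX_PG_PCD	0x010
#define EX_PG_PTE_PAT	0x080

static unsigned
ex_cache_bits(unsigned pat_idx)
{
	unsigned bits = 0;

	if (pat_idx & 0x4)
		bits |= EX_PG_PTE_PAT;
	if (pat_idx & 0x2)
		bits |= EX_PG_PCD;
	if (pat_idx & 0x1)
		bits |= EX_PG_PWT;
	return (bits);
}

int
main(void)
{
	/* With the PAT layout above, write-combining is PAT index 6. */
	printf("index 6 -> PTE bits %#x\n", ex_cache_bits(6));
	return (0);
}
#endif
/*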
*/ cache_bits = 0; if (pat_idx & 0x4) cache_bits |= pat_flag; if (pat_idx & 0x2) cache_bits |= PG_NC_PCD; if (pat_idx & 0x1) cache_bits |= PG_NC_PWT; break; case PT_EPT: cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode); break; default: panic("unsupported pmap type %d", pmap->pm_type); } return (cache_bits); } static int pmap_cache_mask(pmap_t pmap, boolean_t is_pde) { int mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE; break; case PT_EPT: mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7); break; default: panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline boolean_t pmap_ps_enabled(pmap_t pmap) { return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); } static void pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde) { switch (pmap->pm_type) { case PT_X86: break; case PT_RVI: case PT_EPT: /* * XXX * This is a little bogus since the generation number is * supposed to be bumped up when a region of the address * space is invalidated in the page tables. * * In this case the old PDE entry is valid but yet we want * to make sure that any mappings using the old entry are * invalidated in the TLB. * * The reason this works as expected is because we rendezvous * "all" host cpus and force any vcpu context to exit as a * side-effect. */ atomic_add_acq_long(&pmap->pm_eptgen, 1); break; default: panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type); } pde_store(pde, newpde); } /* * After changing the page size for the specified virtual address in the page * table, flush the corresponding entries from the processor's TLB. Only the * calling processor's TLB is affected. * * The calling thread must be pinned to a processor. */ static void pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde) { pt_entry_t PG_G; if (pmap_type_guest(pmap)) return; KASSERT(pmap->pm_type == PT_X86, ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type)); PG_G = pmap_global_bit(pmap); if ((newpde & PG_PS) == 0) /* Demotion: flush a specific 2MB page mapping. */ invlpg(va); else if ((newpde & PG_G) == 0) /* * Promotion: flush every 4KB page mapping from the TLB * because there are too many to flush individually. */ invltlb(); else { /* * Promotion: flush every 4KB page mapping from the TLB, * including any global (PG_G) mappings. */ invltlb_glob(); } } #ifdef SMP /* * For SMP, these functions have to use the IPI mechanism for coherence. * * N.B.: Before calling any of the following TLB invalidation functions, * the calling processor must ensure that all stores updating a non- * kernel page table are globally performed. Otherwise, another * processor could cache an old, pre-update entry without being * invalidated. This can happen one of two ways: (1) The pmap becomes * active on another processor after its pm_active field is checked by * one of the following functions but before a store updating the page * table is globally performed. (2) The pmap becomes active on another * processor before its pm_active field is checked but due to * speculative loads one of the following functions stills reads the * pmap as inactive on the other processor. * * The kernel page table is exempt because its pm_active field is * immutable. The kernel page table is always active on every * processor. */ /* * Interrupt the cpus that are executing in the guest context. 
* This will force the vcpu to exit and the cached EPT mappings * will be invalidated by the host before the next vmresume. */ static __inline void pmap_invalidate_ept(pmap_t pmap) { int ipinum; sched_pin(); KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), ("pmap_invalidate_ept: absurd pm_active")); /* * The TLB mappings associated with a vcpu context are not * flushed each time a different vcpu is chosen to execute. * * This is in contrast with a process's vtop mappings that * are flushed from the TLB on each context switch. * * Therefore we need to do more than just a TLB shootdown on * the active cpus in 'pmap->pm_active'. To do this we keep * track of the number of invalidations performed on this pmap. * * Each vcpu keeps a cache of this counter and compares it * just before a vmresume. If the counter is out-of-date an * invept will be done to flush stale mappings from the TLB. */ atomic_add_acq_long(&pmap->pm_eptgen, 1); /* * Force the vcpu to exit and trap back into the hypervisor. */ ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK; ipi_selected(pmap->pm_active, ipinum); sched_unpin(); } void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { cpuset_t *mask; u_int cpuid, i; if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_page: invalid type %d", pmap->pm_type)); sched_pin(); if (pmap == kernel_pmap) { invlpg(va); mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); if (pmap == PCPU_GET(curpmap)) invlpg(va); else if (pmap_pcid_enabled) pmap->pm_pcids[cpuid].pm_gen = 0; if (pmap_pcid_enabled) { CPU_FOREACH(i) { if (cpuid != i) pmap->pm_pcids[i].pm_gen = 0; } } mask = &pmap->pm_active; } smp_masked_invlpg(*mask, va); sched_unpin(); } /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { cpuset_t *mask; vm_offset_t addr; u_int cpuid, i; if (eva - sva >= PMAP_INVLPG_THRESHOLD) { pmap_invalidate_all(pmap); return; } if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_range: invalid type %d", pmap->pm_type)); sched_pin(); cpuid = PCPU_GET(cpuid); if (pmap == kernel_pmap) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); mask = &all_cpus; } else { if (pmap == PCPU_GET(curpmap)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } else if (pmap_pcid_enabled) { pmap->pm_pcids[cpuid].pm_gen = 0; } if (pmap_pcid_enabled) { CPU_FOREACH(i) { if (cpuid != i) pmap->pm_pcids[i].pm_gen = 0; } } mask = &pmap->pm_active; } smp_masked_invlpg_range(*mask, sva, eva); sched_unpin(); } void pmap_invalidate_all(pmap_t pmap) { cpuset_t *mask; struct invpcid_descr d; u_int cpuid, i; if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); sched_pin(); if (pmap == kernel_pmap) { if (pmap_pcid_enabled && invpcid_works) { bzero(&d, sizeof(d)); invpcid(&d, INVPCID_CTXGLOB); } else { invltlb_glob(); } mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); if (pmap == PCPU_GET(curpmap)) { if (pmap_pcid_enabled) { if (invpcid_works) { d.pcid = pmap->pm_pcids[cpuid].pm_pcid; d.pad = 0; d.addr = 0; invpcid(&d, INVPCID_CTX); } else { load_cr3(pmap->pm_cr3 | pmap->pm_pcids [PCPU_GET(cpuid)].pm_pcid); } } else { invltlb(); } } else if (pmap_pcid_enabled) { pmap->pm_pcids[cpuid].pm_gen = 0; } if (pmap_pcid_enabled) { CPU_FOREACH(i) { if 
(cpuid != i) pmap->pm_pcids[i].pm_gen = 0; } } mask = &pmap->pm_active; } smp_masked_invltlb(*mask, pmap); sched_unpin(); } void pmap_invalidate_cache(void) { sched_pin(); wbinvd(); smp_cache_flush(); sched_unpin(); } struct pde_action { cpuset_t invalidate; /* processors that invalidate their TLB */ pmap_t pmap; vm_offset_t va; pd_entry_t *pde; pd_entry_t newpde; u_int store; /* processor that updates the PDE */ }; static void pmap_update_pde_action(void *arg) { struct pde_action *act = arg; if (act->store == PCPU_GET(cpuid)) pmap_update_pde_store(act->pmap, act->pde, act->newpde); } static void pmap_update_pde_teardown(void *arg) { struct pde_action *act = arg; if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) pmap_update_pde_invalidate(act->pmap, act->va, act->newpde); } /* * Change the page size for the specified virtual address in a way that * prevents any possibility of the TLB ever having two entries that map the * same virtual address using different page sizes. This is the recommended * workaround for Erratum 383 on AMD Family 10h processors. It prevents a * machine check exception for a TLB state that is improperly diagnosed as a * hardware error. */ static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { struct pde_action act; cpuset_t active, other_cpus; u_int cpuid; sched_pin(); cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (pmap == kernel_pmap || pmap_type_guest(pmap)) active = all_cpus; else { active = pmap->pm_active; } if (CPU_OVERLAP(&active, &other_cpus)) { act.store = cpuid; act.invalidate = active; act.va = va; act.pmap = pmap; act.pde = pde; act.newpde = newpde; CPU_SET(cpuid, &active); smp_rendezvous_cpus(active, smp_no_rendevous_barrier, pmap_update_pde_action, pmap_update_pde_teardown, &act); } else { pmap_update_pde_store(pmap, pde, newpde); if (CPU_ISSET(cpuid, &active)) pmap_update_pde_invalidate(pmap, va, newpde); } sched_unpin(); } #else /* !SMP */ /* * Normal, non-SMP, invalidation functions. */ void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) invlpg(va); else if (pmap_pcid_enabled) pmap->pm_pcids[0].pm_gen = 0; } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } else if (pmap_pcid_enabled) { pmap->pm_pcids[0].pm_gen = 0; } } void pmap_invalidate_all(pmap_t pmap) { struct invpcid_descr d; if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_all: unknown type %d", pmap->pm_type)); if (pmap == kernel_pmap) { if (pmap_pcid_enabled && invpcid_works) { bzero(&d, sizeof(d)); invpcid(&d, INVPCID_CTXGLOB); } else { invltlb_glob(); } } else if (pmap == PCPU_GET(curpmap)) { if (pmap_pcid_enabled) { if (invpcid_works) { d.pcid = pmap->pm_pcids[0].pm_pcid; d.pad = 0; d.addr = 0; invpcid(&d, INVPCID_CTX); } else { load_cr3(pmap->pm_cr3 | pmap->pm_pcids[0]. 
pm_pcid); } } else { invltlb(); } } else if (pmap_pcid_enabled) { pmap->pm_pcids[0].pm_gen = 0; } } PMAP_INLINE void pmap_invalidate_cache(void) { wbinvd(); } static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { pmap_update_pde_store(pmap, pde, newpde); if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) pmap_update_pde_invalidate(pmap, va, newpde); else pmap->pm_pcids[0].pm_gen = 0; } #endif /* !SMP */ #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) { if (force) { sva &= ~(vm_offset_t)cpu_clflush_line_size; } else { KASSERT((sva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: sva not page-aligned")); KASSERT((eva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: eva not page-aligned")); } if ((cpu_feature & CPUID_SS) != 0 && !force) ; /* If "Self Snoop" is supported and allowed, do nothing. */ else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 && eva - sva < PMAP_CLFLUSH_THRESHOLD) { /* * XXX: Some CPUs fault, hang, or trash the local APIC * registers if we use CLFLUSH on the local APIC * range. The local APIC is always uncached, so we * don't need to flush for that range anyway. */ if (pmap_kextract(sva) == lapic_paddr) return; /* * Otherwise, do per-cache line flush. Use the sfence * instruction to insure that previous stores are * included in the write-back. The processor * propagates flush to other processors in the cache * coherence domain. */ sfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflushopt(sva); sfence(); } else if ((cpu_feature & CPUID_CLFSH) != 0 && eva - sva < PMAP_CLFLUSH_THRESHOLD) { if (pmap_kextract(sva) == lapic_paddr) return; /* * Writes are ordered by CLFLUSH on Intel CPUs. */ if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflush(sva); if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); } else { /* * No targeted cache flush methods are supported by CPU, * or the supplied range is bigger than 2MB. * Globally invalidate cache. */ pmap_invalidate_cache(); } } /* * Remove the specified set of pages from the data and instruction caches. * * In contrast to pmap_invalidate_cache_range(), this function does not * rely on the CPU's self-snoop feature, because it is intended for use * when moving pages into a different cache domain. */ void pmap_invalidate_cache_pages(vm_page_t *pages, int count) { vm_offset_t daddr, eva; int i; bool useclflushopt; useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt)) pmap_invalidate_cache(); else { if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (i = 0; i < count; i++) { daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); eva = daddr + PAGE_SIZE; for (; daddr < eva; daddr += cpu_clflush_line_size) { if (useclflushopt) clflushopt(daddr); else clflush(daddr); } } if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); } } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. 
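 *
 * A userspace sketch of the per-line flush loop used by
 * pmap_invalidate_cache_range() above. The kernel prefers CLFLUSHOPT
 * bracketed by SFENCE when available and only adds MFENCE around plain
 * CLFLUSH on non-Intel CPUs; this simplified version always uses CLFLUSH
 * with MFENCE, and the cache-line size is assumed.
 */
#if 0	/* illustrative sketch; not compiled */
#include <emmintrin.h>		/* _mm_clflush(), _mm_mfence() */
#include <stdint.h>
#include <stdio.h>

#define EX_CLFLUSH_LINE	64	/* assumed cache-line size */

static void
ex_flush_range(const void *sva, const void *eva)
{
	uintptr_t addr;

	addr = (uintptr_t)sva & ~(uintptr_t)(EX_CLFLUSH_LINE - 1);
	_mm_mfence();
	for (; addr < (uintptr_t)eva; addr += EX_CLFLUSH_LINE)
		_mm_clflush((const void *)addr);
	_mm_mfence();
}

int
main(void)
{
	static char buf[4096];

	buf[0] = 1;
	ex_flush_range(buf, buf + sizeof(buf));
	printf("flushed %zu bytes by cache line\n", sizeof(buf));
	return (0);
}
#endif
/*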
*/ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte, PG_V; vm_paddr_t pa; pa = 0; PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); pdpe = pmap_pdpe(pmap, va); if (pdpe != NULL && (*pdpe & PG_V) != 0) { if ((*pdpe & PG_PS) != 0) pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK); else { pde = pmap_pdpe_to_pde(pdpe, va); if ((*pde & PG_V) != 0) { if ((*pde & PG_PS) != 0) { pa = (*pde & PG_PS_FRAME) | (va & PDRMASK); } else { pte = pmap_pde_to_pte(pde, va); pa = (*pte & PG_FRAME) | (va & PAGE_MASK); } } } } PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pd_entry_t pde, *pdep; pt_entry_t pte, PG_RW, PG_V; vm_paddr_t pa; vm_page_t m; pa = 0; m = NULL; PG_RW = pmap_rw_bit(pmap); PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); retry: pdep = pmap_pde(pmap, va); if (pdep != NULL && (pde = *pdep)) { if (pde & PG_PS) { if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { if (vm_page_pa_tryrelock(pmap, (pde & PG_PS_FRAME) | (va & PDRMASK), &pa)) goto retry; m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); vm_page_hold(m); } } else { pte = *pmap_pde_to_pte(pdep, va); if ((pte & PG_V) && ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, &pa)) goto retry; m = PHYS_TO_VM_PAGE(pte & PG_FRAME); vm_page_hold(m); } } } PA_UNLOCK_COND(pa); PMAP_UNLOCK(pmap); return (m); } vm_paddr_t pmap_kextract(vm_offset_t va) { pd_entry_t pde; vm_paddr_t pa; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { pa = DMAP_TO_PHYS(va); } else { pde = *vtopde(va); if (pde & PG_PS) { pa = (pde & PG_PS_FRAME) | (va & PDRMASK); } else { /* * Beware of a concurrent promotion that changes the * PDE at this point! For example, vtopte() must not * be used to access the PTE because it would use the * new PDE. It is, however, safe to use the old PDE * because the page table page is preserved by the * promotion. */ pa = *pmap_pde_to_pte(&pde, va); pa = (pa & PG_FRAME) | (va & PAGE_MASK); } } return (pa); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G); } static __inline void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) { pt_entry_t *pte; int cache_bits; pte = vtopte(va); cache_bits = pmap_cache_bits(kernel_pmap, mode, 0); pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. 
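 *
 * A small userspace model of how pmap_extract() above recombines a
 * physical address once the walk stops at a leaf: the frame from the
 * leaf entry plus however many low VA bits that level leaves
 * untranslated (1GB, 2MB or 4KB worth). The EX_ masks are the usual
 * amd64 values, restated here for the example.
 */
#if 0	/* illustrative sketch; not compiled */
#include <stdint.h>
#include <stdio.h>

#define EX_PAGE_MASK	((1UL << 12) - 1)	/* 4KB PTE leaf */
#define EX_PDRMASK	((1UL << 21) - 1)	/* 2MB PDE leaf */
#define EX_PDPMASK	((1UL << 30) - 1)	/* 1GB PDPE leaf */

static uint64_t
ex_leaf_to_pa(uint64_t frame, uint64_t va, int level)
{
	switch (level) {
	case 3:
		return (frame | (va & EX_PDPMASK));
	case 2:
		return (frame | (va & EX_PDRMASK));
	default:
		return (frame | (va & EX_PAGE_MASK));
	}
}

int
main(void)
{
	/* A 2MB mapping with frame 0x40000000 and VA offset 0x12345. */
	printf("pa = %#jx\n", (uintmax_t)ex_leaf_to_pa(0x40000000UL,
	    0xffff800000012345UL, 2));
	return (0);
}
#endif
/*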
*/ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return PHYS_TO_DMAP(start); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *endpte, oldpte, pa, *pte; vm_page_t m; int cache_bits; oldpte = 0; pte = vtopte(sva); endpte = pte + count; while (pte < endpte) { m = *ma++; cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); pa = VM_PAGE_TO_PHYS(m) | cache_bits; if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) { oldpte |= *pte; pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V); } pte++; } if (__predict_false((oldpte & X86_PG_V) != 0)) pmap_invalidate_range(kernel_pmap, sva, sva + count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va)); pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ static __inline void pmap_free_zero_pages(struct spglist *free) { vm_page_t m; while ((m = SLIST_FIRST(free)) != NULL) { SLIST_REMOVE_HEAD(free, plinks.s.ss); /* Preserve the page's PG_ZERO setting. */ vm_page_free_toq(m); } } /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_insert(&pmap->pm_root, mpte)); } /* * Removes the page table page mapping the specified virtual address from the * specified pmap's collection of idle page table pages, and returns it. * Otherwise, returns NULL if there is no page table page corresponding to the * specified virtual address. */ static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va))); } /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. 
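 *
 * A minimal userspace sketch of the wire-count convention just
 * described: each valid PTE holds a reference on its page-table page,
 * and the page is torn down when the last one goes away. The struct
 * below is a stand-in, not the kernel's vm_page.
 */
#if 0	/* illustrative sketch; not compiled */
#include <stdbool.h>
#include <stdio.h>

struct ex_ptpage {
	unsigned wire_count;	/* number of valid entries in this page */
};

/* Drop one reference; report whether the caller should unmap and free. */
static bool
ex_unwire_ptp(struct ex_ptpage *m)
{
	if (--m->wire_count == 0)
		return (true);
	return (false);
}

int
main(void)
{
	struct ex_ptpage m = { .wire_count = 2 };

	printf("first unwire frees the page: %d\n", ex_unwire_ptp(&m));
	printf("second unwire frees the page: %d\n", ex_unwire_ptp(&m));
	return (0);
}
#endif
/*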
*/ static inline boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_ptp(pmap, va, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * unmap the page table page */ if (m->pindex >= (NUPDE + NUPDPE)) { /* PDP page */ pml4_entry_t *pml4; pml4 = pmap_pml4e(pmap, va); *pml4 = 0; } else if (m->pindex >= NUPDE) { /* PD page */ pdp_entry_t *pdp; pdp = pmap_pdpe(pmap, va); *pdp = 0; } else { /* PTE page */ pd_entry_t *pd; pd = pmap_pde(pmap, va); *pd = 0; } pmap_resident_count_dec(pmap, 1); if (m->pindex < NUPDE) { /* We just released a PT, unhold the matching PD */ vm_page_t pdpg; pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdpg, free); } if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { /* We just released a PD, unhold the matching PDP */ vm_page_t pdppg; pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdppg, free); } /* * This is a release store so that the ordinary store unmapping * the page table page is globally performed before TLB shoot- * down is begun. */ atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1); /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, struct spglist *free) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); return (pmap_unwire_ptp(pmap, va, mpte, free)); } void pmap_pinit0(pmap_t pmap) { int i; PMAP_LOCK_INIT(pmap); pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); pmap->pm_cr3 = KPML4phys; pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap->pm_flags = pmap_flags; CPU_FOREACH(i) { pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; pmap->pm_pcids[i].pm_gen = 0; } PCPU_SET(curpmap, kernel_pmap); pmap_activate(curthread); CPU_FILL(&kernel_pmap->pm_active); } void pmap_pinit_pml4(vm_page_t pml4pg) { pml4_entry_t *pm_pml4; int i; pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); /* Wire in kernel global address entries. */ for (i = 0; i < NKPML4E; i++) { pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW | X86_PG_V | PG_U; } for (i = 0; i < ndmpdpphys; i++) { pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | X86_PG_V | PG_U; } /* install self-referential address mapping entry(s) */ pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. 
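 *
 * A simplified userspace sketch of the idea behind pmap_pinit_pml4()
 * above: a new top-level table shares the kernel's upper slots so every
 * address space sees the same kernel mappings. The real function wires
 * specific KVA, direct-map and self-referential slots; the contiguous
 * copy and the slot number used here are assumptions for the example.
 */
#if 0	/* illustrative sketch; not compiled */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define EX_NPML4E	512
#define EX_PG_V		0x1UL
#define EX_PG_RW	0x2UL

static void
ex_pinit_pml4(uint64_t *new_pml4, const uint64_t *kernel_pml4,
    int first_kernel_slot)
{
	int i;

	memset(new_pml4, 0, EX_NPML4E * sizeof(uint64_t));
	for (i = first_kernel_slot; i < EX_NPML4E; i++)
		new_pml4[i] = kernel_pml4[i];	/* shared kernel mappings */
}

int
main(void)
{
	static uint64_t kpml4[EX_NPML4E], upml4[EX_NPML4E];

	kpml4[511] = 0x100000UL | EX_PG_RW | EX_PG_V;	/* fake kernel slot */
	ex_pinit_pml4(upml4, kpml4, 256);
	printf("user PML4 slot 511 = %#jx\n", (uintmax_t)upml4[511]);
	return (0);
}
#endif
/*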
*/ int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) { vm_page_t pml4pg; vm_paddr_t pml4phys; int i; /* * allocate the page directory page */ while ((pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) VM_WAIT; pml4phys = VM_PAGE_TO_PHYS(pml4pg); pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys); CPU_FOREACH(i) { pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; pmap->pm_pcids[i].pm_gen = 0; } pmap->pm_cr3 = ~0; /* initialize to an invalid value */ if ((pml4pg->flags & PG_ZERO) == 0) pagezero(pmap->pm_pml4); /* * Do not install the host kernel mappings in the nested page * tables. These mappings are meaningless in the guest physical * address space. */ if ((pmap->pm_type = pm_type) == PT_X86) { pmap->pm_cr3 = pml4phys; pmap_pinit_pml4(pml4pg); } pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap->pm_flags = flags; pmap->pm_eptgen = 0; return (1); } int pmap_pinit(pmap_t pmap) { return (pmap_pinit_type(pmap, PT_X86, pmap_flags)); } /* * This routine is called if the desired page table page does not exist. * * If page table page allocation fails, this routine may sleep before * returning NULL. It sleeps only if a lock pointer was given. * * Note: If a page allocation fails at page table level two or three, * one or two pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. */ static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) { vm_page_t m, pdppg, pdpg; pt_entry_t PG_A, PG_M, PG_RW, PG_V; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if (lockp != NULL) { RELEASE_PV_LIST_LOCK(lockp); PMAP_UNLOCK(pmap); PMAP_ASSERT_NOT_IN_DI(); VM_WAIT; PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. 
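 *
 * A short userspace sketch of the pindex space that the following code
 * dispatches on: page-table pages come first, then page-directory pages,
 * then PDP pages. The NUPDE/NUPDPE-style bounds below are assumed values
 * for the example, not the kernel's constants.
 */
#if 0	/* illustrative sketch; not compiled */
#include <stdint.h>
#include <stdio.h>

#define EX_NUPML4E	256UL			/* user PML4 slots (assumed) */
#define EX_NUPDPE	(EX_NUPML4E * 512)	/* user PDP entries */
#define EX_NUPDE	(EX_NUPDPE * 512)	/* user PD entries */

static const char *
ex_ptepindex_level(uint64_t ptepindex)
{
	if (ptepindex >= EX_NUPDE + EX_NUPDPE)
		return ("PDP page, installed in the PML4");
	if (ptepindex >= EX_NUPDE)
		return ("PD page, installed in a PDP page");
	return ("PT page, installed in a PD page");
}

int
main(void)
{
	printf("pindex 0: %s\n", ex_ptepindex_level(0));
	printf("pindex NUPDE: %s\n", ex_ptepindex_level(EX_NUPDE));
	printf("pindex NUPDE+NUPDPE: %s\n",
	    ex_ptepindex_level(EX_NUPDE + EX_NUPDPE));
	return (0);
}
#endif
/*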
*/ if (ptepindex >= (NUPDE + NUPDPE)) { pml4_entry_t *pml4; vm_pindex_t pml4index; /* Wire up a new PDPE page */ pml4index = ptepindex - (NUPDE + NUPDPE); pml4 = &pmap->pm_pml4[pml4index]; *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } else if (ptepindex >= NUPDE) { vm_pindex_t pml4index; vm_pindex_t pdpindex; pml4_entry_t *pml4; pdp_entry_t *pdp; /* Wire up a new PDE page */ pdpindex = ptepindex - NUPDE; pml4index = pdpindex >> NPML4EPGSHIFT; pml4 = &pmap->pm_pml4[pml4index]; if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pdp, recurse */ if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index, lockp) == NULL) { --m->wire_count; atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(m); return (NULL); } } else { /* Add reference to pdp page */ pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); pdppg->wire_count++; } pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); /* Now find the pdp page */ pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } else { vm_pindex_t pml4index; vm_pindex_t pdpindex; pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pd; /* Wire up a new PTE page */ pdpindex = ptepindex >> NPDPEPGSHIFT; pml4index = pdpindex >> NPML4EPGSHIFT; /* First, find the pdp and check that its valid. */ pml4 = &pmap->pm_pml4[pml4index]; if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (_pmap_allocpte(pmap, NUPDE + pdpindex, lockp) == NULL) { --m->wire_count; atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(m); return (NULL); } pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; } else { pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; if ((*pdp & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (_pmap_allocpte(pmap, NUPDE + pdpindex, lockp) == NULL) { --m->wire_count; atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(m); return (NULL); } } else { /* Add reference to the pd page */ pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); pdpg->wire_count++; } } pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); /* Now we know where the page directory page is */ pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } pmap_resident_count_inc(pmap, 1); return (m); } static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t pdpindex, ptepindex; pdp_entry_t *pdpe, PG_V; vm_page_t pdpg; PG_V = pmap_valid_bit(pmap); retry: pdpe = pmap_pdpe(pmap, va); if (pdpe != NULL && (*pdpe & PG_V) != 0) { /* Add a reference to the pd page. */ pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); pdpg->wire_count++; } else { /* Allocate a pd page. */ ptepindex = pmap_pde_pindex(va); pdpindex = ptepindex >> NPDPEPGSHIFT; pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); if (pdpg == NULL && lockp != NULL) goto retry; } return (pdpg); } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t ptepindex; pd_entry_t *pd, PG_V; vm_page_t m; PG_V = pmap_valid_bit(pmap); /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); retry: /* * Get the page directory entry */ pd = pmap_pde(pmap, va); /* * This supports switching from a 2MB page to a * normal 4K page. 
*/ if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) { /* * Invalidation of the 2MB page mapping may have caused * the deallocation of the underlying PD page. */ pd = NULL; } } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (pd != NULL && (*pd & PG_V) != 0) { m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); m->wire_count++; } else { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ m = _pmap_allocpte(pmap, ptepindex, lockp); if (m == NULL && lockp != NULL) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m; int i; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); KASSERT(CPU_EMPTY(&pmap->pm_active), ("releasing active pmap %p", pmap)); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4)); for (i = 0; i < NKPML4E; i++) /* KVA */ pmap->pm_pml4[KPML4BASE + i] = 0; for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ pmap->pm_pml4[DMPML4I + i] = 0; pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */ m->wire_count--; atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(m); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "LU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "LU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t paddr; vm_page_t nkpg; pd_entry_t *pde, newpdir; pdp_entry_t *pdpe; mtx_assert(&kernel_map->system_mtx, MA_OWNED); /* * Return if "addr" is within the range of kernel page table pages * that were preallocated during pmap bootstrap. Moreover, leave * "kernel_vm_end" and the kernel page table as they were. * * The correctness of this action is based on the following * argument: vm_map_insert() allocates contiguous ranges of the * kernel virtual address space. It calls this function if a range * ends after "kernel_vm_end". If the kernel is mapped between * "kernel_vm_end" and "addr", then the range cannot begin at * "kernel_vm_end". In fact, its beginning address cannot be less * than the kernel. Thus, there is no immediate need to allocate * any new kernel page table pages between "kernel_vm_end" and * "KERNBASE". 
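 *
 * A tiny userspace model of the stepping that follows in
 * pmap_growkernel(): the request is rounded up to a 2MB boundary and the
 * mapped region advances one PDE at a time (skipping PDEs that are
 * already valid, and adding PDP pages when needed, which this sketch
 * ignores). The starting address is an arbitrary example value.
 */
#if 0	/* illustrative sketch; not compiled */
#include <stdint.h>
#include <stdio.h>

#define EX_NBPDR	(1UL << 21)
#define EX_PDRMASK	(EX_NBPDR - 1)
#define ex_roundup2(x, y)	(((x) + (y) - 1) & ~((y) - 1))

int
main(void)
{
	uint64_t kernel_vm_end = 0xffffffff80a00000UL;	/* example value */
	uint64_t addr = kernel_vm_end + 0x123456;	/* growth request */
	int new_pt_pages = 0;

	addr = ex_roundup2(addr, EX_NBPDR);
	while (kernel_vm_end < addr) {
		new_pt_pages++;		/* a PT page would be installed here */
		kernel_vm_end = (kernel_vm_end + EX_NBPDR) &
		    ~(uint64_t)EX_PDRMASK;
	}
	printf("grew the kernel map by %d PDE(s)\n", new_pt_pages);
	return (0);
}
#endif
/*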
*/ if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR) return; addr = roundup2(addr, NBPDR); if (addr - 1 >= kernel_map->max_offset) addr = kernel_map->max_offset; while (kernel_vm_end < addr) { pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end); if ((*pdpe & X86_PG_V) == 0) { /* We need a new PDP entry */ nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M); continue; /* try again */ } pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end); if ((*pde & X86_PG_V) != 0) { kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } continue; } nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end), VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; pde_store(pde, newpdir); kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0 0xfffffffffffffffful #define PC_FREE1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs, pv_entry_count; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. * * Returns NULL if PV entries were reclaimed from the specified pmap. 
* * We do not, however, unmap 2mpages because subsequent accesses will * allocate per-page pv entries until repromotion occurs, thereby * exacerbating the shortage of free pv entries. */ static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) { struct pch new_tail; struct pv_chunk *pc; struct md_page *pvh; pd_entry_t *pde; pmap_t pmap; pt_entry_t *pte, tpte; pt_entry_t PG_G, PG_A, PG_M, PG_RW; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint64_t inuse; int bit, field, freed; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); pmap = NULL; m_pc = NULL; PG_G = PG_A = PG_M = PG_RW = 0; SLIST_INIT(&free); TAILQ_INIT(&new_tail); pmap_delayed_invl_started(); mtx_lock(&pv_chunks_mutex); while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && SLIST_EMPTY(&free)) { TAILQ_REMOVE(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); if (pmap != pc->pc_pmap) { if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } pmap_delayed_invl_finished(); pmap_delayed_invl_started(); pmap = pc->pc_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) { RELEASE_PV_LIST_LOCK(lockp); PMAP_LOCK(pmap); } else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { pmap = NULL; TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); mtx_lock(&pv_chunks_mutex); continue; } PG_G = pmap_global_bit(pmap); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = bsfq(inuse); pv = &pc->pc_pventry[field * 64 + bit]; va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_PS) != 0) continue; pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_W) != 0) continue; tpte = pte_load_clear(pte); if ((tpte & PG_G) != 0) pmap_invalidate_page(pmap, va); m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((tpte & PG_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pmap_delayed_invl_page(m); pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, *pde, &free); freed++; } } if (freed == 0) { TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); mtx_lock(&pv_chunks_mutex); continue; } /* Every freed mapping is for a 4 KB page. */ pmap_resident_count_dec(pmap, freed); PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && pc->pc_map[2] == PC_FREE2) { PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m_pc->phys_addr); mtx_lock(&pv_chunks_mutex); break; } TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); mtx_lock(&pv_chunks_mutex); /* One freed pv entry in locked_pmap is sufficient. 
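 * Stopping as soon as locked_pmap has gained a free entry keeps this
 * routine from reclaiming more mappings than its caller actually needs.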
*/ if (pmap == locked_pmap) break; } TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); mtx_unlock(&pv_chunks_mutex); if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } pmap_delayed_invl_finished(); if (m_pc == NULL && !SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->wire_count = 1; atomic_add_int(&vm_cnt.v_wire_count, 1); } pmap_free_zero_pages(&free); return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_frees, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, 1)); PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 64; bit = idx % 64; pc->pc_map[field] |= 1ul << bit; if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || pc->pc_map[2] != PC_FREE2) { /* 98% of the time, pc is already at the head of the list. */ if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_unwire(m, PQ_NONE); vm_page_free(m); } /* * Returns a new PV entry, allocating a new PV chunk from the system when * needed. If this PV chunk allocation fails and a PV list lock pointer was * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is * returned. * * The given PV list lock may be released. 
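 *
 * (Implementation note: a pv chunk is a single page consisting of a
 * small header -- pc_pmap, the pc_map[] free bitmap, and the
 * pc_list/pc_lru linkages -- followed by _NPCPV == 168 pv entries, as
 * the CTASSERTs above enforce.  A set bit in pc_map[] marks a free
 * entry; bsfq() finds the lowest set bit, and the corresponding entry
 * is pc_pventry[field * 64 + bit].)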
*/ static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp) { int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = bsfq(pc->pc_map[field]); break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 64 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); return (pv); } } /* No free items, allocate another chunk */ m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { if (lockp == NULL) { PV_STAT(pc_chunk_tryfail++); return (NULL); } m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; mtx_lock(&pv_chunks_mutex); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); return (pv); } /* * Returns the number of one bits within the given PV chunk map. * * The erratas for Intel processors state that "POPCNT Instruction May * Take Longer to Execute Than Expected". It is believed that the * issue is the spurious dependency on the destination register. * Provide a hint to the register rename logic that the destination * value is overwritten, by clearing it, as suggested in the * optimization manual. It should be cheap for unaffected processors * as well. * * Reference numbers for erratas are * 4th Gen Core: HSD146 * 5th Gen Core: BDM85 * 6th Gen Core: SKL029 */ static int popcnt_pc_map_pq(uint64_t *map) { u_long result, tmp; __asm __volatile("xorl %k0,%k0;popcntq %2,%0;" "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;" "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0" : "=&r" (result), "=&r" (tmp) : "m" (map[0]), "m" (map[1]), "m" (map[2])); return (result); } /* * Ensure that the number of spare PV entries in the specified pmap meets or * exceeds the given count, "needed". * * The given PV list lock may be released. */ static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) { struct pch new_tail; struct pv_chunk *pc; int avail, free; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); /* * Newly allocated PV chunks must be stored in a private list until * the required number of PV chunks have been allocated. Otherwise, * reclaim_pv_chunk() could recycle one of these chunks. In * contrast, these chunks must be added to the pmap upon allocation. 
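 *
 * (For example, demoting a single 2MB mapping calls this function with
 * needed == NPTEPG - 1 == 511; since each chunk supplies _NPCPV == 168
 * entries, up to four new chunks may have to be allocated when no
 * spares exist.)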
*/ TAILQ_INIT(&new_tail); retry: avail = 0; TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { #ifndef __POPCNT__ if ((cpu_feature2 & CPUID2_POPCNT) == 0) bit_count((bitstr_t *)pc->pc_map, 0, sizeof(pc->pc_map) * NBBY, &free); else #endif free = popcnt_pc_map_pq(pc->pc_map); if (free == 0) break; avail += free; if (avail >= needed) break; } for (; avail < needed; avail += _NPCPV) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0; pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); } if (!TAILQ_EMPTY(&new_tail)) { mtx_lock(&pv_chunks_mutex); TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); mtx_unlock(&pv_chunks_mutex); } } /* * First find and then remove the pv entry for the specified pmap and virtual * address from the specified pv list. Returns the pv entry if found and NULL * otherwise. This operation can be performed on pv lists for either 4KB or * 2MB page mappings. */ static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; break; } } return (pv); } /* * After demotion from a 2MB page mapping to 512 4KB page mappings, * destroy the pv entry for the 2MB page mapping and reinstantiate the pv * entries for each of the 4KB page mappings. */ static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; struct pv_chunk *pc; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; int bit, field; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_demote_pde: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the 2mpage's pv entry for this mapping to the first * page's pv list. Once this transfer begins, the pv list lock * must not be released until the last pv entry is reinstantiated. */ pvh = pa_to_pvh(pa); va = trunc_2mpage(va); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; /* Instantiate the remaining NPTEPG - 1 pv entries. 
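 * The spares were set aside by reserve_pv_entries() before the PDE was
 * changed, so the loop below can take entries directly from the pmap's
 * chunks without allocating and without any possibility of failure.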
*/ PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1)); va_last = va + NBPDR - PAGE_SIZE; for (;;) { pc = TAILQ_FIRST(&pmap->pm_pvchunk); KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare")); for (field = 0; field < _NPCM; field++) { while (pc->pc_map[field]) { bit = bsfq(pc->pc_map[field]); pc->pc_map[field] &= ~(1ul << bit); pv = &pc->pc_pventry[field * 64 + bit]; va += PAGE_SIZE; pv->pv_va = va; m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_pde: page %p is not managed", m)); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (va == va_last) goto out; } } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } out: if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1)); } /* * After promotion from 512 4KB page mappings to a single 2MB page mapping, * replace the many pv entries for the 4KB page mappings by a single pv entry * for the 2MB page mapping. */ static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; KASSERT((pa & PDRMASK) == 0, ("pmap_pv_promote_pde: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the first page's pv entry for this mapping to the 2mpage's * pv list. Aside from avoiding the cost of a call to get_pv_entry(), * a transfer avoids the possibility that get_pv_entry() calls * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the * mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = trunc_2mpage(va); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } /* * First find and then destroy the pv entry for the specified pmap and virtual * address. This operation can be performed on pv lists for either 4KB or 2MB * page mappings. */ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } /* * Conditionally create the PV entry for a 4KB page mapping if the required * memory can be allocated without resorting to reclamation. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; return (TRUE); } else return (FALSE); } /* * Conditionally create the PV entry for a 2MB page mapping if the required * memory can be allocated without resorting to reclamation. 
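 * Returns TRUE if the pv entry was created and FALSE otherwise.  (A
 * NULL lock pointer is passed to get_pv_entry() below, so a shortage
 * of pv entries results in failure rather than reclamation.)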
*/ static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; return (TRUE); } else return (FALSE); } /* * Fills a page table page with mappings to consecutive physical pages. */ static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) { pt_entry_t *pte; for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { *pte = newpte; newpte += PAGE_SIZE; } } /* * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page * mapping is invalidated. */ static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { struct rwlock *lock; boolean_t rv; lock = NULL; rv = pmap_demote_pde_locked(pmap, pde, va, &lock); if (lock != NULL) rw_wunlock(lock); return (rv); } static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp) { pd_entry_t newpde, oldpde; pt_entry_t *firstpte, newpte; pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V; vm_paddr_t mptepa; vm_page_t mpte; struct spglist free; vm_offset_t sva; int PG_PTE_CACHE; PG_G = pmap_global_bit(pmap); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_PTE_CACHE = pmap_cache_mask(pmap, 0); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpde = *pde; KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == NULL) { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_pde: page table page for a wired mapping" " is missing")); /* * Invalidate the 2MB page mapping and return "failure" if the * mapping was never accessed or the allocation of the new * page table page fails. If the 2MB page mapping belongs to * the direct map region of the kernel's address space, then * the page allocation request specifies the highest possible * priority (VM_ALLOC_INTERRUPT). Otherwise, the priority is * normal. Page table pages are preallocated for every other * part of the kernel address space, so the direct map region * is the only part of the kernel address space that must be * handled here. */ if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { SLIST_INIT(&free); sva = trunc_2mpage(va); pmap_remove_pde(pmap, pde, sva, &free, lockp); pmap_invalidate_range(pmap, sva, sva + NBPDR - 1); pmap_free_zero_pages(&free); CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } if (va < VM_MAXUSER_ADDRESS) pmap_resident_count_inc(pmap, 1); } mptepa = VM_PAGE_TO_PHYS(mpte); firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; KASSERT((oldpde & PG_A) != 0, ("pmap_demote_pde: oldpde is missing PG_A")); KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pde: oldpde is missing PG_M")); newpte = oldpde & ~PG_PS; newpte = pmap_swap_pat(pmap, newpte); /* * If the page table page is new, initialize it. 
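 * A newly allocated page table page has wire_count == 1 and must be
 * filled; a page returned by pmap_remove_pt_page() was saved at
 * promotion time with wire_count == NPTEPG and still holds the 4KB
 * entries that existed before the promotion.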
*/ if (mpte->wire_count == 1) { mpte->wire_count = NPTEPG; pmap_fill_ptp(firstpte, newpte); } KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), ("pmap_demote_pde: firstpte and newpte map different physical" " addresses")); /* * If the mapping has changed attributes, update the page table * entries. */ if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) pmap_fill_ptp(firstpte, newpte); /* * The spare PV entries must be reserved prior to demoting the * mapping, that is, prior to changing the PDE. Otherwise, the state * of the PDE and the PV lists will be inconsistent, which can result * in reclaim_pv_chunk() attempting to remove a PV entry from the * wrong PV list and pmap_pv_demote_pde() failing to find the expected * PV entry for the 2MB page mapping that is being demoted. */ if ((oldpde & PG_MANAGED) != 0) reserve_pv_entries(pmap, NPTEPG - 1, lockp); /* * Demote the mapping. This pmap is locked. The old PDE has * PG_A set. If the old PDE has PG_RW set, it also has PG_M * set. Thus, there is no danger of a race with another * processor changing the setting of PG_A and/or PG_M between * the read above and the store below. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else pde_store(pde, newpde); /* * Invalidate a stale recursive mapping of the page table page. */ if (va >= VM_MAXUSER_ADDRESS) pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); /* * Demote the PV entry. */ if ((oldpde & PG_MANAGED) != 0) pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp); atomic_add_long(&pmap_pde_demotions, 1); CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx" " in pmap %p", va, pmap); return (TRUE); } /* * pmap_remove_kernel_pde: Remove a kernel superpage mapping. */ static void pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; vm_paddr_t mptepa; vm_page_t mpte; KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte = pmap_remove_pt_page(pmap, va); if (mpte == NULL) panic("pmap_remove_kernel_pde: Missing pt page."); mptepa = VM_PAGE_TO_PHYS(mpte); newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V; /* * Initialize the page table page. */ pagezero((void *)PHYS_TO_DMAP(mptepa)); /* * Demote the mapping. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else pde_store(pde, newpde); /* * Invalidate a stale recursive mapping of the page table page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); } /* * pmap_remove_pde: do the things to unmap a superpage in a process */ static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pd_entry_t oldpde; vm_offset_t eva, va; vm_page_t m, mpte; pt_entry_t PG_G, PG_A, PG_M, PG_RW; PG_G = pmap_global_bit(pmap); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_remove_pde: sva is not 2mpage aligned")); oldpde = pte_load_clear(pdq); if (oldpde & PG_W) pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; /* * When workaround_erratum383 is false, a promotion to a 2M * page mapping does not invalidate the 512 4K page mappings * from the TLB. Consequently, at this point, the TLB may * hold both 4K and 2M page mappings. Therefore, the entire * range of addresses must be invalidated here. 
In contrast, * when workaround_erratum383 is true, a promotion does * invalidate the 512 4K page mappings, and so a single INVLPG * suffices to invalidate the 2M page mapping. */ if ((oldpde & PG_G) != 0) { if (workaround_erratum383) pmap_invalidate_page(kernel_pmap, sva); else pmap_invalidate_range(kernel_pmap, sva, sva + NBPDR - 1); } pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); if (oldpde & PG_MANAGED) { CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); pvh = pa_to_pvh(oldpde & PG_PS_FRAME); pmap_pvh_free(pvh, pmap, sva); eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) { if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpde & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); pmap_delayed_invl_page(m); } } if (pmap == kernel_pmap) { pmap_remove_kernel_pde(pmap, pdq, sva); } else { mpte = pmap_remove_pt_page(pmap, sva); if (mpte != NULL) { pmap_resident_count_dec(pmap, 1); KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pde: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); atomic_subtract_int(&vm_cnt.v_wire_count, 1); } } return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t oldpte, PG_A, PG_M, PG_RW; vm_page_t m; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = pte_load_clear(ptq); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; pmap_resident_count_dec(pmap, 1); if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } pmap_delayed_invl_page(m); } return (pmap_unuse_pt(pmap, va, ptepde, free)); } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, struct spglist *free) { struct rwlock *lock; pt_entry_t *pte, PG_V; PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((*pde & PG_V) == 0) return; pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_V) == 0) return; lock = NULL; pmap_remove_pte(pmap, pte, va, *pde, free, &lock); if (lock != NULL) rw_wunlock(lock); pmap_invalidate_page(pmap, va); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct rwlock *lock; vm_offset_t va, va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; pt_entry_t *pte, PG_G, PG_V; struct spglist free; int anyvalid; PG_G = pmap_global_bit(pmap); PG_V = pmap_valid_bit(pmap); /* * Perform an unsynchronized read. This is, however, safe. 
*/ if (pmap->pm_stats.resident_count == 0) return; anyvalid = 0; SLIST_INIT(&free); pmap_delayed_invl_started(); PMAP_LOCK(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if (sva + PAGE_SIZE == eva) { pde = pmap_pde(pmap, sva); if (pde && (*pde & PG_PS) == 0) { pmap_remove_page(pmap, sva, pde, &free); goto out; } } lock = NULL; for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) break; pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } /* * Calculate index for next page table. */ va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); ptpaddr = *pde; /* * Weed out invalid mappings. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we removing the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == va_next && eva >= va_next) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_remove_pde(). */ if ((ptpaddr & PG_G) == 0) anyvalid = 1; pmap_remove_pde(pmap, pde, sva, &free, &lock); continue; } else if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { /* The large page mapping was destroyed. */ continue; } else ptpaddr = *pde; } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; va = va_next; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if (*pte == 0) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } if ((*pte & PG_G) == 0) anyvalid = 1; else if (va == va_next) va = sva; if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free, &lock)) { sva += PAGE_SIZE; break; } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } if (lock != NULL) rw_wunlock(lock); out: if (anyvalid) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); pmap_delayed_invl_finished(); pmap_free_zero_pages(&free); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; struct rwlock *lock; pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW; pd_entry_t *pde; vm_offset_t va; struct spglist free; int pvh_gen, md_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry: rw_wlock(lock); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; pde = pmap_pde(pmap, va); (void)pmap_demote_pde_locked(pmap, pde, va, &lock); PMAP_UNLOCK(pmap); } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pmap_resident_count_dec(pmap, 1); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" " a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); tpte = pte_load_clear(pte); if (tpte & PG_W) pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); pmap_invalidate_page(pmap, pv->pv_va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(lock); pmap_delayed_invl_wait(m); pmap_free_zero_pages(&free); } /* * pmap_protect_pde: do the things to protect a 2mpage in a process */ static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) { pd_entry_t newpde, oldpde; vm_offset_t eva, va; vm_page_t m; boolean_t anychanged; pt_entry_t PG_G, PG_M, PG_RW; PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_protect_pde: sva is not 2mpage aligned")); anychanged = FALSE; retry: oldpde = newpde = *pde; if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) vm_page_dirty(m); } if ((prot & VM_PROT_WRITE) == 0) newpde &= ~(PG_RW | PG_M); if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; if (newpde != oldpde) { if (!atomic_cmpset_long(pde, oldpde, newpde)) goto retry; if (oldpde & PG_G) { /* See pmap_remove_pde() for explanation. */ if (workaround_erratum383) pmap_invalidate_page(kernel_pmap, sva); else pmap_invalidate_range(kernel_pmap, sva, sva + NBPDR - 1); } else anychanged = TRUE; } return (anychanged); } /* * Set the physical protection on the * specified range of this map as requested. 
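 *
 * When write access is being revoked, the dirty state of every managed,
 * modified page in the range is first transferred to its vm_page via
 * vm_page_dirty(), so no modifications are lost when PG_RW and PG_M are
 * cleared.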
*/ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V; boolean_t anychanged; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == (VM_PROT_WRITE|VM_PROT_EXECUTE)) return; PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); anychanged = FALSE; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); ptpaddr = *pde; /* * Weed out invalid mappings. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we protecting the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == va_next && eva >= va_next) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_protect_pde(). */ if (pmap_protect_pde(pmap, pde, sva, prot)) anychanged = TRUE; continue; } else if (!pmap_demote_pde(pmap, pde, sva)) { /* * The large page mapping was destroyed. */ continue; } } if (va_next > eva) va_next = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { pt_entry_t obits, pbits; vm_page_t m; retry: obits = pbits = *pte; if ((pbits & PG_V) == 0) continue; if ((prot & VM_PROT_WRITE) == 0) { if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); } pbits &= ~(PG_RW | PG_M); } if ((prot & VM_PROT_EXECUTE) == 0) pbits |= pg_nx; if (pbits != obits) { if (!atomic_cmpset_long(pte, obits, pbits)) goto retry; if (obits & PG_G) pmap_invalidate_page(pmap, sva); else anychanged = TRUE; } } } if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); } /* * Tries to promote the 512, contiguous 4KB page mappings that are within a * single page table page (PTP) to a single 2MB page mapping. For promotion * to occur, two conditions must be met: (1) the 4KB page mappings must map * aligned, contiguous physical memory and (2) the 4KB page mappings must have * identical characteristics. */ static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp) { pd_entry_t newpde; pt_entry_t *firstpte, oldpte, pa, *pte; pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V; vm_page_t mpte; int PG_PTE_CACHE; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); PG_PTE_CACHE = pmap_cache_mask(pmap, 0); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Examine the first PTE in the specified PTP. Abort if this PTE is * either invalid, unused, or does not map the first 4KB physical page * within a 2MB page. 
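 * (Concretely, the test below requires PG_V and PG_A to be set and the
 * low bits of the frame, PG_FRAME & PDRMASK, to be zero, meaning that
 * the first PTE's physical address is 2MB aligned.)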
*/ firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); setpde: newpde = *firstpte; if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); return; } if ((newpde & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared without * a TLB invalidation. */ if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW)) goto setpde; newpde &= ~PG_RW; } /* * Examine each of the other PTEs in the specified PTP. Abort if this * PTE maps an unexpected 4KB physical page or does not have identical * characteristics to the first PTE. */ pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { setpte: oldpte = *pte; if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); return; } if ((oldpte & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared * without a TLB invalidation. */ if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW)) goto setpte; oldpte &= ~PG_RW; CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" " in pmap %p", (oldpte & PG_FRAME & PDRMASK) | (va & ~PDRMASK), pmap); } if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); return; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the PDE * mapping the superpage is demoted by pmap_demote_pde() or * destroyed by pmap_remove_pde(). */ mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_pde: page table page is out of range")); KASSERT(mpte->pindex == pmap_pde_pindex(va), ("pmap_promote_pde: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, mpte)) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx in pmap %p", va, pmap); return; } /* * Promote the pv entries. */ if ((newpde & PG_MANAGED) != 0) pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp); /* * Propagate the PAT index to its proper position. */ newpde = pmap_swap_pat(pmap, newpde); /* * Map the superpage. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, PG_PS | newpde); else pde_store(pde, PG_PS | newpde); atomic_add_long(&pmap_pde_promotions, 1); CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" " in pmap %p", va, pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. * * When destroying both a page table and PV entry, this function * performs the TLB invalidation before releasing the PV list * lock, so we do not need pmap_delayed_invl_page() calls here. 
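 *
 * Returns KERN_SUCCESS on success, or KERN_RESOURCE_SHORTAGE when
 * PMAP_ENTER_NOSLEEP is specified and a required page table page cannot
 * be allocated.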
*/ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind __unused) { struct rwlock *lock; pd_entry_t *pde; pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V; pt_entry_t newpte, origpte; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; boolean_t nosleep; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); va = trunc_page(va); KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va)); KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); pa = VM_PAGE_TO_PHYS(m); newpte = (pt_entry_t)(pa | PG_A | PG_V); if ((flags & VM_PROT_WRITE) != 0) newpte |= PG_M; if ((prot & VM_PROT_WRITE) != 0) newpte |= PG_RW; KASSERT((newpte & (PG_M | PG_RW)) != PG_M, ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; if ((flags & PMAP_ENTER_WIRED) != 0) newpte |= PG_W; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= PG_G; newpte |= pmap_cache_bits(pmap, m->md.pat_mode, 0); /* * Set modified bit gratuitously for writeable mappings if * the page is unmanaged. We do not want to take a fault * to do the dirty bit accounting for these mappings. */ if ((m->oflags & VPO_UNMANAGED) != 0) { if ((newpte & PG_RW) != 0) newpte |= PG_M; } else newpte |= PG_MANAGED; mpte = NULL; lock = NULL; PMAP_LOCK(pmap); /* * In the case that a page table page is not * resident, we are creating it here. */ retry: pde = pmap_pde(pmap, va); if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 || pmap_demote_pde_locked(pmap, pde, va, &lock))) { pte = pmap_pde_to_pte(pde, va); if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); mpte->wire_count++; } } else if (va < VM_MAXUSER_ADDRESS) { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), nosleep ? NULL : &lock); if (mpte == NULL && nosleep) { if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (KERN_RESOURCE_SHORTAGE); } goto retry; } else panic("pmap_enter: invalid page directory va=%#lx", va); origpte = *pte; /* * Is the specified virtual address already mapped? */ if ((origpte & PG_V) != 0) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) pmap->pm_stats.wired_count++; else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) pmap->pm_stats.wired_count--; /* * Remove the extra PT page reference. */ if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } /* * Has the physical page changed? */ opa = origpte & PG_FRAME; if (opa == pa) { /* * No, might be a protection or wiring change. 
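 * If the old and new PTEs differ only in PG_M and/or PG_A, the
 * hardware-maintained bits, the test below skips the PTE store entirely
 * by jumping to "unchanged".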
*/ if ((origpte & PG_MANAGED) != 0 && (newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) goto unchanged; goto validate; } } else { /* * Increment the counters. */ if ((newpte & PG_W) != 0) pmap->pm_stats.wired_count++; pmap_resident_count_inc(pmap, 1); } /* * Enter on the PV list if part of our managed memory. */ if ((newpte & PG_MANAGED) != 0) { pv = get_pv_entry(pmap, &lock); pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } /* * Update the PTE. */ if ((origpte & PG_V) != 0) { validate: origpte = pte_load_store(pte, newpte); opa = origpte & PG_FRAME; if (opa != pa) { if ((origpte & PG_MANAGED) != 0) { om = PHYS_TO_VM_PAGE(opa); if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(om); if ((origpte & PG_A) != 0) vm_page_aflag_set(om, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); pmap_pvh_free(&om->md, pmap, va); if ((om->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); } } else if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((origpte & PG_MANAGED) != 0) vm_page_dirty(m); /* * Although the PTE may still have PG_RW set, TLB * invalidation may nonetheless be required because * the PTE no longer has PG_M set. */ } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { /* * This PTE change does not require TLB invalidation. */ goto unchanged; } if ((origpte & PG_A) != 0) pmap_invalidate_page(pmap, va); } else pte_store(pte, newpte); unchanged: /* * If both the page table page and the reservation are fully * populated, then attempt promotion. */ if ((mpte == NULL || mpte->wire_count == NPTEPG) && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_pde(pmap, pde, va, &lock); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } /* * Tries to create a 2MB page mapping. Returns TRUE if successful and FALSE * otherwise. Fails if (1) a page table page cannot be allocated without * blocking, (2) a mapping already exists at the specified virtual address, or * (3) a pv entry cannot be allocated without reclaiming another pv entry. */ static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp) { pd_entry_t *pde, newpde; pt_entry_t PG_V; vm_page_t mpde; struct spglist free; PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde)); pde = &pde[pmap_pde_index(va)]; if ((*pde & PG_V) != 0) { KASSERT(mpde->wire_count > 1, ("pmap_enter_pde: mpde's wire count is too low")); mpde->wire_count--; CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) | PG_PS | PG_V; if ((m->oflags & VPO_UNMANAGED) == 0) { newpde |= PG_MANAGED; /* * Abort this mapping if its PV entry could not be created. 
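 * (pmap_pv_insert_pde() never reclaims pv entries, so it can fail under
 * a pv entry shortage; the reference on the page directory page
 * obtained from pmap_allocpde() is then dropped via pmap_unwire_ptp().)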
*/ if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m), lockp)) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, mpde, &free)) { /* * Although "va" is not mapped, paging- * structure caches could nonetheless have * entries that refer to the freed page table * pages. Invalidate those entries. */ pmap_invalidate_page(pmap, va); pmap_free_zero_pages(&free); } CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } } if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; if (va < VM_MAXUSER_ADDRESS) newpde |= PG_U; /* * Increment counters. */ pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); /* * Map the superpage. */ pde_store(pde, newpde); atomic_add_long(&pmap_pde_mappings, 1); CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" " in pmap %p", va, pmap); return (TRUE); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { struct rwlock *lock; vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; lock = NULL; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & PDRMASK) == 0 && va + NBPDR <= end && m->psind == 1 && pmap_ps_enabled(pmap) && pmap_enter_pde(pmap, va, m, prot, &lock)) m = &m[NBPDR / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock); m = TAILQ_NEXT(m, listq); } if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { struct rwlock *lock; lock = NULL; PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) { struct spglist free; pt_entry_t *pte, PG_V; vm_paddr_t pa; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { vm_pindex_t ptepindex; pd_entry_t *ptepa; /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); if (mpte && (mpte->pindex == ptepindex)) { mpte->wire_count++; } else { /* * Get the page directory entry */ ptepa = pmap_pde(pmap, va); /* * If the page table page is mapped, we just increment * the hold count, and activate it. 
Otherwise, we * attempt to allocate a page table page. If this * attempt fails, we don't retry. Instead, we give up. */ if (ptepa && (*ptepa & PG_V) != 0) { if (*ptepa & PG_PS) return (NULL); mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); mpte->wire_count++; } else { /* * Pass NULL instead of the PV list lock * pointer, because we don't intend to sleep. */ mpte = _pmap_allocpte(pmap, ptepindex, NULL); if (mpte == NULL) return (mpte); } } pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); pte = &pte[pmap_pte_index(va)]; } else { mpte = NULL; pte = vtopte(va); } if (*pte) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, mpte, &free)) { /* * Although "va" is not mapped, paging- * structure caches could nonetheless have * entries that refer to the freed page table * pages. Invalidate those entries. */ pmap_invalidate_page(pmap, va); pmap_free_zero_pages(&free); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap_resident_count_inc(pmap, 1); pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 0); if ((prot & VM_PROT_EXECUTE) == 0) pa |= pg_nx; /* * Now validate mapping with RO protection */ if ((m->oflags & VPO_UNMANAGED) != 0) pte_store(pte, pa | PG_V | PG_U); else pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); return (mpte); } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); invlpg(va); return ((void *)crashdumpmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { pd_entry_t *pde; pt_entry_t PG_A, PG_M, PG_RW, PG_V; vm_paddr_t pa, ptepa; vm_page_t p, pdpg; int pat_mode; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { if (!pmap_ps_enabled(pmap)) return; if (!vm_object_populate(object, pindex, pindex + atop(size))) return; p = vm_page_lookup(object, pindex); KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); pat_mode = p->md.pat_mode; /* * Abort the mapping if the first page is not physically * aligned to a 2MB page boundary. */ ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) return; /* * Skip the first page. Abort the mapping if the rest of * the pages are not physically contiguous or have differing * memory attributes. */ p = TAILQ_NEXT(p, listq); for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; pa += PAGE_SIZE) { KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); if (pa != VM_PAGE_TO_PHYS(p) || pat_mode != p->md.pat_mode) return; p = TAILQ_NEXT(p, listq); } /* * Map using 2MB pages. Since "ptepa" is 2M aligned and * "size" is a multiple of 2M, adding the PAT setting to "pa" * will not affect the termination of this loop. 
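 * (The cache bits returned by pmap_cache_bits() all lie in the low
 * 21 bits, below the 2MB frame, so "pa" still advances through exactly
 * size / NBPDR iterations.)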
*/ PMAP_LOCK(pmap); for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1); pa < ptepa + size; pa += NBPDR) { pdpg = pmap_allocpde(pmap, addr, NULL); if (pdpg == NULL) { /* * The creation of mappings below is only an * optimization. If a page directory page * cannot be allocated without blocking, * continue on to the next mapping rather than * blocking. */ addr += NBPDR; continue; } pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); pde = &pde[pmap_pde_index(addr)]; if ((*pde & PG_V) == 0) { pde_store(pde, pa | PG_PS | PG_M | PG_A | PG_U | PG_RW | PG_V); pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); atomic_add_long(&pmap_pde_mappings, 1); } else { /* Continue on if the PDE is already valid. */ pdpg->wire_count--; KASSERT(pdpg->wire_count > 0, ("pmap_object_init_pt: missing reference " "to page directory page, va: 0x%lx", addr)); } addr += NBPDR; } PMAP_UNLOCK(pmap); } } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware * feature, so there is no need to invalidate any TLB entries. * Since pmap_demote_pde() for the wired entry must never fail, * pmap_delayed_invl_started()/finished() calls around the * function are not needed. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte, PG_V; PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if ((*pde & PG_V) == 0) continue; if ((*pde & PG_PS) != 0) { if ((*pde & PG_W) == 0) panic("pmap_unwire: pde %#jx is missing PG_W", (uintmax_t)*pde); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == va_next && eva >= va_next) { atomic_clear_long(pde, PG_W); pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; continue; } else if (!pmap_demote_pde(pmap, pde, sva)) panic("pmap_unwire: demotion failed"); } if (va_next > eva) va_next = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if ((*pte & PG_V) == 0) continue; if ((*pte & PG_W) == 0) panic("pmap_unwire: pte %#jx is missing PG_W", (uintmax_t)*pte); /* * PG_W must be cleared atomically. Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. */ atomic_clear_long(pte, PG_W); pmap->pm_stats.wired_count--; } } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. 
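 *
 * In practice it copies nothing unless dst_addr == src_addr and both
 * pmaps are of the same type; pmaps that must emulate accessed/dirty
 * bits (certain EPT configurations) are skipped entirely, as the
 * comment below explains.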
*/ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct rwlock *lock; struct spglist free; vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t va_next; pt_entry_t PG_A, PG_M, PG_V; if (dst_addr != src_addr) return; if (dst_pmap->pm_type != src_pmap->pm_type) return; /* * EPT page table entries that require emulation of A/D bits are * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit * (aka EPT_PG_EXECUTE) could still be set. Since some EPT * implementations flag an EPT misconfiguration for exec-only * mappings we skip this function entirely for emulated pmaps. */ if (pmap_emulate_ad_bits(dst_pmap)) return; lock = NULL; if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } PG_A = pmap_accessed_bit(dst_pmap); PG_M = pmap_modified_bit(dst_pmap); PG_V = pmap_valid_bit(dst_pmap); for (addr = src_addr; addr < end_addr; addr = va_next) { pt_entry_t *src_pte, *dst_pte; vm_page_t dstmpde, dstmpte, srcmpte; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t srcptepaddr, *pde; KASSERT(addr < UPT_MIN_ADDRESS, ("pmap_copy: invalid to pmap_copy page tables")); pml4e = pmap_pml4e(src_pmap, addr); if ((*pml4e & PG_V) == 0) { va_next = (addr + NBPML4) & ~PML4MASK; if (va_next < addr) va_next = end_addr; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, addr); if ((*pdpe & PG_V) == 0) { va_next = (addr + NBPDP) & ~PDPMASK; if (va_next < addr) va_next = end_addr; continue; } va_next = (addr + NBPDR) & ~PDRMASK; if (va_next < addr) va_next = end_addr; pde = pmap_pdpe_to_pde(pdpe, addr); srcptepaddr = *pde; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) continue; dstmpde = pmap_allocpde(dst_pmap, addr, NULL); if (dstmpde == NULL) break; pde = (pd_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde)); pde = &pde[pmap_pde_index(addr)]; if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr & PG_PS_FRAME, &lock))) { *pde = srcptepaddr & ~PG_W; pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE); atomic_add_long(&pmap_pde_mappings, 1); } else dstmpde->wire_count--; continue; } srcptepaddr &= PG_FRAME; srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); KASSERT(srcmpte->wire_count > 0, ("pmap_copy: source page table page is unused")); if (va_next > end_addr) va_next = end_addr; src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); src_pte = &src_pte[pmap_pte_index(addr)]; dstmpte = NULL; while (addr < va_next) { pt_entry_t ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { if (dstmpte != NULL && dstmpte->pindex == pmap_pde_pindex(addr)) dstmpte->wire_count++; else if ((dstmpte = pmap_allocpte(dst_pmap, addr, NULL)) == NULL) goto out; dst_pte = (pt_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); dst_pte = &dst_pte[pmap_pte_index(addr)]; if (*dst_pte == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), &lock)) { /* * Clear the wired, modified, and * accessed (referenced) bits * during the copy. */ *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); pmap_resident_count_inc(dst_pmap, 1); } else { SLIST_INIT(&free); if (pmap_unwire_ptp(dst_pmap, addr, dstmpte, &free)) { /* * Although "addr" is not * mapped, paging-structure * caches could nonetheless * have entries that refer to * the freed page table pages. 
* Invalidate those entries. */ pmap_invalidate_page(dst_pmap, addr); pmap_free_zero_pages(&free); } goto out; } if (dstmpte->wire_count >= srcmpte->wire_count) break; } addr += PAGE_SIZE; src_pte++; } } out: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } /* * Zero the specified hardware page. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * Zero an area within a single hardware page. off and size must not * cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); if (off == 0 && size == PAGE_SIZE) pagezero((void *)va); else bzero((char *)va + off, size); } /* * Copy 1 specified hardware page to another. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); pagecopy((void *)src, (void *)dst); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_page_t pages[2]; vm_offset_t vaddr[2], a_pg_offset, b_pg_offset; int cnt; boolean_t mapped; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; pages[0] = ma[a_offset >> PAGE_SHIFT]; b_pg_offset = b_offset & PAGE_MASK; pages[1] = mb[b_offset >> PAGE_SHIFT]; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); cnt = min(cnt, PAGE_SIZE - b_pg_offset); mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE); a_cp = (char *)vaddr[0] + a_pg_offset; b_cp = (char *)vaddr[1] + b_pg_offset; bcopy(a_cp, b_cp, cnt); if (__predict_false(mapped)) pmap_unmap_io_transient(pages, vaddr, 2, FALSE); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_runlock(lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired.
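* Both 4KB mappings on the page's own pv list and 2MB mappings on the pv * list of the containing superpage are checked for PG_W.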
*/ int pmap_page_wired_mappings(vm_page_t m) { struct rwlock *lock; struct md_page *pvh; pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; int count, md_gen, pvh_gen; if ((m->oflags & VPO_UNMANAGED) != 0) return (0); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: count = 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pde(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } } rw_runlock(lock); return (count); } /* * Returns TRUE if the given page is mapped individually or as part of * a 2mpage. Otherwise, returns FALSE. */ boolean_t pmap_page_is_mapped(vm_page_t m) { struct rwlock *lock; boolean_t rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); rw_runlock(lock); return (rv); } /* * Destroy all managed, non-wired mappings in the given user-space * pmap. This pmap cannot be active on any processor besides the * caller. * * This function cannot be applied to the kernel pmap. Moreover, it * is not intended for general use. It is only to be used during * process termination. Consequently, it can be implemented in ways * that make it faster than pmap_remove(). First, it can more quickly * destroy mappings by iterating over the pmap's collection of PV * entries, rather than searching the page table. Second, it doesn't * have to test and clear the page table entries atomically, because * no processor is currently accessing the user address space. In * particular, a page table entry's dirty bit won't change state once * this function starts. */ void pmap_remove_pages(pmap_t pmap) { pd_entry_t ptepde; pt_entry_t *pte, tpte; pt_entry_t PG_M, PG_RW, PG_V; struct spglist free; vm_page_t m, mpte, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct rwlock *lock; int64_t bit; uint64_t inuse, bitmask; int allfree, field, freed, idx; boolean_t superpage; vm_paddr_t pa; /* * Assert that the given pmap is only active on the current * CPU. Unfortunately, we cannot block another CPU from * activating the pmap while this function is executing. 
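* The INVARIANTS block below approximates that assertion by checking that no * CPU other than the caller currently has the pmap marked active.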
*/ KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap)); #ifdef INVARIANTS { cpuset_t other_cpus; other_cpus = all_cpus; critical_enter(); CPU_CLR(PCPU_GET(cpuid), &other_cpus); CPU_AND(&other_cpus, &pmap->pm_active); critical_exit(); KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap)); } #endif lock = NULL; PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); SLIST_INIT(&free); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; freed = 0; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = bsfq(inuse); bitmask = 1UL << bit; idx = field * 64 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = pmap_pdpe(pmap, pv->pv_va); ptepde = *pte; pte = pmap_pdpe_to_pde(pte, pv->pv_va); tpte = *pte; if ((tpte & (PG_PS | PG_V)) == PG_V) { superpage = FALSE; ptepde = tpte; pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & PG_FRAME); pte = &pte[pmap_pte_index(pv->pv_va)]; tpte = *pte; } else { /* * Keep track whether 'tpte' is a * superpage explicitly instead of * relying on PG_PS being set. * * This is because PG_PS is numerically * identical to PG_PTE_PAT and thus a * regular page could be mistaken for * a superpage. */ superpage = TRUE; } if ((tpte & PG_V) == 0) { panic("bad pte va %lx pte %lx", pv->pv_va, tpte); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { allfree = 0; continue; } if (superpage) pa = tpte & PG_PS_FRAME; else pa = tpte & PG_FRAME; m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pte_clear(pte); /* * Update the vm_page_t clean/reference bits. 
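* A writable, modified superpage dirties every 4KB page that it maps; a 4KB * mapping dirties only its own page.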
*/ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (superpage) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } else vm_page_dirty(m); } CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); /* Mark free */ pc->pc_map[field] |= bitmask; if (superpage) { pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); pvh = pa_to_pvh(tpte & PG_PS_FRAME); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) if ((mt->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { pmap_resident_count_dec(pmap, 1); KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pages: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, &free, FALSE); atomic_subtract_int(&vm_cnt.v_wire_count, 1); } } else { pmap_resident_count_dec(pmap, 1); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((m->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); freed++; } } PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } if (lock != NULL) rw_wunlock(lock); pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } static boolean_t pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) { struct rwlock *lock; pv_entry_t pv; struct md_page *pvh; pt_entry_t *pte, mask; pt_entry_t PG_A, PG_M, PG_RW, PG_V; pmap_t pmap; int md_gen, pvh_gen; boolean_t rv; rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va); mask = 0; if (modified) { PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); mask |= PG_RW | PG_M; } if (accessed) { PG_A = pmap_accessed_bit(pmap); PG_V = pmap_valid_bit(pmap); mask |= PG_V | PG_A; } rv = (*pte & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pde(pmap, pv->pv_va); mask = 0; if (modified) { PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); mask |= PG_RW | PG_M; } if (accessed) { PG_A = pmap_accessed_bit(pmap); PG_V = pmap_valid_bit(pmap); mask |= PG_V | PG_A; } rv = (*pte & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } } out: rw_runlock(lock); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. 
*/ boolean_t pmap_is_modified(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); return (pmap_page_test_mappings(m, FALSE, TRUE)); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; pt_entry_t *pte, PG_V; boolean_t rv; PG_V = pmap_valid_bit(pmap); rv = FALSE; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) { pte = pmap_pde_to_pte(pde, addr); rv = (*pte & PG_V) == 0; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); return (pmap_page_test_mappings(m, TRUE, FALSE)); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pmap_t pmap; struct rwlock *lock; pv_entry_t next_pv, pv; pd_entry_t *pde; pt_entry_t oldpte, *pte, PG_M, PG_RW; vm_offset_t va; int pvh_gen, md_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry_pv_loop: rw_wlock(lock); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_RW) != 0) (void)pmap_demote_pde_locked(pmap, pde, va, &lock); KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); retry: oldpte = *pte; if (oldpte & PG_RW) { if (!atomic_cmpset_long(pte, oldpte, oldpte & ~(PG_RW | PG_M))) goto retry; if ((oldpte & PG_M) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); vm_page_aflag_clear(m, PGA_WRITEABLE); pmap_delayed_invl_wait(m); } static __inline boolean_t safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) { if (!pmap_emulate_ad_bits(pmap)) return (TRUE); KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type)); /* * XWR = 010 or 110 will cause an unconditional EPT misconfiguration * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared * if the EPT_PG_WRITE bit is set. */ if ((pte & EPT_PG_WRITE) != 0) return (FALSE); /* * XWR = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set. */ if ((pte & EPT_PG_EXECUTE) == 0 || ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0)) return (TRUE); else return (FALSE); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). * * A DI block is not needed within this function, because * invalidations are performed before the PV list lock is * released. */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; struct rwlock *lock; pd_entry_t oldpde, *pde; pt_entry_t *pte, PG_A, PG_M, PG_RW; vm_offset_t va; vm_paddr_t pa; int cleared, md_gen, not_cleared, pvh_gen; struct spglist free; boolean_t demoted; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); SLIST_INIT(&free); cleared = 0; pa = VM_PAGE_TO_PHYS(m); lock = PHYS_TO_PV_LIST_LOCK(pa); pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : pa_to_pvh(pa); rw_wlock(lock); retry: not_cleared = 0; if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, pv->pv_va); oldpde = *pde; if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Although "oldpde" is mapping a 2MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((oldpde & PG_A) != 0) { /* * Since this reference bit is shared by 512 4KB * pages, it should not be cleared every time it is * tested. Apply a simple "hash" function on the * physical page number, the virtual superpage number, * and the pmap address to select one 4KB page out of * the 512 on which testing the reference bit will * result in clearing that reference bit. This * function is designed to avoid the selection of the * same 4KB page for every 2MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && (oldpde & PG_W) == 0) { if (safe_to_clear_referenced(pmap, oldpde)) { atomic_clear_long(pde, PG_A); pmap_invalidate_page(pmap, pv->pv_va); demoted = FALSE; } else if (pmap_demote_pde_locked(pmap, pde, pv->pv_va, &lock)) { /* * Remove the mapping to a single page * so that a subsequent access may * repromote. Since the underlying * page table page is fully populated, * this removal never frees a page * table page. */ demoted = TRUE; va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); pte = pmap_pde_to_pte(pde, va); pmap_remove_pte(pmap, pte, va, *pde, NULL, &lock); pmap_invalidate_page(pmap, va); } else demoted = TRUE; if (demoted) { /* * The superpage mapping was removed * entirely and therefore 'pv' is no * longer valid. */ if (pvf == pv) pvf = NULL; pv = NULL; } cleared++; KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. 
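* Moving the just-examined entry to the tail spreads the reference-bit * sampling over all of the page's mappings across successive calls.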
*/ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; } if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced: found a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((*pte & PG_A) != 0) { if (safe_to_clear_referenced(pmap, *pte)) { atomic_clear_long(pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); cleared++; } else if ((*pte & PG_W) == 0) { /* * Wired pages cannot be paged out so * doing accessed bit emulation for * them is wasted effort. We do the * hard work for unwired pages only. */ pmap_remove_pte(pmap, pte, pv->pv_va, *pde, &free, &lock); pmap_invalidate_page(pmap, pv->pv_va); cleared++; if (pvf == pv) pvf = NULL; pv = NULL; KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + not_cleared < PMAP_TS_REFERENCED_MAX); out: rw_wunlock(lock); pmap_free_zero_pages(&free); return (cleared + not_cleared); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { struct rwlock *lock; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t oldpde, *pde; pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V; vm_offset_t va, va_next; vm_page_t m; boolean_t anychanged; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; /* * A/D bit emulation requires an alternate code path when clearing * the modified and accessed bits below. Since this function is * advisory in nature we skip it entirely for pmaps that require * A/D bit emulation. 
*/ if (pmap_emulate_ad_bits(pmap)) return; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); anychanged = FALSE; pmap_delayed_invl_started(); PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); oldpde = *pde; if ((oldpde & PG_V) == 0) continue; else if ((oldpde & PG_PS) != 0) { if ((oldpde & PG_MANAGED) == 0) continue; lock = NULL; if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { if (lock != NULL) rw_wunlock(lock); /* * The large page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Since the underlying page * table page is fully populated, this removal never * frees a page table page. */ if ((oldpde & PG_W) == 0) { pte = pmap_pde_to_pte(pde, sva); KASSERT((*pte & PG_V) != 0, ("pmap_advise: invalid PTE")); pmap_remove_pte(pmap, pte, sva, *pde, NULL, &lock); anychanged = TRUE; } if (lock != NULL) rw_wunlock(lock); } if (va_next > eva) va_next = eva; va = va_next; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) goto maybe_invlrng; else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); vm_page_dirty(m); } atomic_clear_long(pte, PG_M | PG_A); } else if ((*pte & PG_A) != 0) atomic_clear_long(pte, PG_A); else goto maybe_invlrng; if ((*pte & PG_G) != 0) { if (va == va_next) va = sva; } else anychanged = TRUE; continue; maybe_invlrng: if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); pmap_delayed_invl_finished(); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; pmap_t pmap; pv_entry_t next_pv, pv; pd_entry_t oldpde, *pde; pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V; struct rwlock *lock; vm_offset_t va; int md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->aflags & PGA_WRITEABLE) == 0) return; pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_wlock(lock); restart: TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, va); oldpde = *pde; if ((oldpde & PG_RW) != 0) { if (pmap_demote_pde_locked(pmap, pde, va, &lock)) { if ((oldpde & PG_W) == 0) { /* * Write protect the mapping to a * single page so that a subsequent * write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); pte = pmap_pde_to_pte(pde, va); oldpte = *pte; if ((oldpte & PG_V) != 0) { while (!atomic_cmpset_long(pte, oldpte, oldpte & ~(PG_M | PG_RW))) oldpte = *pte; vm_page_dirty(m); pmap_invalidate_page(pmap, va); } } } } PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" " a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { atomic_clear_long(pte, PG_M); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); } /* * Miscellaneous support routines follow */ /* Adjust the cache mode for a 4KB page mapped via a PTE. */ static __inline void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask) { u_int opte, npte; /* * The cache mode bits are all in the low 32-bits of the * PTE, so we can just spin on updating the low 32-bits. */ do { opte = *(u_int *)pte; npte = opte & ~mask; npte |= cache_bits; } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); } /* Adjust the cache mode for a 2MB page mapped via a PDE. */ static __inline void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask) { u_int opde, npde; /* * The cache mode bits are all in the low 32-bits of the * PDE, so we can just spin on updating the low 32-bits. */ do { opde = *(u_int *)pde; npde = opde & ~mask; npde |= cache_bits; } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) { struct pmap_preinit_mapping *ppim; vm_offset_t va, offset; vm_size_t tmpsize; int i; offset = pa & PAGE_MASK; size = round_page(offset + size); pa = trunc_page(pa); if (!pmap_initialized) { va = 0; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) { ppim->pa = pa; ppim->sz = size; ppim->mode = mode; ppim->va = virtual_avail; virtual_avail += size; va = ppim->va; break; } } if (va == 0) panic("%s: too many preinit mappings", __func__); } else { /* * If we have a preinit mapping, re-use it. 
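* A preinit mapping is one of the fixed slots recorded by the * !pmap_initialized branch above; it is matched on physical address, size, * and caching mode.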
*/ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->pa == pa && ppim->sz == size && ppim->mode == mode) return ((void *)(ppim->va + offset)); } /* * If the specified range of physical addresses fits within * the direct map window, use the direct map. */ if (pa < dmaplimit && pa + size < dmaplimit) { va = PHYS_TO_DMAP(pa); if (!pmap_change_attr(va, size, mode)) return ((void *)(va + offset)); } va = kva_alloc(size); if (va == 0) panic("%s: Couldn't allocate KVA", __func__); } for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); pmap_invalidate_range(kernel_pmap, va, va + tmpsize); pmap_invalidate_cache_range(va, va + tmpsize, FALSE); return ((void *)(va + offset)); } void * pmap_mapdev(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); } void pmap_unmapdev(vm_offset_t va, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t offset; int i; /* If we gave a direct map region in pmap_mapdev, do nothing */ if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) return; offset = va & PAGE_MASK; size = round_page(offset + size); va = trunc_page(va); for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == va && ppim->sz == size) { if (pmap_initialized) return; ppim->pa = 0; ppim->va = 0; ppim->sz = 0; ppim->mode = 0; if (va + size == virtual_avail) virtual_avail = va; return; } } if (pmap_initialized) kva_free(va, size); } /* * Tries to demote a 1GB page mapping. */ static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va) { pdp_entry_t newpdpe, oldpdpe; pd_entry_t *firstpde, newpde, *pde; pt_entry_t PG_A, PG_M, PG_RW, PG_V; vm_paddr_t mpdepa; vm_page_t mpde; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpdpe = *pdpe; KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); if ((mpde = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } mpdepa = VM_PAGE_TO_PHYS(mpde); firstpde = (pd_entry_t *)PHYS_TO_DMAP(mpdepa); newpdpe = mpdepa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V; KASSERT((oldpdpe & PG_A) != 0, ("pmap_demote_pdpe: oldpdpe is missing PG_A")); KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pdpe: oldpdpe is missing PG_M")); newpde = oldpdpe; /* * Initialize the page directory page. */ for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { *pde = newpde; newpde += NBPDR; } /* * Demote the mapping. */ *pdpe = newpdpe; /* * Invalidate a stale recursive mapping of the page directory page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va)); pmap_pdpe_demotions++; CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" " in pmap %p", va, pmap); return (TRUE); } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pat_mode = ma; /* * If "m" is a normal page, update its direct mapping. This update * can be relied upon to perform any cache operations that are * required for data coherence. 
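* Fictitious pages (typically device memory) are skipped here; only ordinary * pages have their direct map entry updated.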
*/ if ((m->flags & PG_FICTITIOUS) == 0 && pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, m->md.pat_mode)) panic("memory attribute change on the direct map failed"); } /* * Changes the specified virtual address range's memory type to that given by * the parameter "mode". The specified virtual address range must be * completely contained within either the direct map or the kernel map. If * the virtual address range is contained within the kernel map, then the * memory type for each of the corresponding ranges of the direct map is also * changed. (The corresponding ranges of the direct map are those ranges that * map the same physical pages as the specified virtual address range.) These * changes to the direct map are necessary because Intel describes the * behavior of their processors as "undefined" if two or more mappings to the * same physical page have different memory types. * * Returns zero if the change completed successfully, and either EINVAL or * ENOMEM if the change failed. Specifically, EINVAL is returned if some part * of the virtual address range was not mapped, and ENOMEM is returned if * there was insufficient memory available to complete the change. In the * latter case, the memory type may have been changed on some part of the * virtual address range or the direct map. */ int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) { int error; PMAP_LOCK(kernel_pmap); error = pmap_change_attr_locked(va, size, mode); PMAP_UNLOCK(kernel_pmap); return (error); } static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) { vm_offset_t base, offset, tmpva; vm_paddr_t pa_start, pa_end, pa_end1; pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte; int cache_bits_pte, cache_bits_pde, error; boolean_t changed; PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); /* * Only supported on kernel virtual addresses, including the direct * map but excluding the recursive map. */ if (base < DMAP_MIN_ADDRESS) return (EINVAL); cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1); cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0); changed = FALSE; /* * Pages that aren't mapped aren't supported. Also break down 2MB pages * into 4KB pages if required. */ for (tmpva = base; tmpva < base + size; ) { pdpe = pmap_pdpe(kernel_pmap, tmpva); if (pdpe == NULL || *pdpe == 0) return (EINVAL); if (*pdpe & PG_PS) { /* * If the current 1GB page already has the required * memory type, then we need not demote this page. Just * increment tmpva to the next 1GB page frame. */ if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_1gpage(tmpva) + NBPDP; continue; } /* * If the current offset aligns with a 1GB page frame * and there is at least 1GB left within the range, then * we need not break down this page into 2MB pages. */ if ((tmpva & PDPMASK) == 0 && tmpva + PDPMASK < base + size) { tmpva += NBPDP; continue; } if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva)) return (ENOMEM); } pde = pmap_pdpe_to_pde(pdpe, tmpva); if (*pde == 0) return (EINVAL); if (*pde & PG_PS) { /* * If the current 2MB page already has the required * memory type, then we need not demote this page. Just * increment tmpva to the next 2MB page frame. */ if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_2mpage(tmpva) + NBPDR; continue; } /* * If the current offset aligns with a 2MB page frame * and there is at least 2MB left within the range, then * we need not break down this page into 4KB pages. 
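* For example, when tmpva is 2MB-aligned and base + size extends at least * 2MB beyond it, the test below succeeds, the 2MB mapping is left intact, and * its attribute is rewritten in the second pass.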
*/ if ((tmpva & PDRMASK) == 0 && tmpva + PDRMASK < base + size) { tmpva += NBPDR; continue; } if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) return (ENOMEM); } pte = pmap_pde_to_pte(pde, tmpva); if (*pte == 0) return (EINVAL); tmpva += PAGE_SIZE; } error = 0; /* * Ok, all the pages exist, so run through them updating their * cache mode if required. */ pa_start = pa_end = 0; for (tmpva = base; tmpva < base + size; ) { pdpe = pmap_pdpe(kernel_pmap, tmpva); if (*pdpe & PG_PS) { if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) { pmap_pde_attr(pdpe, cache_bits_pde, X86_PG_PDE_CACHE); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*pdpe & PG_PS_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *pdpe & PG_PS_FRAME; pa_end = pa_start + NBPDP; } else if (pa_end == (*pdpe & PG_PS_FRAME)) pa_end += NBPDP; else { /* Run ended, update direct map. */ error = pmap_change_attr_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, mode); if (error != 0) break; /* Start physical address run. */ pa_start = *pdpe & PG_PS_FRAME; pa_end = pa_start + NBPDP; } } tmpva = trunc_1gpage(tmpva) + NBPDP; continue; } pde = pmap_pdpe_to_pde(pdpe, tmpva); if (*pde & PG_PS) { if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) { pmap_pde_attr(pde, cache_bits_pde, X86_PG_PDE_CACHE); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*pde & PG_PS_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *pde & PG_PS_FRAME; pa_end = pa_start + NBPDR; } else if (pa_end == (*pde & PG_PS_FRAME)) pa_end += NBPDR; else { /* Run ended, update direct map. */ error = pmap_change_attr_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, mode); if (error != 0) break; /* Start physical address run. */ pa_start = *pde & PG_PS_FRAME; pa_end = pa_start + NBPDR; } } tmpva = trunc_2mpage(tmpva) + NBPDR; } else { pte = pmap_pde_to_pte(pde, tmpva); if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) { pmap_pte_attr(pte, cache_bits_pte, X86_PG_PTE_CACHE); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*pte & PG_PS_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *pte & PG_FRAME; pa_end = pa_start + PAGE_SIZE; } else if (pa_end == (*pte & PG_FRAME)) pa_end += PAGE_SIZE; else { /* Run ended, update direct map. */ error = pmap_change_attr_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, mode); if (error != 0) break; /* Start physical address run. */ pa_start = *pte & PG_FRAME; pa_end = pa_start + PAGE_SIZE; } } tmpva += PAGE_SIZE; } } if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) { pa_end1 = MIN(pa_end, dmaplimit); if (pa_start != pa_end1) error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start), pa_end1 - pa_start, mode); } /* * Flush CPU caches if required to make sure any data isn't cached that * shouldn't be, etc. */ if (changed) { pmap_invalidate_range(kernel_pmap, base, tmpva); pmap_invalidate_cache_range(base, tmpva, FALSE); } return (error); } /* * Demotes any mapping within the direct map region that covers more than the * specified range of physical addresses. This range's size must be a power * of two and its starting address must be a multiple of its size. Since the * demotion does not change any attributes of the mapping, a TLB invalidation * is not mandatory. The caller may, however, request a TLB invalidation. 
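* For example, demoting a 4KB range that is currently covered by a 1GB * direct map mapping first breaks the 1GB mapping into 2MB mappings and then * breaks the covering 2MB mapping into 4KB mappings.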
*/ void pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate) { pdp_entry_t *pdpe; pd_entry_t *pde; vm_offset_t va; boolean_t changed; if (len == 0) return; KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2")); KASSERT((base & (len - 1)) == 0, ("pmap_demote_DMAP: base is not a multiple of len")); if (len < NBPDP && base < dmaplimit) { va = PHYS_TO_DMAP(base); changed = FALSE; PMAP_LOCK(kernel_pmap); pdpe = pmap_pdpe(kernel_pmap, va); if ((*pdpe & X86_PG_V) == 0) panic("pmap_demote_DMAP: invalid PDPE"); if ((*pdpe & PG_PS) != 0) { if (!pmap_demote_pdpe(kernel_pmap, pdpe, va)) panic("pmap_demote_DMAP: PDPE failed"); changed = TRUE; } if (len < NBPDR) { pde = pmap_pdpe_to_pde(pdpe, va); if ((*pde & X86_PG_V) == 0) panic("pmap_demote_DMAP: invalid PDE"); if ((*pde & PG_PS) != 0) { if (!pmap_demote_pde(kernel_pmap, pde, va)) panic("pmap_demote_DMAP: PDE failed"); changed = TRUE; } } if (changed && invalidate) pmap_invalidate_page(kernel_pmap, va); PMAP_UNLOCK(kernel_pmap); } } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pd_entry_t *pdep; pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V; vm_paddr_t pa; int val; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK(pmap); retry: pdep = pmap_pde(pmap, addr); if (pdep != NULL && (*pdep & PG_V)) { if (*pdep & PG_PS) { pte = *pdep; /* Compute the physical address of the 4KB page. */ pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & PG_FRAME; val = MINCORE_SUPER; } else { pte = *pmap_pde_to_pte(pdep, addr); pa = pte & PG_FRAME; val = 0; } } else { pte = 0; pa = 0; val = 0; } if ((pte & PG_V) != 0) { val |= MINCORE_INCORE; if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((pte & PG_A) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. 
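* vm_page_pa_tryrelock() returns non-zero when it had to drop the pmap lock * in order to switch page locks, in which case the translation is looked up * again.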
*/ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } static uint64_t pmap_pcid_alloc(pmap_t pmap, u_int cpuid) { uint32_t gen, new_gen, pcid_next; CRITICAL_ASSERT(curthread); gen = PCPU_GET(pcid_gen); if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN || pmap->pm_pcids[cpuid].pm_gen == gen) return (CR3_PCID_SAVE); pcid_next = PCPU_GET(pcid_next); KASSERT(pcid_next <= PMAP_PCID_OVERMAX, ("cpu %d pcid_next %#x", cpuid, pcid_next)); if (pcid_next == PMAP_PCID_OVERMAX) { new_gen = gen + 1; if (new_gen == 0) new_gen = 1; PCPU_SET(pcid_gen, new_gen); pcid_next = PMAP_PCID_KERN + 1; } else { new_gen = gen; } pmap->pm_pcids[cpuid].pm_pcid = pcid_next; pmap->pm_pcids[cpuid].pm_gen = new_gen; PCPU_SET(pcid_next, pcid_next + 1); return (0); } void pmap_activate_sw(struct thread *td) { pmap_t oldpmap, pmap; uint64_t cached, cr3; register_t rflags; u_int cpuid; oldpmap = PCPU_GET(curpmap); pmap = vmspace_pmap(td->td_proc->p_vmspace); if (oldpmap == pmap) return; cpuid = PCPU_GET(cpuid); #ifdef SMP CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_SET(cpuid, &pmap->pm_active); #endif cr3 = rcr3(); if (pmap_pcid_enabled) { cached = pmap_pcid_alloc(pmap, cpuid); KASSERT(pmap->pm_pcids[cpuid].pm_pcid >= 0 && pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX, ("pmap %p cpu %d pcid %#x", pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN || pmap == kernel_pmap, ("non-kernel pmap thread %p pmap %p cpu %d pcid %#x", td, pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); /* * If the INVPCID instruction is not available, * invltlb_pcid_handler() is used to handle the * invalidate_all IPI, which checks for curpmap == * smp_tlb_pmap. The operation sequence below has a * window where %CR3 is loaded with the new pmap's * PML4 address, but the curpmap value is not yet updated. * This causes the invltlb IPI handler, called between the * updates, to execute as a NOP, which leaves stale TLB * entries. * * Note that the most typical use of * pmap_activate_sw(), from the context switch, is * immune to this race, because interrupts are * disabled (while the thread lock is owned), and the IPI * happens after curpmap is updated. Protect other * callers in a similar way, by disabling interrupts * around the %cr3 register reload and curpmap * assignment. */ if (!invpcid_works) rflags = intr_disable(); if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) { load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid | cached); if (cached) PCPU_INC(pm_save_cnt); } PCPU_SET(curpmap, pmap); if (!invpcid_works) intr_restore(rflags); } else if (cr3 != pmap->pm_cr3) { load_cr3(pmap->pm_cr3); PCPU_SET(curpmap, pmap); } #ifdef SMP CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); #else CPU_CLR(cpuid, &oldpmap->pm_active); #endif } void pmap_activate(struct thread *td) { critical_enter(); pmap_activate_sw(td); critical_exit(); } void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings.
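* For example, if the object's pages would start at offset 0x1000 within a * 2MB superpage, the candidate address is adjusted so that (*addr & PDRMASK) * equals that offset, which allows the mapping to be promoted later.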
*/ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < NBPDR) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & PDRMASK; if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || (*addr & PDRMASK) == superpage_offset) return; if ((*addr & PDRMASK) < superpage_offset) *addr = (*addr & ~PDRMASK) + superpage_offset; else *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; } #ifdef INVARIANTS static unsigned long num_dirty_emulations; SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW, &num_dirty_emulations, 0, NULL); static unsigned long num_accessed_emulations; SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW, &num_accessed_emulations, 0, NULL); static unsigned long num_superpage_accessed_emulations; SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW, &num_superpage_accessed_emulations, 0, NULL); static unsigned long ad_emulation_superpage_promotions; SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW, &ad_emulation_superpage_promotions, 0, NULL); #endif /* INVARIANTS */ int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) { int rv; struct rwlock *lock; vm_page_t m, mpte; pd_entry_t *pde; pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V; KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE, ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype)); if (!pmap_emulate_ad_bits(pmap)) return (-1); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); rv = -1; lock = NULL; PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); if (pde == NULL || (*pde & PG_V) == 0) goto done; if ((*pde & PG_PS) != 0) { if (ftype == VM_PROT_READ) { #ifdef INVARIANTS atomic_add_long(&num_superpage_accessed_emulations, 1); #endif *pde |= PG_A; rv = 0; } goto done; } pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_V) == 0) goto done; if (ftype == VM_PROT_WRITE) { if ((*pte & PG_RW) == 0) goto done; /* * Set the modified and accessed bits simultaneously. * * Intel EPT PTEs that do software emulation of A/D bits map * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively. * An EPT misconfiguration is triggered if the PTE is writable * but not readable (WR=10). This is avoided by setting PG_A * and PG_M simultaneously. 
*/ *pte |= PG_M | PG_A; } else { *pte |= PG_A; } /* try to promote the mapping */ if (va < VM_MAXUSER_ADDRESS) mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); else mpte = NULL; m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); if ((mpte == NULL || mpte->wire_count == NPTEPG) && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) { pmap_promote_pde(pmap, pde, va, &lock); #ifdef INVARIANTS atomic_add_long(&ad_emulation_superpage_promotions, 1); #endif } #ifdef INVARIANTS if (ftype == VM_PROT_WRITE) atomic_add_long(&num_dirty_emulations, 1); else atomic_add_long(&num_accessed_emulations, 1); #endif rv = 0; /* success */ done: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (rv); } void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num) { pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pde; pt_entry_t *pte, PG_V; int idx; idx = 0; PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); pml4 = pmap_pml4e(pmap, va); ptr[idx++] = *pml4; if ((*pml4 & PG_V) == 0) goto done; pdp = pmap_pml4e_to_pdpe(pml4, va); ptr[idx++] = *pdp; if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) goto done; pde = pmap_pdpe_to_pde(pdp, va); ptr[idx++] = *pde; if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) goto done; pte = pmap_pde_to_pte(pde, va); ptr[idx++] = *pte; done: PMAP_UNLOCK(pmap); *num = idx; } /** * Get the kernel virtual address of a set of physical pages. If there are * physical addresses not covered by the DMAP perform a transient mapping * that will be removed when calling pmap_unmap_io_transient. * * \param page The pages the caller wishes to obtain the virtual * address on the kernel memory map. * \param vaddr On return contains the kernel virtual memory address * of the pages passed in the page parameter. * \param count Number of pages passed in. * \param can_fault TRUE if the thread using the mapped pages can take * page faults, FALSE otherwise. * * \returns TRUE if the caller must call pmap_unmap_io_transient when * finished or FALSE otherwise. * */ boolean_t pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; boolean_t needs_mapping; pt_entry_t *pte; int cache_bits, error, i; /* * Allocate any KVA space that we need, this is done in a separate * loop to prevent calling vmem_alloc while pinned. */ needs_mapping = FALSE; for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (__predict_false(paddr >= dmaplimit)) { error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr[i]); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); needs_mapping = TRUE; } else { vaddr[i] = PHYS_TO_DMAP(paddr); } } /* Exit early if everything is covered by the DMAP */ if (!needs_mapping) return (FALSE); /* * NB: The sequence of updating a page table followed by accesses * to the corresponding pages used in the !DMAP case is subject to * the situation described in the "AMD64 Architecture Programmer's * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special * Coherency Considerations". Therefore, issuing the INVLPG right * after modifying the PTE bits is crucial. */ if (!can_fault) sched_pin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (paddr >= dmaplimit) { if (can_fault) { /* * Slow path, since we can get page faults * while mappings are active don't pin the * thread to the CPU and instead add a global * mapping visible to all CPUs. 
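* The non-faulting path instead keeps the thread pinned and installs the PTE * directly, flushing only the local TLB with invlpg.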
*/ pmap_qenter(vaddr[i], &page[i], 1); } else { pte = vtopte(vaddr[i]); cache_bits = pmap_cache_bits(kernel_pmap, page[i]->md.pat_mode, 0); pte_store(pte, paddr | X86_PG_RW | X86_PG_V | cache_bits); invlpg(vaddr[i]); } } } return (needs_mapping); } void pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; int i; if (!can_fault) sched_unpin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (paddr >= dmaplimit) { if (can_fault) pmap_qremove(vaddr[i], 1); vmem_free(kernel_arena, vaddr[i], PAGE_SIZE); } } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { vm_paddr_t paddr; paddr = VM_PAGE_TO_PHYS(m); if (paddr < dmaplimit) return (PHYS_TO_DMAP(paddr)); mtx_lock_spin(&qframe_mtx); KASSERT(*vtopte(qframe) == 0, ("qframe busy")); pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0)); return (qframe); } void pmap_quick_remove_page(vm_offset_t addr) { if (addr != qframe) return; pte_store(vtopte(qframe), 0); invlpg(qframe); mtx_unlock_spin(&qframe_mtx); } #include "opt_ddb.h" #ifdef DDB #include #include DB_SHOW_COMMAND(pte, pmap_print_pte) { pmap_t pmap; pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pde; pt_entry_t *pte, PG_V; vm_offset_t va; if (!have_addr) { db_printf("show pte addr\n"); return; } va = (vm_offset_t)addr; if (kdb_thread != NULL) pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace); else pmap = PCPU_GET(curpmap); PG_V = pmap_valid_bit(pmap); pml4 = pmap_pml4e(pmap, va); db_printf("VA %#016lx pml4e %#016lx", va, *pml4); if ((*pml4 & PG_V) == 0) { db_printf("\n"); return; } pdp = pmap_pml4e_to_pdpe(pml4, va); db_printf(" pdpe %#016lx", *pdp); if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) { db_printf("\n"); return; } pde = pmap_pdpe_to_pde(pdp, va); db_printf(" pde %#016lx", *pde); if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) { db_printf("\n"); return; } pte = pmap_pde_to_pte(pde, va); db_printf(" pte %#016lx\n", *pte); } DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap) { vm_paddr_t a; if (have_addr) { a = (vm_paddr_t)addr; db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a)); } else { db_printf("show phys2dmap addr\n"); } } #endif Index: head/sys/boot/common/md.c =================================================================== --- head/sys/boot/common/md.c (revision 314067) +++ head/sys/boot/common/md.c (revision 314068) @@ -1,156 +1,156 @@ /*- * Copyright (c) 2009 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include "bootstrap.h" #define MD_BLOCK_SIZE 512 #ifndef MD_IMAGE_SIZE #error Must be compiled with MD_IMAGE_SIZE defined #endif #if (MD_IMAGE_SIZE == 0 || MD_IMAGE_SIZE % MD_BLOCK_SIZE) #error Image size must be a multiple of 512. #endif /* * Preloaded image gets put here. * Applications that patch the object with the image can determine * the size looking at the start and end markers (strings), * so we want them contiguous. */ static struct { u_char start[MD_IMAGE_SIZE]; u_char end[128]; } md_image = { .start = "MFS Filesystem goes here", .end = "MFS Filesystem had better STOP here", }; /* devsw I/F */ static int md_init(void); static int md_strategy(void *, int, daddr_t, size_t, char *, size_t *); static int md_open(struct open_file *, ...); static int md_close(struct open_file *); static int md_print(int); struct devsw md_dev = { "md", DEVT_DISK, md_init, md_strategy, md_open, md_close, noioctl, md_print }; static int md_init(void) { return (0); } static int md_strategy(void *devdata, int rw, daddr_t blk, size_t size, char *buf, size_t *rsize) { struct devdesc *dev = (struct devdesc *)devdata; size_t ofs; if (dev->d_unit != 0) return (ENXIO); if (blk < 0 || blk >= (MD_IMAGE_SIZE / MD_BLOCK_SIZE)) return (EIO); if (size % MD_BLOCK_SIZE) return (EIO); ofs = blk * MD_BLOCK_SIZE; if ((ofs + size) > MD_IMAGE_SIZE) size = MD_IMAGE_SIZE - ofs; - if (rsize != 0) + if (rsize != NULL) *rsize = size; switch (rw) { case F_READ: bcopy(md_image.start + ofs, buf, size); return (0); case F_WRITE: bcopy(buf, md_image.start + ofs, size); return (0); } return (ENODEV); } static int md_open(struct open_file *f, ...) { va_list ap; struct devdesc *dev; va_start(ap, f); dev = va_arg(ap, struct devdesc *); va_end(ap); if (dev->d_unit != 0) return (ENXIO); return (0); } static int md_close(struct open_file *f) { struct devdesc *dev; dev = (struct devdesc *)(f->f_devdata); return ((dev->d_unit != 0) ? ENXIO : 0); } static int md_print(int verbose) { printf("%s devices:", md_dev.dv_name); if (pager_output("\n") != 0) return (1); printf("MD (%u bytes)", MD_IMAGE_SIZE); return (pager_output("\n")); } Index: head/sys/boot/efi/libefi/efinet.c =================================================================== --- head/sys/boot/efi/libefi/efinet.c (revision 314067) +++ head/sys/boot/efi/libefi/efinet.c (revision 314068) @@ -1,384 +1,384 @@ /*- * Copyright (c) 2001 Doug Rabson * Copyright (c) 2002, 2006 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include static EFI_GUID sn_guid = EFI_SIMPLE_NETWORK_PROTOCOL; static void efinet_end(struct netif *); static int efinet_get(struct iodesc *, void *, size_t, time_t); static void efinet_init(struct iodesc *, void *); static int efinet_match(struct netif *, void *); static int efinet_probe(struct netif *, void *); static int efinet_put(struct iodesc *, void *, size_t); struct netif_driver efinetif = { .netif_bname = "efinet", .netif_match = efinet_match, .netif_probe = efinet_probe, .netif_init = efinet_init, .netif_get = efinet_get, .netif_put = efinet_put, .netif_end = efinet_end, .netif_ifs = NULL, .netif_nifs = 0 }; #ifdef EFINET_DEBUG static void dump_mode(EFI_SIMPLE_NETWORK_MODE *mode) { int i; printf("State = %x\n", mode->State); printf("HwAddressSize = %u\n", mode->HwAddressSize); printf("MediaHeaderSize = %u\n", mode->MediaHeaderSize); printf("MaxPacketSize = %u\n", mode->MaxPacketSize); printf("NvRamSize = %u\n", mode->NvRamSize); printf("NvRamAccessSize = %u\n", mode->NvRamAccessSize); printf("ReceiveFilterMask = %x\n", mode->ReceiveFilterMask); printf("ReceiveFilterSetting = %u\n", mode->ReceiveFilterSetting); printf("MaxMCastFilterCount = %u\n", mode->MaxMCastFilterCount); printf("MCastFilterCount = %u\n", mode->MCastFilterCount); printf("MCastFilter = {"); for (i = 0; i < mode->MCastFilterCount; i++) printf(" %s", ether_sprintf(mode->MCastFilter[i].Addr)); printf(" }\n"); printf("CurrentAddress = %s\n", ether_sprintf(mode->CurrentAddress.Addr)); printf("BroadcastAddress = %s\n", ether_sprintf(mode->BroadcastAddress.Addr)); printf("PermanentAddress = %s\n", ether_sprintf(mode->PermanentAddress.Addr)); printf("IfType = %u\n", mode->IfType); printf("MacAddressChangeable = %d\n", mode->MacAddressChangeable); printf("MultipleTxSupported = %d\n", mode->MultipleTxSupported); printf("MediaPresentSupported = %d\n", mode->MediaPresentSupported); printf("MediaPresent = %d\n", mode->MediaPresent); } #endif static int efinet_match(struct netif *nif, void *machdep_hint) { struct devdesc *dev = machdep_hint; if (dev->d_unit == nif->nif_unit) return (1); return(0); } static int efinet_probe(struct netif *nif, void *machdep_hint) { return (0); } static int efinet_put(struct iodesc *desc, void *pkt, size_t len) { struct netif *nif = desc->io_netif; EFI_SIMPLE_NETWORK *net; EFI_STATUS status; void *buf; net = nif->nif_devdata; if (net == NULL) return (-1); status = net->Transmit(net, 0, len, pkt, 0, 0, 0); if (status != EFI_SUCCESS) return (-1); /* Wait for the buffer to be transmitted */ do { - buf = 0; /* XXX Is this needed? */ + buf = NULL; /* XXX Is this needed? 
*/ status = net->GetStatus(net, 0, &buf); /* * XXX EFI1.1 and the E1000 card returns a different * address than we gave. Sigh. */ - } while (status == EFI_SUCCESS && buf == 0); + } while (status == EFI_SUCCESS && buf == NULL); /* XXX How do we deal with status != EFI_SUCCESS now? */ return ((status == EFI_SUCCESS) ? len : -1); } static int efinet_get(struct iodesc *desc, void *pkt, size_t len, time_t timeout) { struct netif *nif = desc->io_netif; EFI_SIMPLE_NETWORK *net; EFI_STATUS status; UINTN bufsz; time_t t; char buf[2048]; net = nif->nif_devdata; if (net == NULL) return (0); t = time(0); while ((time(0) - t) < timeout) { bufsz = sizeof(buf); status = net->Receive(net, 0, &bufsz, buf, 0, 0, 0); if (status == EFI_SUCCESS) { /* * XXX EFI1.1 and the E1000 card trash our * workspace if we do not do this silly copy. * Either they are not respecting the len * value or do not like the alignment. */ if (bufsz > len) bufsz = len; bcopy(buf, pkt, bufsz); return (bufsz); } if (status != EFI_NOT_READY) return (0); } return (0); } static void efinet_init(struct iodesc *desc, void *machdep_hint) { struct netif *nif = desc->io_netif; EFI_SIMPLE_NETWORK *net; EFI_HANDLE h; EFI_STATUS status; if (nif->nif_driver->netif_ifs[nif->nif_unit].dif_unit < 0) { printf("Invalid network interface %d\n", nif->nif_unit); return; } h = nif->nif_driver->netif_ifs[nif->nif_unit].dif_private; status = BS->HandleProtocol(h, &sn_guid, (VOID **)&nif->nif_devdata); if (status != EFI_SUCCESS) { printf("net%d: cannot fetch interface data (status=%lu)\n", nif->nif_unit, EFI_ERROR_CODE(status)); return; } net = nif->nif_devdata; if (net->Mode->State == EfiSimpleNetworkStopped) { status = net->Start(net); if (status != EFI_SUCCESS) { printf("net%d: cannot start interface (status=%ld)\n", nif->nif_unit, (long)status); return; } } if (net->Mode->State != EfiSimpleNetworkInitialized) { status = net->Initialize(net, 0, 0); if (status != EFI_SUCCESS) { printf("net%d: cannot init. interface (status=%ld)\n", nif->nif_unit, (long)status); return; } } if (net->Mode->ReceiveFilterSetting == 0) { UINT32 mask = EFI_SIMPLE_NETWORK_RECEIVE_UNICAST | EFI_SIMPLE_NETWORK_RECEIVE_BROADCAST; status = net->ReceiveFilters(net, mask, 0, FALSE, 0, 0); if (status != EFI_SUCCESS) { printf("net%d: cannot set rx. 
filters (status=%ld)\n", nif->nif_unit, (long)status); return; } } #ifdef EFINET_DEBUG dump_mode(net->Mode); #endif bcopy(net->Mode->CurrentAddress.Addr, desc->myea, 6); desc->xid = 1; } static void efinet_end(struct netif *nif) { EFI_SIMPLE_NETWORK *net = nif->nif_devdata; if (net == NULL) return; net->Shutdown(net); } static int efinet_dev_init(void); static int efinet_dev_print(int); struct devsw efinet_dev = { .dv_name = "net", .dv_type = DEVT_NET, .dv_init = efinet_dev_init, .dv_strategy = net_strategy, .dv_open = net_open, .dv_close = net_close, .dv_ioctl = noioctl, .dv_print = efinet_dev_print, .dv_cleanup = NULL }; static int efinet_dev_init() { struct netif_dif *dif; struct netif_stats *stats; EFI_DEVICE_PATH *devpath, *node; EFI_SIMPLE_NETWORK *net; EFI_HANDLE *handles, *handles2; EFI_STATUS status; UINTN sz; int err, i, nifs; sz = 0; handles = NULL; status = BS->LocateHandle(ByProtocol, &sn_guid, 0, &sz, 0); if (status == EFI_BUFFER_TOO_SMALL) { handles = (EFI_HANDLE *)malloc(sz); status = BS->LocateHandle(ByProtocol, &sn_guid, 0, &sz, handles); if (EFI_ERROR(status)) free(handles); } if (EFI_ERROR(status)) return (efi_status_to_errno(status)); handles2 = (EFI_HANDLE *)malloc(sz); if (handles2 == NULL) { free(handles); return (ENOMEM); } nifs = 0; for (i = 0; i < sz / sizeof(EFI_HANDLE); i++) { devpath = efi_lookup_devpath(handles[i]); if (devpath == NULL) continue; if ((node = efi_devpath_last_node(devpath)) == NULL) continue; if (DevicePathType(node) != MESSAGING_DEVICE_PATH || DevicePathSubType(node) != MSG_MAC_ADDR_DP) continue; /* * Open the network device in exclusive mode. Without this * we will be racing with the UEFI network stack. It will * pull packets off the network leading to lost packets. */ status = BS->OpenProtocol(handles[i], &sn_guid, (void **)&net, IH, 0, EFI_OPEN_PROTOCOL_EXCLUSIVE); if (status != EFI_SUCCESS) { printf("Unable to open network interface %d for " "exclusive access: %d\n", i, EFI_ERROR(status)); } handles2[nifs] = handles[i]; nifs++; } free(handles); if (nifs == 0) { err = ENOENT; goto done; } err = efi_register_handles(&efinet_dev, handles2, NULL, nifs); if (err != 0) goto done; efinetif.netif_ifs = calloc(nifs, sizeof(struct netif_dif)); stats = calloc(nifs, sizeof(struct netif_stats)); if (efinetif.netif_ifs == NULL || stats == NULL) { free(efinetif.netif_ifs); free(stats); efinetif.netif_ifs = NULL; err = ENOMEM; goto done; } efinetif.netif_nifs = nifs; for (i = 0; i < nifs; i++) { dif = &efinetif.netif_ifs[i]; dif->dif_unit = i; dif->dif_nsel = 1; dif->dif_stats = &stats[i]; dif->dif_private = handles2[i]; } done: free(handles2); return (err); } static int efinet_dev_print(int verbose) { CHAR16 *text; EFI_HANDLE h; int unit, ret = 0; printf("%s devices:", efinet_dev.dv_name); if ((ret = pager_output("\n")) != 0) return (ret); for (unit = 0, h = efi_find_handle(&efinet_dev, 0); h != NULL; h = efi_find_handle(&efinet_dev, ++unit)) { printf(" %s%d:", efinet_dev.dv_name, unit); if (verbose) { text = efi_devpath_name(efi_lookup_devpath(h)); if (text != NULL) { printf(" %S", text); efi_free_devpath_name(text); } } if ((ret = pager_output("\n")) != 0) break; } return (ret); } Index: head/sys/boot/fdt/fdt_overlay.c =================================================================== --- head/sys/boot/fdt/fdt_overlay.c (revision 314067) +++ head/sys/boot/fdt/fdt_overlay.c (revision 314068) @@ -1,438 +1,438 @@ /*- * Copyright (c) 2015 Oleksandr Tymoshenko * All rights reserved. 
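The efinet_put() change above sits in the same pattern: the driver queues a frame with Transmit() and then polls GetStatus() until the firmware hands the transmit buffer back, and that recycled-buffer pointer is what is now compared against NULL. A condensed sketch of the pattern, assuming only the EFI_SIMPLE_NETWORK calls exactly as they appear above:

/*
 * Sketch (illustrative only) of the transmit-then-poll idiom from
 * efinet_put(): completion is reported by the firmware returning the
 * transmit buffer through a pointer, hence the NULL comparison.
 */
static int
snp_send_one(EFI_SIMPLE_NETWORK *net, void *pkt, size_t len)
{
	EFI_STATUS status;
	void *txbuf;

	status = net->Transmit(net, 0, len, pkt, NULL, NULL, NULL);
	if (status != EFI_SUCCESS)
		return (-1);

	txbuf = NULL;
	do {
		status = net->GetStatus(net, NULL, &txbuf);
	} while (status == EFI_SUCCESS && txbuf == NULL);

	return (status == EFI_SUCCESS ? (int)len : -1);
}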
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include "fdt_overlay.h" /* * Get max phandle */ static uint32_t fdt_max_phandle(void *fdtp) { int o, depth; uint32_t max_phandle, phandle; depth = 1; o = fdt_path_offset(fdtp, "/"); max_phandle = fdt_get_phandle(fdtp, o); for (depth = 0; (o >= 0) && (depth >= 0); o = fdt_next_node(fdtp, o, &depth)) { phandle = fdt_get_phandle(fdtp, o); if (max_phandle < phandle) max_phandle = phandle; } return max_phandle; } /* * Returns exact memory location specified by fixup in format * /path/to/node:property:offset */ static void * fdt_get_fixup_location(void *fdtp, const char *fixup) { char *path, *prop, *offsetp, *endp; int prop_offset, o, proplen; void *result; - result = 0; + result = NULL; path = strdup(fixup); prop = strchr(path, ':'); if (prop == NULL) { printf("missing property part in \"%s\"\n", fixup); result = NULL; goto out; } *prop = 0; prop++; offsetp = strchr(prop, ':'); if (offsetp == NULL) { printf("missing offset part in \"%s\"\n", fixup); result = NULL; goto out; } *offsetp = 0; offsetp++; prop_offset = strtoul(offsetp, &endp, 10); if (*endp != '\0') { printf("\"%s\" is not valid number\n", offsetp); result = NULL; goto out; } o = fdt_path_offset(fdtp, path); if (o < 0) { printf("path \"%s\" not found\n", path); result = NULL; goto out; } result = fdt_getprop_w(fdtp, o, prop, &proplen); if (result == NULL){ printf("property \"%s\" not found in \"%s\" node\n", prop, path); result = NULL; goto out; } if (proplen < prop_offset + sizeof(uint32_t)) { printf("%s: property length is too small for fixup\n", fixup); result = NULL; goto out; } result = (char*)result + prop_offset; out: free(path); return (result); } /* * Process one entry in __fixups__ { } node * @fixups is property value, array of NUL-terminated strings * with fixup locations * @fixups_len length of the fixups array in bytes * @phandle is value for these locations */ static int fdt_do_one_fixup(void *fdtp, const char *fixups, int fixups_len, int phandle) { void *fixup_pos; uint32_t val; val = cpu_to_fdt32(phandle); while (fixups_len > 0) { fixup_pos = fdt_get_fixup_location(fdtp, fixups); if (fixup_pos != NULL) memcpy(fixup_pos, &val, sizeof(val)); fixups_len -= strlen(fixups) + 1; fixups += strlen(fixups) + 1; } return (0); } /* * Increase 
u32 value at pos by offset */ static void fdt_increase_u32(void *pos, uint32_t offset) { uint32_t val; memcpy(&val, pos, sizeof(val)); val = cpu_to_fdt32(fdt32_to_cpu(val) + offset); memcpy(pos, &val, sizeof(val)); } /* * Process local fixups * @fixups is property value, array of NUL-terminated strings * with fixup locations * @fixups_len length of the fixups array in bytes * @offset value these locations should be increased by */ static int fdt_do_local_fixup(void *fdtp, const char *fixups, int fixups_len, int offset) { void *fixup_pos; while (fixups_len > 0) { fixup_pos = fdt_get_fixup_location(fdtp, fixups); if (fixup_pos != NULL) fdt_increase_u32(fixup_pos, offset); fixups_len -= strlen(fixups) + 1; fixups += strlen(fixups) + 1; } return (0); } /* * Increase node phandle by phandle_offset */ static void fdt_increase_phandle(void *fdtp, int node_offset, uint32_t phandle_offset) { int proplen; void *phandle_pos, *node_pos; node_pos = (char*)fdtp + node_offset; phandle_pos = fdt_getprop_w(fdtp, node_offset, "phandle", &proplen); if (phandle_pos) fdt_increase_u32(phandle_pos, phandle_offset); phandle_pos = fdt_getprop_w(fdtp, node_offset, "linux,phandle", &proplen); if (phandle_pos) fdt_increase_u32(phandle_pos, phandle_offset); } /* * Increase all phandles by offset */ static void fdt_increase_phandles(void *fdtp, uint32_t offset) { int o, depth; o = fdt_path_offset(fdtp, "/"); for (depth = 0; (o >= 0) && (depth >= 0); o = fdt_next_node(fdtp, o, &depth)) { fdt_increase_phandle(fdtp, o, offset); } } /* * Overlay one node defined by over */ static void fdt_overlay_node(void *main_fdtp, int target_o, void *overlay_fdtp, int overlay_o) { int len, o, depth; const char *name; const void *val; int target_subnode_o; /* Overlay properties */ for (o = fdt_first_property_offset(overlay_fdtp, overlay_o); o >= 0; o = fdt_next_property_offset(overlay_fdtp, o)) { val = fdt_getprop_by_offset(overlay_fdtp, o, &name, &len); if (val) fdt_setprop(main_fdtp, target_o, name, val, len); } /* Now overlay nodes */ o = overlay_o; for (depth = 0; (o >= 0) && (depth >= 0); o = fdt_next_node(overlay_fdtp, o, &depth)) { if (depth != 1) continue; /* Check if there is node with the same name */ name = fdt_get_name(overlay_fdtp, o, NULL); target_subnode_o = fdt_subnode_offset(main_fdtp, target_o, name); if (target_subnode_o < 0) { /* create new subnode and run merge recursively */ target_subnode_o = fdt_add_subnode(main_fdtp, target_o, name); if (target_subnode_o < 0) { printf("failed to create subnode \"%s\": %d\n", name, target_subnode_o); return; } } fdt_overlay_node(main_fdtp, target_subnode_o, overlay_fdtp, o); } } /* * Apply one overlay fragment */ static void fdt_apply_fragment(void *main_fdtp, void *overlay_fdtp, int fragment_o) { uint32_t target; const char *target_path; const void *val; int target_node_o, overlay_node_o; target_node_o = -1; val = fdt_getprop(overlay_fdtp, fragment_o, "target", NULL); if (val) { memcpy(&target, val, sizeof(target)); target = fdt32_to_cpu(target); target_node_o = fdt_node_offset_by_phandle(main_fdtp, target); if (target_node_o < 0) { printf("failed to find target %04x\n", target); return; } } if (target_node_o < 0) { target_path = fdt_getprop(overlay_fdtp, fragment_o, "target-path", NULL); if (target_path == NULL) return; target_node_o = fdt_path_offset(main_fdtp, target_path); if (target_node_o < 0) { printf("failed to find target-path %s\n", target_path); return; } } if (target_node_o < 0) return; overlay_node_o = fdt_subnode_offset(overlay_fdtp, fragment_o, "__overlay__"); if 
(overlay_node_o < 0) { printf("missing __overlay__ sub-node\n"); return; } fdt_overlay_node(main_fdtp, target_node_o, overlay_fdtp, overlay_node_o); } /* * Handle __fixups__ node in overlay DTB */ static int fdt_overlay_do_fixups(void *main_fdtp, void *overlay_fdtp) { int main_symbols_o, symbol_o, overlay_fixups_o; int fixup_prop_o; int len; const char *fixups, *name; const char *symbol_path; uint32_t phandle; main_symbols_o = fdt_path_offset(main_fdtp, "/__symbols__"); overlay_fixups_o = fdt_path_offset(overlay_fdtp, "/__fixups__"); if (main_symbols_o < 0) return (-1); if (overlay_fixups_o < 0) return (-1); for (fixup_prop_o = fdt_first_property_offset(overlay_fdtp, overlay_fixups_o); fixup_prop_o >= 0; fixup_prop_o = fdt_next_property_offset(overlay_fdtp, fixup_prop_o)) { fixups = fdt_getprop_by_offset(overlay_fdtp, fixup_prop_o, &name, &len); symbol_path = fdt_getprop(main_fdtp, main_symbols_o, name, NULL); if (symbol_path == NULL) { printf("couldn't find \"%s\" symbol in main dtb\n", name); return (-1); } symbol_o = fdt_path_offset(main_fdtp, symbol_path); if (symbol_o < 0) { printf("couldn't find \"%s\" path in main dtb\n", symbol_path); return (-1); } phandle = fdt_get_phandle(main_fdtp, symbol_o); if (fdt_do_one_fixup(overlay_fdtp, fixups, len, phandle) < 0) return (-1); } return (0); } /* * Handle __local_fixups__ node in overlay DTB */ static int fdt_overlay_do_local_fixups(void *main_fdtp, void *overlay_fdtp) { int overlay_local_fixups_o; int len; const char *fixups; uint32_t phandle_offset; overlay_local_fixups_o = fdt_path_offset(overlay_fdtp, "/__local_fixups__"); if (overlay_local_fixups_o < 0) return (-1); phandle_offset = fdt_max_phandle(main_fdtp); fdt_increase_phandles(overlay_fdtp, phandle_offset); fixups = fdt_getprop_w(overlay_fdtp, overlay_local_fixups_o, "fixup", &len); if (fixups) { if (fdt_do_local_fixup(overlay_fdtp, fixups, len, phandle_offset) < 0) return (-1); } return (0); } /* * Apply all fragments to main DTB */ static int fdt_overlay_apply_fragments(void *main_fdtp, void *overlay_fdtp) { int o, depth; o = fdt_path_offset(overlay_fdtp, "/"); for (depth = 0; (o >= 0) && (depth >= 0); o = fdt_next_node(overlay_fdtp, o, &depth)) { if (depth != 1) continue; fdt_apply_fragment(main_fdtp, overlay_fdtp, o); } return (0); } int fdt_overlay_apply(void *main_fdtp, void *overlay_fdtp, size_t overlay_length) { void *overlay_copy; int rv; rv = 0; /* We modify overlay in-place, so we need writable copy */ overlay_copy = malloc(overlay_length); if (overlay_copy == NULL) { printf("failed to allocate memory for overlay copy\n"); return (-1); } memcpy(overlay_copy, overlay_fdtp, overlay_length); if (fdt_overlay_do_fixups(main_fdtp, overlay_copy) < 0) { printf("failed to perform fixups in overlay\n"); rv = -1; goto out; } if (fdt_overlay_do_local_fixups(main_fdtp, overlay_copy) < 0) { printf("failed to perform local fixups in overlay\n"); rv = -1; goto out; } if (fdt_overlay_apply_fragments(main_fdtp, overlay_copy) < 0) { printf("failed to apply fragments\n"); rv = -1; } out: free(overlay_copy); return (rv); } Index: head/sys/boot/ficl/ficl.c =================================================================== --- head/sys/boot/ficl/ficl.c (revision 314067) +++ head/sys/boot/ficl/ficl.c (revision 314068) @@ -1,696 +1,696 @@ /******************************************************************* ** f i c l . 
c ** Forth Inspired Command Language - external interface ** Author: John Sadler (john_sadler@alum.mit.edu) ** Created: 19 July 1997 ** $Id: ficl.c,v 1.16 2001/12/05 07:21:34 jsadler Exp $ *******************************************************************/ /* ** This is an ANS Forth interpreter written in C. ** Ficl uses Forth syntax for its commands, but turns the Forth ** model on its head in other respects. ** Ficl provides facilities for interoperating ** with programs written in C: C functions can be exported to Ficl, ** and Ficl commands can be executed via a C calling interface. The ** interpreter is re-entrant, so it can be used in multiple instances ** in a multitasking system. Unlike Forth, Ficl's outer interpreter ** expects a text block as input, and returns to the caller after each ** text block, so the data pump is somewhere in external code in the ** style of TCL. ** ** Code is written in ANSI C for portability. */ /* ** Copyright (c) 1997-2001 John Sadler (john_sadler@alum.mit.edu) ** All rights reserved. ** ** Get the latest Ficl release at http://ficl.sourceforge.net ** ** I am interested in hearing from anyone who uses ficl. If you have ** a problem, a success story, a defect, an enhancement request, or ** if you would like to contribute to the ficl release, please ** contact me by email at the address above. ** ** L I C E N S E and D I S C L A I M E R ** ** Redistribution and use in source and binary forms, with or without ** modification, are permitted provided that the following conditions ** are met: ** 1. Redistributions of source code must retain the above copyright ** notice, this list of conditions and the following disclaimer. ** 2. Redistributions in binary form must reproduce the above copyright ** notice, this list of conditions and the following disclaimer in the ** documentation and/or other materials provided with the distribution. ** ** THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND ** ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ** IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ** ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE ** FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ** DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS ** OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ** HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ** LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY ** OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ** SUCH DAMAGE. */ /* $FreeBSD$ */ #ifdef TESTMAIN #include #else #include #endif #include #include "ficl.h" /* ** System statics ** Each FICL_SYSTEM builds a global dictionary during its start ** sequence. This is shared by all virtual machines of that system. ** Therefore only one VM can update the dictionary ** at a time. The system imports a locking function that ** you can override in order to control update access to ** the dictionary. The function is stubbed out by default, ** but you can insert one: #define FICL_MULTITHREAD 1 ** and supply your own version of ficlLockDictionary. */ static int defaultStack = FICL_DEFAULT_STACK; static void ficlSetVersionEnv(FICL_SYSTEM *pSys); /************************************************************************** f i c l I n i t S y s t e m ** Binds a global dictionary to the interpreter system. 
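The comments above describe Ficl's C calling interface in prose only; a minimal embedding sketch, illustrative only and using nothing beyond the entry points defined later in this file (ficlInitSystem, ficlNewVM, ficlExec, ficlTermSystem), looks like this:

/*
 * Minimal embedding sketch (not part of the source).  Error handling
 * is abbreviated; VM_OUTOFTEXT is the normal return from ficlExec().
 */
static int
run_forth_block(char *text)
{
	FICL_SYSTEM *pSys;
	FICL_VM *pVM;
	int rc;

	pSys = ficlInitSystem(10000);	/* dictionary size, in cells */
	pVM = ficlNewVM(pSys);
	rc = ficlExec(pVM, text);
	ficlTermSystem(pSys);		/* deletes dictionaries and all VMs */
	return (rc);
}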
** You specify the address and size of the allocated area. ** After that, ficl manages it. ** First step is to set up the static pointers to the area. ** Then write the "precompiled" portion of the dictionary in. ** The dictionary needs to be at least large enough to hold the ** precompiled part. Try 1K cells minimum. Use "words" to find ** out how much of the dictionary is used at any time. **************************************************************************/ FICL_SYSTEM *ficlInitSystemEx(FICL_SYSTEM_INFO *fsi) { int nDictCells; int nEnvCells; FICL_SYSTEM *pSys = ficlMalloc(sizeof (FICL_SYSTEM)); assert(pSys); assert(fsi->size == sizeof (FICL_SYSTEM_INFO)); memset(pSys, 0, sizeof (FICL_SYSTEM)); nDictCells = fsi->nDictCells; if (nDictCells <= 0) nDictCells = FICL_DEFAULT_DICT; nEnvCells = fsi->nEnvCells; if (nEnvCells <= 0) nEnvCells = FICL_DEFAULT_DICT; pSys->dp = dictCreateHashed((unsigned)nDictCells, HASHSIZE); pSys->dp->pForthWords->name = "forth-wordlist"; pSys->envp = dictCreate((unsigned)nEnvCells); pSys->envp->pForthWords->name = "environment"; pSys->textOut = fsi->textOut; pSys->pExtend = fsi->pExtend; #if FICL_WANT_LOCALS /* ** The locals dictionary is only searched while compiling, ** but this is where speed is most important. On the other ** hand, the dictionary gets emptied after each use of locals ** The need to balance search speed with the cost of the 'empty' ** operation led me to select a single-threaded list... */ pSys->localp = dictCreate((unsigned)FICL_MAX_LOCALS * CELLS_PER_WORD); #endif /* ** Build the precompiled dictionary and load softwords. We need a temporary ** VM to do this - ficlNewVM links one to the head of the system VM list. ** ficlCompilePlatform (defined in win32.c, for example) adds platform specific words. */ ficlCompileCore(pSys); ficlCompilePrefix(pSys); #if FICL_WANT_FLOAT ficlCompileFloat(pSys); #endif #if FICL_PLATFORM_EXTEND ficlCompilePlatform(pSys); #endif ficlSetVersionEnv(pSys); /* ** Establish the parse order. Note that prefixes precede numbers - ** this allows constructs like "0b101010" which might parse as a ** hex value otherwise. */ ficlAddPrecompiledParseStep(pSys, "?prefix", ficlParsePrefix); ficlAddPrecompiledParseStep(pSys, "?number", ficlParseNumber); #if FICL_WANT_FLOAT ficlAddPrecompiledParseStep(pSys, ">float", ficlParseFloatNumber); #endif /* ** Now create a temporary VM to compile the softwords. Since all VMs are ** linked into the vmList of FICL_SYSTEM, we don't have to pass the VM ** to ficlCompileSoftCore -- it just hijacks whatever it finds in the VM list. ** ficl 2.05: vmCreate no longer depends on the presence of INTERPRET in the ** dictionary, so a VM can be created before the dictionary is built. It just ** can't do much... */ ficlNewVM(pSys); ficlCompileSoftCore(pSys); ficlFreeVM(pSys->vmList); return pSys; } FICL_SYSTEM *ficlInitSystem(int nDictCells) { FICL_SYSTEM_INFO fsi; ficlInitInfo(&fsi); fsi.nDictCells = nDictCells; return ficlInitSystemEx(&fsi); } /************************************************************************** f i c l A d d P a r s e S t e p ** Appends a parse step function to the end of the parse list (see ** FICL_PARSE_STEP notes in ficl.h for details). Returns 0 if successful, ** nonzero if there's no more room in the list. 
**************************************************************************/ int ficlAddParseStep(FICL_SYSTEM *pSys, FICL_WORD *pFW) { int i; for (i = 0; i < FICL_MAX_PARSE_STEPS; i++) { if (pSys->parseList[i] == NULL) { pSys->parseList[i] = pFW; return 0; } } return 1; } /* ** Compile a word into the dictionary that invokes the specified FICL_PARSE_STEP ** function. It is up to the user (as usual in Forth) to make sure the stack ** preconditions are valid (there needs to be a counted string on top of the stack) ** before using the resulting word. */ void ficlAddPrecompiledParseStep(FICL_SYSTEM *pSys, char *name, FICL_PARSE_STEP pStep) { FICL_DICT *dp = pSys->dp; FICL_WORD *pFW = dictAppendWord(dp, name, parseStepParen, FW_DEFAULT); dictAppendCell(dp, LVALUEtoCELL(pStep)); ficlAddParseStep(pSys, pFW); } /* ** This word lists the parse steps in order */ void ficlListParseSteps(FICL_VM *pVM) { int i; FICL_SYSTEM *pSys = pVM->pSys; assert(pSys); vmTextOut(pVM, "Parse steps:", 1); vmTextOut(pVM, "lookup", 1); for (i = 0; i < FICL_MAX_PARSE_STEPS; i++) { if (pSys->parseList[i] != NULL) { vmTextOut(pVM, pSys->parseList[i]->name, 1); } else break; } return; } /************************************************************************** f i c l N e w V M ** Create a new virtual machine and link it into the system list ** of VMs for later cleanup by ficlTermSystem. **************************************************************************/ FICL_VM *ficlNewVM(FICL_SYSTEM *pSys) { FICL_VM *pVM = vmCreate(NULL, defaultStack, defaultStack); pVM->link = pSys->vmList; pVM->pSys = pSys; pVM->pExtend = pSys->pExtend; vmSetTextOut(pVM, pSys->textOut); pSys->vmList = pVM; return pVM; } /************************************************************************** f i c l F r e e V M ** Removes the VM in question from the system VM list and deletes the ** memory allocated to it. This is an optional call, since ficlTermSystem ** will do this cleanup for you. This function is handy if you're going to ** do a lot of dynamic creation of VMs. **************************************************************************/ void ficlFreeVM(FICL_VM *pVM) { FICL_SYSTEM *pSys = pVM->pSys; FICL_VM *pList = pSys->vmList; - assert(pVM != 0); + assert(pVM != NULL); if (pSys->vmList == pVM) { pSys->vmList = pSys->vmList->link; } else for (; pList != NULL; pList = pList->link) { if (pList->link == pVM) { pList->link = pVM->link; break; } } if (pList) vmDelete(pVM); return; } /************************************************************************** f i c l B u i l d ** Builds a word into the dictionary. ** Preconditions: system must be initialized, and there must ** be enough space for the new word's header! Operation is ** controlled by ficlLockDictionary, so any initialization ** required by your version of the function (if you overrode ** it) must be complete at this point. ** Parameters: ** name -- duh, the name of the word ** code -- code to execute when the word is invoked - must take a single param ** pointer to a FICL_VM ** flags -- 0 or more of F_IMMEDIATE, F_COMPILE, use bitwise OR! 
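To make the ficlBuild() parameter list above concrete: the code pointer takes a single FICL_VM argument, and both vmTextOut() and FW_DEFAULT appear elsewhere in this file. A sketch of exporting a C function as a Forth word follows; it is illustrative only, and the word name and helper functions are made up:

/*
 * Sketch: register a C primitive with ficlBuild().  "hello" and
 * helloParen() are hypothetical names for illustration.
 */
static void
helloParen(FICL_VM *pVM)
{
	vmTextOut(pVM, "hello from C", 1);
}

static void
registerHello(FICL_SYSTEM *pSys)
{
	(void)ficlBuild(pSys, "hello", helloParen, FW_DEFAULT);
}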
** **************************************************************************/ int ficlBuild(FICL_SYSTEM *pSys, char *name, FICL_CODE code, char flags) { #if FICL_MULTITHREAD int err = ficlLockDictionary(TRUE); if (err) return err; #endif /* FICL_MULTITHREAD */ assert(dictCellsAvail(pSys->dp) > sizeof (FICL_WORD) / sizeof (CELL)); dictAppendWord(pSys->dp, name, code, flags); ficlLockDictionary(FALSE); return 0; } /************************************************************************** f i c l E v a l u a t e ** Wrapper for ficlExec() which sets SOURCE-ID to -1. **************************************************************************/ int ficlEvaluate(FICL_VM *pVM, char *pText) { int returnValue; CELL id = pVM->sourceID; pVM->sourceID.i = -1; returnValue = ficlExecC(pVM, pText, -1); pVM->sourceID = id; return returnValue; } /************************************************************************** f i c l E x e c ** Evaluates a block of input text in the context of the ** specified interpreter. Emits any requested output to the ** interpreter's output function. ** ** Contains the "inner interpreter" code in a tight loop ** ** Returns one of the VM_XXXX codes defined in ficl.h: ** VM_OUTOFTEXT is the normal exit condition ** VM_ERREXIT means that the interp encountered a syntax error ** and the vm has been reset to recover (some or all ** of the text block got ignored ** VM_USEREXIT means that the user executed the "bye" command ** to shut down the interpreter. This would be a good ** time to delete the vm, etc -- or you can ignore this ** signal. **************************************************************************/ int ficlExec(FICL_VM *pVM, char *pText) { return ficlExecC(pVM, pText, -1); } int ficlExecC(FICL_VM *pVM, char *pText, FICL_INT size) { FICL_SYSTEM *pSys = pVM->pSys; FICL_DICT *dp = pSys->dp; int except; jmp_buf vmState; jmp_buf *oldState; TIB saveTib; assert(pVM); assert(pSys->pInterp[0]); if (size < 0) size = strlen(pText); vmPushTib(pVM, pText, size, &saveTib); /* ** Save and restore VM's jmp_buf to enable nested calls to ficlExec */ oldState = pVM->pState; pVM->pState = &vmState; /* This has to come before the setjmp! */ except = setjmp(vmState); switch (except) { case 0: if (pVM->fRestart) { pVM->runningWord->code(pVM); pVM->fRestart = 0; } else { /* set VM up to interpret text */ vmPushIP(pVM, &(pSys->pInterp[0])); } vmInnerLoop(pVM); break; case VM_RESTART: pVM->fRestart = 1; except = VM_OUTOFTEXT; break; case VM_OUTOFTEXT: vmPopIP(pVM); #ifdef TESTMAIN if ((pVM->state != COMPILE) && (pVM->sourceID.i == 0)) ficlTextOut(pVM, FICL_PROMPT, 0); #endif break; case VM_USEREXIT: case VM_INNEREXIT: case VM_BREAK: break; case VM_QUIT: if (pVM->state == COMPILE) { dictAbortDefinition(dp); #if FICL_WANT_LOCALS dictEmpty(pSys->localp, pSys->localp->pForthWords->size); #endif } vmQuit(pVM); break; case VM_ERREXIT: case VM_ABORT: case VM_ABORTQ: default: /* user defined exit code?? */ if (pVM->state == COMPILE) { dictAbortDefinition(dp); #if FICL_WANT_LOCALS dictEmpty(pSys->localp, pSys->localp->pForthWords->size); #endif } dictResetSearchOrder(dp); vmReset(pVM); break; } pVM->pState = oldState; vmPopTib(pVM, &saveTib); return (except); } /************************************************************************** f i c l E x e c X T ** Given a pointer to a FICL_WORD, push an inner interpreter and ** execute the word to completion. 
This is in contrast with vmExecute, ** which does not guarantee that the word will have completed when ** the function returns (ie in the case of colon definitions, which ** need an inner interpreter to finish) ** ** Returns one of the VM_XXXX exception codes listed in ficl.h. Normal ** exit condition is VM_INNEREXIT, ficl's private signal to exit the ** inner loop under normal circumstances. If another code is thrown to ** exit the loop, this function will re-throw it if it's nested under ** itself or ficlExec. ** ** NOTE: this function is intended so that C code can execute ficlWords ** given their address in the dictionary (xt). **************************************************************************/ int ficlExecXT(FICL_VM *pVM, FICL_WORD *pWord) { int except; jmp_buf vmState; jmp_buf *oldState; FICL_WORD *oldRunningWord; assert(pVM); assert(pVM->pSys->pExitInner); /* ** Save the runningword so that RESTART behaves correctly ** over nested calls. */ oldRunningWord = pVM->runningWord; /* ** Save and restore VM's jmp_buf to enable nested calls */ oldState = pVM->pState; pVM->pState = &vmState; /* This has to come before the setjmp! */ except = setjmp(vmState); if (except) vmPopIP(pVM); else vmPushIP(pVM, &(pVM->pSys->pExitInner)); switch (except) { case 0: vmExecute(pVM, pWord); vmInnerLoop(pVM); break; case VM_INNEREXIT: case VM_BREAK: break; case VM_RESTART: case VM_OUTOFTEXT: case VM_USEREXIT: case VM_QUIT: case VM_ERREXIT: case VM_ABORT: case VM_ABORTQ: default: /* user defined exit code?? */ if (oldState) { pVM->pState = oldState; vmThrow(pVM, except); } break; } pVM->pState = oldState; pVM->runningWord = oldRunningWord; return (except); } /************************************************************************** f i c l L o o k u p ** Look in the system dictionary for a match to the given name. If ** found, return the address of the corresponding FICL_WORD. Otherwise ** return NULL. **************************************************************************/ FICL_WORD *ficlLookup(FICL_SYSTEM *pSys, char *name) { STRINGINFO si; SI_PSZ(si, name); return dictLookup(pSys->dp, si); } /************************************************************************** f i c l G e t D i c t ** Returns the address of the system dictionary **************************************************************************/ FICL_DICT *ficlGetDict(FICL_SYSTEM *pSys) { return pSys->dp; } /************************************************************************** f i c l G e t E n v ** Returns the address of the system environment space **************************************************************************/ FICL_DICT *ficlGetEnv(FICL_SYSTEM *pSys) { return pSys->envp; } /************************************************************************** f i c l S e t E n v ** Create an environment variable with a one-CELL payload. ficlSetEnvD ** makes one with a two-CELL payload. 
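As noted above, ficlExecXT() exists so that C code can run a word given its dictionary address (xt); paired with ficlLookup() from this file, the usual pattern looks like the sketch below (illustrative only, helper name made up):

/*
 * Sketch: execute an existing word from C.  VM_INNEREXIT is the
 * normal completion code for ficlExecXT(), per the comments above.
 */
static int
run_word_from_c(FICL_VM *pVM, char *name)
{
	FICL_WORD *pFW;

	pFW = ficlLookup(pVM->pSys, name);
	if (pFW == NULL)
		return (-1);		/* not in the dictionary */
	return (ficlExecXT(pVM, pFW));
}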
**************************************************************************/ void ficlSetEnv(FICL_SYSTEM *pSys, char *name, FICL_UNS value) { STRINGINFO si; FICL_WORD *pFW; FICL_DICT *envp = pSys->envp; SI_PSZ(si, name); pFW = dictLookup(envp, si); if (pFW == NULL) { dictAppendWord(envp, name, constantParen, FW_DEFAULT); dictAppendCell(envp, LVALUEtoCELL(value)); } else { pFW->param[0] = LVALUEtoCELL(value); } return; } void ficlSetEnvD(FICL_SYSTEM *pSys, char *name, FICL_UNS hi, FICL_UNS lo) { FICL_WORD *pFW; STRINGINFO si; FICL_DICT *envp = pSys->envp; SI_PSZ(si, name); pFW = dictLookup(envp, si); if (pFW == NULL) { dictAppendWord(envp, name, twoConstParen, FW_DEFAULT); dictAppendCell(envp, LVALUEtoCELL(lo)); dictAppendCell(envp, LVALUEtoCELL(hi)); } else { pFW->param[0] = LVALUEtoCELL(lo); pFW->param[1] = LVALUEtoCELL(hi); } return; } /************************************************************************** f i c l G e t L o c ** Returns the address of the system locals dictionary. This dict is ** only used during compilation, and is shared by all VMs. **************************************************************************/ #if FICL_WANT_LOCALS FICL_DICT *ficlGetLoc(FICL_SYSTEM *pSys) { return pSys->localp; } #endif /************************************************************************** f i c l S e t S t a c k S i z e ** Set the stack sizes (return and parameter) to be used for all ** subsequently created VMs. Returns actual stack size to be used. **************************************************************************/ int ficlSetStackSize(int nStackCells) { if (nStackCells >= FICL_DEFAULT_STACK) defaultStack = nStackCells; else defaultStack = FICL_DEFAULT_STACK; return defaultStack; } /************************************************************************** f i c l T e r m S y s t e m ** Tear the system down by deleting the dictionaries and all VMs. ** This saves you from having to keep track of all that stuff. **************************************************************************/ void ficlTermSystem(FICL_SYSTEM *pSys) { if (pSys->dp) dictDelete(pSys->dp); pSys->dp = NULL; if (pSys->envp) dictDelete(pSys->envp); pSys->envp = NULL; #if FICL_WANT_LOCALS if (pSys->localp) dictDelete(pSys->localp); pSys->localp = NULL; #endif while (pSys->vmList != NULL) { FICL_VM *pVM = pSys->vmList; pSys->vmList = pSys->vmList->link; vmDelete(pVM); } ficlFree(pSys); pSys = NULL; return; } /************************************************************************** f i c l S e t V e r s i o n E n v ** Create a double cell environment constant for the version ID **************************************************************************/ static void ficlSetVersionEnv(FICL_SYSTEM *pSys) { ficlSetEnvD(pSys, "ficl-version", FICL_VER_MAJOR, FICL_VER_MINOR); ficlSetEnv (pSys, "ficl-robust", FICL_ROBUST); return; } Index: head/sys/boot/kshim/bsd_kernel.c =================================================================== --- head/sys/boot/kshim/bsd_kernel.c (revision 314067) +++ head/sys/boot/kshim/bsd_kernel.c (revision 314068) @@ -1,1459 +1,1459 @@ /* $FreeBSD$ */ /*- * Copyright (c) 2013 Hans Petter Selasky. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include struct usb_process usb_process[USB_PROC_MAX]; static device_t usb_pci_root; /*------------------------------------------------------------------------* * Implementation of mutex API *------------------------------------------------------------------------*/ struct mtx Giant; int (*bus_alloc_resource_any_cb)(struct resource *res, device_t dev, int type, int *rid, unsigned int flags); int (*ofw_bus_status_ok_cb)(device_t dev); int (*ofw_bus_is_compatible_cb)(device_t dev, char *name); static void mtx_system_init(void *arg) { mtx_init(&Giant, "Giant", NULL, MTX_DEF | MTX_RECURSE); } SYSINIT(mtx_system_init, SI_SUB_LOCK, SI_ORDER_MIDDLE, mtx_system_init, NULL); int bus_dma_tag_create(bus_dma_tag_t parent, bus_size_t alignment, bus_size_t boundary, bus_addr_t lowaddr, bus_addr_t highaddr, bus_dma_filter_t *filter, void *filterarg, bus_size_t maxsize, int nsegments, bus_size_t maxsegsz, int flags, bus_dma_lock_t *lockfunc, void *lockfuncarg, bus_dma_tag_t *dmat) { struct bus_dma_tag *ret; ret = malloc(sizeof(struct bus_dma_tag), XXX, XXX); if (*dmat == NULL) return (ENOMEM); ret->alignment = alignment; ret->maxsize = maxsize; *dmat = ret; return (0); } int bus_dmamem_alloc(bus_dma_tag_t dmat, void** vaddr, int flags, bus_dmamap_t *mapp) { void *addr; addr = malloc(dmat->maxsize + dmat->alignment, XXX, XXX); - if (addr == 0) + if (addr == NULL) return (ENOMEM); *mapp = addr; addr = (void*)(((uintptr_t)addr + dmat->alignment - 1) & ~(dmat->alignment - 1)); *vaddr = addr; return (0); } int bus_dmamap_load(bus_dma_tag_t dmat, bus_dmamap_t map, void *buf, bus_size_t buflen, bus_dmamap_callback_t *callback, void *callback_arg, int flags) { bus_dma_segment_t segs[1]; segs[0].ds_addr = (uintptr_t)buf; segs[0].ds_len = buflen; (*callback)(callback_arg, segs, 1, 0); return (0); } void bus_dmamem_free(bus_dma_tag_t dmat, void *vaddr, bus_dmamap_t map) { free(map, XXX); } int bus_dma_tag_destroy(bus_dma_tag_t dmat) { free(dmat, XXX); return (0); } struct resource * bus_alloc_resource_any(device_t dev, int type, int *rid, unsigned int flags) { struct resource *res; int ret = EINVAL; res = malloc(sizeof(*res), XXX, XXX); if (res == NULL) return (NULL); res->__r_i = malloc(sizeof(struct resource_i), XXX, XXX); if (res->__r_i == NULL) { free(res, XXX); return (NULL); } if (bus_alloc_resource_any_cb != NULL) ret = (*bus_alloc_resource_any_cb)(res, dev, type, rid, flags); if (ret == 0) return (res); free(res->__r_i, XXX); free(res, XXX); return (NULL); } int bus_alloc_resources(device_t dev, struct resource_spec *rs, struct resource **res) { int i; for 
(i = 0; rs[i].type != -1; i++) res[i] = NULL; for (i = 0; rs[i].type != -1; i++) { res[i] = bus_alloc_resource_any(dev, rs[i].type, &rs[i].rid, rs[i].flags); if (res[i] == NULL && !(rs[i].flags & RF_OPTIONAL)) { bus_release_resources(dev, rs, res); return (ENXIO); } } return (0); } void bus_release_resources(device_t dev, const struct resource_spec *rs, struct resource **res) { int i; for (i = 0; rs[i].type != -1; i++) if (res[i] != NULL) { bus_release_resource( dev, rs[i].type, rs[i].rid, res[i]); res[i] = NULL; } } int bus_setup_intr(device_t dev, struct resource *r, int flags, driver_filter_t filter, driver_intr_t handler, void *arg, void **cookiep) { dev->dev_irq_filter = filter; dev->dev_irq_fn = handler; dev->dev_irq_arg = arg; return (0); } int bus_teardown_intr(device_t dev, struct resource *r, void *cookie) { dev->dev_irq_filter = NULL; dev->dev_irq_fn = NULL; dev->dev_irq_arg = NULL; return (0); } int bus_release_resource(device_t dev, int type, int rid, struct resource *r) { /* Resource releasing is not supported */ return (EINVAL); } int bus_generic_attach(device_t dev) { device_t child; TAILQ_FOREACH(child, &dev->dev_children, dev_link) { device_probe_and_attach(child); } return (0); } bus_space_tag_t rman_get_bustag(struct resource *r) { return (r->r_bustag); } bus_space_handle_t rman_get_bushandle(struct resource *r) { return (r->r_bushandle); } u_long rman_get_size(struct resource *r) { return (r->__r_i->r_end - r->__r_i->r_start + 1); } int ofw_bus_status_okay(device_t dev) { if (ofw_bus_status_ok_cb == NULL) return (0); return ((*ofw_bus_status_ok_cb)(dev)); } int ofw_bus_is_compatible(device_t dev, char *name) { if (ofw_bus_is_compatible_cb == NULL) return (0); return ((*ofw_bus_is_compatible_cb)(dev, name)); } void mtx_init(struct mtx *mtx, const char *name, const char *type, int opt) { mtx->owned = 0; mtx->parent = mtx; } void mtx_lock(struct mtx *mtx) { mtx = mtx->parent; mtx->owned++; } void mtx_unlock(struct mtx *mtx) { mtx = mtx->parent; mtx->owned--; } int mtx_owned(struct mtx *mtx) { mtx = mtx->parent; return (mtx->owned != 0); } void mtx_destroy(struct mtx *mtx) { /* NOP */ } /*------------------------------------------------------------------------* * Implementation of shared/exclusive mutex API *------------------------------------------------------------------------*/ void sx_init_flags(struct sx *sx, const char *name, int flags) { sx->owned = 0; } void sx_destroy(struct sx *sx) { /* NOP */ } void sx_xlock(struct sx *sx) { sx->owned++; } void sx_xunlock(struct sx *sx) { sx->owned--; } int sx_xlocked(struct sx *sx) { return (sx->owned != 0); } /*------------------------------------------------------------------------* * Implementaiton of condition variable API *------------------------------------------------------------------------*/ void cv_init(struct cv *cv, const char *desc) { cv->sleeping = 0; } void cv_destroy(struct cv *cv) { /* NOP */ } void cv_wait(struct cv *cv, struct mtx *mtx) { cv_timedwait(cv, mtx, -1); } int cv_timedwait(struct cv *cv, struct mtx *mtx, int timo) { int start = ticks; int delta; int time = 0; if (cv->sleeping) return (EWOULDBLOCK); /* not allowed */ cv->sleeping = 1; while (cv->sleeping) { if (timo >= 0) { delta = ticks - start; if (delta >= timo || delta < 0) break; } mtx_unlock(mtx); usb_idle(); if (++time >= (1000000 / hz)) { time = 0; callout_process(1); } /* Sleep for 1 us */ delay(1); mtx_lock(mtx); } if (cv->sleeping) { cv->sleeping = 0; return (EWOULDBLOCK); /* not allowed */ } return (0); } void cv_signal(struct cv *cv) 
{ cv->sleeping = 0; } void cv_broadcast(struct cv *cv) { cv->sleeping = 0; } /*------------------------------------------------------------------------* * Implementation of callout API *------------------------------------------------------------------------*/ static void callout_proc_msg(struct usb_proc_msg *); volatile int ticks = 0; static LIST_HEAD(, callout) head_callout = LIST_HEAD_INITIALIZER(&head_callout); static struct mtx mtx_callout; static struct usb_proc_msg callout_msg[2]; static void callout_system_init(void *arg) { mtx_init(&mtx_callout, "callout-mtx", NULL, MTX_DEF | MTX_RECURSE); callout_msg[0].pm_callback = &callout_proc_msg; callout_msg[1].pm_callback = &callout_proc_msg; } SYSINIT(callout_system_init, SI_SUB_LOCK, SI_ORDER_MIDDLE, callout_system_init, NULL); static void callout_callback(struct callout *c) { mtx_lock(c->mtx); mtx_lock(&mtx_callout); if (c->entry.le_prev != NULL) { LIST_REMOVE(c, entry); c->entry.le_prev = NULL; } mtx_unlock(&mtx_callout); if (c->c_func != NULL) (c->c_func) (c->c_arg); if (!(c->flags & CALLOUT_RETURNUNLOCKED)) mtx_unlock(c->mtx); } void callout_process(int timeout) { ticks += timeout; usb_proc_msignal(usb_process + 2, &callout_msg[0], &callout_msg[1]); } static void callout_proc_msg(struct usb_proc_msg *pmsg) { struct callout *c; int delta; repeat: mtx_lock(&mtx_callout); LIST_FOREACH(c, &head_callout, entry) { delta = c->timeout - ticks; if (delta < 0) { mtx_unlock(&mtx_callout); callout_callback(c); goto repeat; } } mtx_unlock(&mtx_callout); } void callout_init_mtx(struct callout *c, struct mtx *mtx, int flags) { memset(c, 0, sizeof(*c)); if (mtx == NULL) mtx = &Giant; c->mtx = mtx; c->flags = (flags & CALLOUT_RETURNUNLOCKED); } void callout_reset(struct callout *c, int to_ticks, void (*func) (void *), void *arg) { callout_stop(c); c->c_func = func; c->c_arg = arg; c->timeout = ticks + to_ticks; mtx_lock(&mtx_callout); LIST_INSERT_HEAD(&head_callout, c, entry); mtx_unlock(&mtx_callout); } void callout_stop(struct callout *c) { mtx_lock(&mtx_callout); if (c->entry.le_prev != NULL) { LIST_REMOVE(c, entry); c->entry.le_prev = NULL; } mtx_unlock(&mtx_callout); c->c_func = NULL; c->c_arg = NULL; } void callout_drain(struct callout *c) { if (c->mtx == NULL) return; /* not initialised */ mtx_lock(c->mtx); callout_stop(c); mtx_unlock(c->mtx); } int callout_pending(struct callout *c) { int retval; mtx_lock(&mtx_callout); retval = (c->entry.le_prev != NULL); mtx_unlock(&mtx_callout); return (retval); } /*------------------------------------------------------------------------* * Implementation of device API *------------------------------------------------------------------------*/ static const char unknown_string[] = { "unknown" }; static TAILQ_HEAD(, module_data) module_head = TAILQ_HEAD_INITIALIZER(module_head); static uint8_t devclass_equal(const char *a, const char *b) { char ta, tb; if (a == b) return (1); while (1) { ta = *a; tb = *b; if (ta != tb) return (0); if (ta == 0) break; a++; b++; } return (1); } int bus_generic_resume(device_t dev) { return (0); } int bus_generic_shutdown(device_t dev) { return (0); } int bus_generic_suspend(device_t dev) { return (0); } int bus_generic_print_child(device_t dev, device_t child) { return (0); } void bus_generic_driver_added(device_t dev, driver_t *driver) { return; } device_t device_get_parent(device_t dev) { return (dev ? 
dev->dev_parent : NULL); } void device_set_interrupt(device_t dev, driver_filter_t *filter, driver_intr_t *fn, void *arg) { dev->dev_irq_filter = filter; dev->dev_irq_fn = fn; dev->dev_irq_arg = arg; } void device_run_interrupts(device_t parent) { device_t child; if (parent == NULL) return; TAILQ_FOREACH(child, &parent->dev_children, dev_link) { int status; if (child->dev_irq_filter != NULL) status = child->dev_irq_filter(child->dev_irq_arg); else status = FILTER_SCHEDULE_THREAD; if (status == FILTER_SCHEDULE_THREAD) { if (child->dev_irq_fn != NULL) (child->dev_irq_fn) (child->dev_irq_arg); } } } void device_set_ivars(device_t dev, void *ivars) { dev->dev_aux = ivars; } void * device_get_ivars(device_t dev) { return (dev ? dev->dev_aux : NULL); } int device_get_unit(device_t dev) { return (dev ? dev->dev_unit : 0); } int bus_generic_detach(device_t dev) { device_t child; int error; if (!dev->dev_attached) return (EBUSY); TAILQ_FOREACH(child, &dev->dev_children, dev_link) { if ((error = device_detach(child)) != 0) return (error); } return (0); } const char * device_get_nameunit(device_t dev) { if (dev && dev->dev_nameunit[0]) return (dev->dev_nameunit); return (unknown_string); } static uint8_t devclass_create(devclass_t *dc_pp) { if (dc_pp == NULL) { return (1); } if (dc_pp[0] == NULL) { dc_pp[0] = malloc(sizeof(**(dc_pp)), M_DEVBUF, M_WAITOK | M_ZERO); if (dc_pp[0] == NULL) { return (1); } } return (0); } static const struct module_data * devclass_find_create(const char *classname) { const struct module_data *mod; TAILQ_FOREACH(mod, &module_head, entry) { if (devclass_equal(mod->mod_name, classname)) { if (devclass_create(mod->devclass_pp)) { continue; } return (mod); } } return (NULL); } static uint8_t devclass_add_device(const struct module_data *mod, device_t dev) { device_t *pp_dev; device_t *end; uint8_t unit; pp_dev = mod->devclass_pp[0]->dev_list; end = pp_dev + DEVCLASS_MAXUNIT; unit = 0; while (pp_dev != end) { if (*pp_dev == NULL) { *pp_dev = dev; dev->dev_unit = unit; dev->dev_module = mod; snprintf(dev->dev_nameunit, sizeof(dev->dev_nameunit), "%s%d", device_get_name(dev), unit); return (0); } pp_dev++; unit++; } DPRINTF("Could not add device to devclass.\n"); return (1); } static void devclass_delete_device(const struct module_data *mod, device_t dev) { if (mod == NULL) { return; } mod->devclass_pp[0]->dev_list[dev->dev_unit] = NULL; dev->dev_module = NULL; } static device_t make_device(device_t parent, const char *name) { device_t dev = NULL; const struct module_data *mod = NULL; if (name) { mod = devclass_find_create(name); if (!mod) { DPRINTF("%s:%d:%s: can't find device " "class %s\n", __FILE__, __LINE__, __FUNCTION__, name); goto done; } } dev = malloc(sizeof(*dev), M_DEVBUF, M_WAITOK | M_ZERO); if (dev == NULL) goto done; dev->dev_parent = parent; TAILQ_INIT(&dev->dev_children); if (name) { dev->dev_fixed_class = 1; if (devclass_add_device(mod, dev)) { goto error; } } done: return (dev); error: if (dev) { free(dev, M_DEVBUF); } return (NULL); } device_t device_add_child(device_t dev, const char *name, int unit) { device_t child; if (unit != -1) { device_printf(dev, "Unit is not -1\n"); } child = make_device(dev, name); if (child == NULL) { device_printf(dev, "Could not add child '%s'\n", name); goto done; } if (dev == NULL) { /* no parent */ goto done; } TAILQ_INSERT_TAIL(&dev->dev_children, child, dev_link); done: return (child); } int device_delete_child(device_t dev, device_t child) { int error = 0; device_t grandchild; /* detach parent before deleting children, if 
any */ error = device_detach(child); if (error) goto done; /* remove children second */ while ((grandchild = TAILQ_FIRST(&child->dev_children))) { error = device_delete_child(child, grandchild); if (error) { device_printf(dev, "Error deleting child!\n"); goto done; } } devclass_delete_device(child->dev_module, child); if (dev != NULL) { /* remove child from parent */ TAILQ_REMOVE(&dev->dev_children, child, dev_link); } free(child, M_DEVBUF); done: return (error); } int device_delete_children(device_t dev) { device_t child; int error = 0; while ((child = TAILQ_FIRST(&dev->dev_children))) { error = device_delete_child(dev, child); if (error) { device_printf(dev, "Error deleting child!\n"); break; } } return (error); } void device_quiet(device_t dev) { dev->dev_quiet = 1; } const char * device_get_desc(device_t dev) { if (dev) return &(dev->dev_desc[0]); return (unknown_string); } static int default_method(void) { /* do nothing */ DPRINTF("Default method called\n"); return (0); } void * device_get_method(device_t dev, const char *what) { const struct device_method *mtod; mtod = dev->dev_module->driver->methods; while (mtod->func != NULL) { if (devclass_equal(mtod->desc, what)) { return (mtod->func); } mtod++; } return ((void *)&default_method); } const char * device_get_name(device_t dev) { if (dev == NULL) return (unknown_string); return (dev->dev_module->driver->name); } static int device_allocate_softc(device_t dev) { const struct module_data *mod; mod = dev->dev_module; if ((dev->dev_softc_alloc == 0) && (mod->driver->size != 0)) { dev->dev_sc = malloc(mod->driver->size, M_DEVBUF, M_WAITOK | M_ZERO); if (dev->dev_sc == NULL) return (ENOMEM); dev->dev_softc_alloc = 1; } return (0); } int device_probe_and_attach(device_t dev) { const struct module_data *mod; const char *bus_name_parent; bus_name_parent = device_get_name(device_get_parent(dev)); if (dev->dev_attached) return (0); /* fail-safe */ if (dev->dev_fixed_class) { mod = dev->dev_module; if (DEVICE_PROBE(dev) <= 0) { if (device_allocate_softc(dev) == 0) { if (DEVICE_ATTACH(dev) == 0) { /* success */ dev->dev_attached = 1; return (0); } } } device_detach(dev); goto error; } /* * Else find a module for our device, if any */ TAILQ_FOREACH(mod, &module_head, entry) { if (devclass_equal(mod->bus_name, bus_name_parent)) { if (devclass_create(mod->devclass_pp)) { continue; } if (devclass_add_device(mod, dev)) { continue; } if (DEVICE_PROBE(dev) <= 0) { if (device_allocate_softc(dev) == 0) { if (DEVICE_ATTACH(dev) == 0) { /* success */ dev->dev_attached = 1; return (0); } } } /* else try next driver */ device_detach(dev); } } error: return (ENODEV); } int device_detach(device_t dev) { const struct module_data *mod = dev->dev_module; int error; if (dev->dev_attached) { error = DEVICE_DETACH(dev); if (error) { return error; } dev->dev_attached = 0; } device_set_softc(dev, NULL); if (dev->dev_fixed_class == 0) devclass_delete_device(mod, dev); return (0); } void device_set_softc(device_t dev, void *softc) { if (dev->dev_softc_alloc) { free(dev->dev_sc, M_DEVBUF); dev->dev_sc = NULL; } dev->dev_sc = softc; dev->dev_softc_alloc = 0; } void * device_get_softc(device_t dev) { if (dev == NULL) return (NULL); return (dev->dev_sc); } int device_is_attached(device_t dev) { return (dev->dev_attached); } void device_set_desc(device_t dev, const char *desc) { snprintf(dev->dev_desc, sizeof(dev->dev_desc), "%s", desc); } void device_set_desc_copy(device_t dev, const char *desc) { device_set_desc(dev, desc); } void * devclass_get_softc(devclass_t dc, int 
unit) { return (device_get_softc(devclass_get_device(dc, unit))); } int devclass_get_maxunit(devclass_t dc) { int max_unit = 0; if (dc) { max_unit = DEVCLASS_MAXUNIT; while (max_unit--) { if (dc->dev_list[max_unit]) { break; } } max_unit++; } return (max_unit); } device_t devclass_get_device(devclass_t dc, int unit) { return (((unit < 0) || (unit >= DEVCLASS_MAXUNIT) || (dc == NULL)) ? NULL : dc->dev_list[unit]); } devclass_t devclass_find(const char *classname) { const struct module_data *mod; TAILQ_FOREACH(mod, &module_head, entry) { if (devclass_equal(mod->driver->name, classname)) return (mod->devclass_pp[0]); } return (NULL); } void module_register(void *data) { struct module_data *mdata = data; TAILQ_INSERT_TAIL(&module_head, mdata, entry); } /*------------------------------------------------------------------------* * System startup *------------------------------------------------------------------------*/ static void sysinit_run(const void **ppdata) { const struct sysinit *psys; while ((psys = *ppdata) != NULL) { (psys->func) (psys->data); ppdata++; } } /*------------------------------------------------------------------------* * USB process API *------------------------------------------------------------------------*/ static int usb_do_process(struct usb_process *); static int usb_proc_level = -1; static struct mtx usb_proc_mtx; void usb_idle(void) { int old_level = usb_proc_level; int old_giant = Giant.owned; int worked; device_run_interrupts(usb_pci_root); do { worked = 0; Giant.owned = 0; while (++usb_proc_level < USB_PROC_MAX) worked |= usb_do_process(usb_process + usb_proc_level); usb_proc_level = old_level; Giant.owned = old_giant; } while (worked); } void usb_init(void) { sysinit_run(sysinit_data); } void usb_uninit(void) { sysinit_run(sysuninit_data); } static void usb_process_init_sub(struct usb_process *up) { TAILQ_INIT(&up->up_qhead); cv_init(&up->up_cv, "-"); cv_init(&up->up_drain, "usbdrain"); up->up_mtx = &usb_proc_mtx; } static void usb_process_init(void *arg) { uint8_t x; mtx_init(&usb_proc_mtx, "usb-proc-mtx", NULL, MTX_DEF | MTX_RECURSE); for (x = 0; x != USB_PROC_MAX; x++) usb_process_init_sub(&usb_process[x]); } SYSINIT(usb_process_init, SI_SUB_LOCK, SI_ORDER_MIDDLE, usb_process_init, NULL); static int usb_do_process(struct usb_process *up) { struct usb_proc_msg *pm; int worked = 0; mtx_lock(&usb_proc_mtx); repeat: pm = TAILQ_FIRST(&up->up_qhead); if (pm != NULL) { worked = 1; (pm->pm_callback) (pm); if (pm == TAILQ_FIRST(&up->up_qhead)) { /* nothing changed */ TAILQ_REMOVE(&up->up_qhead, pm, pm_qentry); pm->pm_qentry.tqe_prev = NULL; } goto repeat; } mtx_unlock(&usb_proc_mtx); return (worked); } void * usb_proc_msignal(struct usb_process *up, void *_pm0, void *_pm1) { struct usb_proc_msg *pm0 = _pm0; struct usb_proc_msg *pm1 = _pm1; struct usb_proc_msg *pm2; usb_size_t d; uint8_t t; t = 0; if (pm0->pm_qentry.tqe_prev) { t |= 1; } if (pm1->pm_qentry.tqe_prev) { t |= 2; } if (t == 0) { /* * No entries are queued. Queue "pm0" and use the existing * message number. */ pm2 = pm0; } else if (t == 1) { /* Check if we need to increment the message number. */ if (pm0->pm_num == up->up_msg_num) { up->up_msg_num++; } pm2 = pm1; } else if (t == 2) { /* Check if we need to increment the message number. */ if (pm1->pm_num == up->up_msg_num) { up->up_msg_num++; } pm2 = pm0; } else if (t == 3) { /* * Both entries are queued. Re-queue the entry closest to * the end. 
*/ d = (pm1->pm_num - pm0->pm_num); /* Check sign after subtraction */ if (d & 0x80000000) { pm2 = pm0; } else { pm2 = pm1; } TAILQ_REMOVE(&up->up_qhead, pm2, pm_qentry); } else { pm2 = NULL; /* panic - should not happen */ } /* Put message last on queue */ pm2->pm_num = up->up_msg_num; TAILQ_INSERT_TAIL(&up->up_qhead, pm2, pm_qentry); return (pm2); } /*------------------------------------------------------------------------* * usb_proc_is_gone * * Return values: * 0: USB process is running * Else: USB process is tearing down *------------------------------------------------------------------------*/ uint8_t usb_proc_is_gone(struct usb_process *up) { return (0); } /*------------------------------------------------------------------------* * usb_proc_mwait * * This function will return when the USB process message pointed to * by "pm" is no longer on a queue. This function must be called * having "usb_proc_mtx" locked. *------------------------------------------------------------------------*/ void usb_proc_mwait(struct usb_process *up, void *_pm0, void *_pm1) { struct usb_proc_msg *pm0 = _pm0; struct usb_proc_msg *pm1 = _pm1; /* Just remove the messages from the queue. */ if (pm0->pm_qentry.tqe_prev) { TAILQ_REMOVE(&up->up_qhead, pm0, pm_qentry); pm0->pm_qentry.tqe_prev = NULL; } if (pm1->pm_qentry.tqe_prev) { TAILQ_REMOVE(&up->up_qhead, pm1, pm_qentry); pm1->pm_qentry.tqe_prev = NULL; } } /*------------------------------------------------------------------------* * SYSTEM attach *------------------------------------------------------------------------*/ #ifdef USB_PCI_PROBE_LIST static device_method_t pci_methods[] = { DEVMETHOD_END }; static driver_t pci_driver = { .name = "pci", .methods = pci_methods, }; static devclass_t pci_devclass; DRIVER_MODULE(pci, pci, pci_driver, pci_devclass, 0, 0); static const char *usb_pci_devices[] = { USB_PCI_PROBE_LIST }; #define USB_PCI_USB_MAX (sizeof(usb_pci_devices) / sizeof(void *)) static device_t usb_pci_dev[USB_PCI_USB_MAX]; static void usb_pci_mod_load(void *arg) { uint32_t x; usb_pci_root = device_add_child(NULL, "pci", -1); if (usb_pci_root == NULL) return; for (x = 0; x != USB_PCI_USB_MAX; x++) { usb_pci_dev[x] = device_add_child(usb_pci_root, usb_pci_devices[x], -1); if (usb_pci_dev[x] == NULL) continue; if (device_probe_and_attach(usb_pci_dev[x])) { device_printf(usb_pci_dev[x], "WARNING: Probe and attach failed!\n"); } } } SYSINIT(usb_pci_mod_load, SI_SUB_RUN_SCHEDULER, SI_ORDER_MIDDLE, usb_pci_mod_load, 0); static void usb_pci_mod_unload(void *arg) { uint32_t x; for (x = 0; x != USB_PCI_USB_MAX; x++) { if (usb_pci_dev[x]) { device_detach(usb_pci_dev[x]); device_delete_child(usb_pci_root, usb_pci_dev[x]); } } if (usb_pci_root) device_delete_child(NULL, usb_pci_root); } SYSUNINIT(usb_pci_mod_unload, SI_SUB_RUN_SCHEDULER, SI_ORDER_MIDDLE, usb_pci_mod_unload, 0); #endif /*------------------------------------------------------------------------* * MALLOC API *------------------------------------------------------------------------*/ #ifndef HAVE_MALLOC #define USB_POOL_ALIGN 8 static uint8_t usb_pool[USB_POOL_SIZE] __aligned(USB_POOL_ALIGN); static uint32_t usb_pool_rem = USB_POOL_SIZE; static uint32_t usb_pool_entries; struct malloc_hdr { TAILQ_ENTRY(malloc_hdr) entry; uint32_t size; } __aligned(USB_POOL_ALIGN); static TAILQ_HEAD(, malloc_hdr) malloc_head = TAILQ_HEAD_INITIALIZER(malloc_head); void * usb_malloc(unsigned long size) { struct malloc_hdr *hdr; size = (size + USB_POOL_ALIGN - 1) & ~(USB_POOL_ALIGN - 1); size += sizeof(struct 
malloc_hdr); TAILQ_FOREACH(hdr, &malloc_head, entry) { if (hdr->size == size) break; } if (hdr) { DPRINTF("MALLOC: Entries = %d; Remainder = %d; Size = %d\n", (int)usb_pool_entries, (int)usb_pool_rem, (int)size); TAILQ_REMOVE(&malloc_head, hdr, entry); memset(hdr + 1, 0, hdr->size - sizeof(*hdr)); return (hdr + 1); } if (usb_pool_rem >= size) { hdr = (void *)(usb_pool + USB_POOL_SIZE - usb_pool_rem); hdr->size = size; usb_pool_rem -= size; usb_pool_entries++; DPRINTF("MALLOC: Entries = %d; Remainder = %d; Size = %d\n", (int)usb_pool_entries, (int)usb_pool_rem, (int)size); memset(hdr + 1, 0, hdr->size - sizeof(*hdr)); return (hdr + 1); } return (NULL); } void usb_free(void *arg) { struct malloc_hdr *hdr; if (arg == NULL) return; hdr = arg; hdr--; TAILQ_INSERT_TAIL(&malloc_head, hdr, entry); } #endif char * usb_strdup(const char *str) { char *tmp; int len; len = 1 + strlen(str); tmp = malloc(len,XXX,XXX); if (tmp == NULL) return (NULL); memcpy(tmp, str, len); return (tmp); } Index: head/sys/boot/ofw/libofw/ofw_memory.c =================================================================== --- head/sys/boot/ofw/libofw/ofw_memory.c (revision 314067) +++ head/sys/boot/ofw/libofw/ofw_memory.c (revision 314068) @@ -1,146 +1,146 @@ /*- * Copyright (c) 2001 Benno Rice * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
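/*
 * Illustrative aside, not from the repository: usb_malloc() above rounds
 * every request up to USB_POOL_ALIGN with the usual power-of-two mask
 * before prepending its malloc_hdr and carving the block from the static
 * pool. The round-up by itself, as a standalone sketch:
 */
#include <assert.h>
#include <stddef.h>

static size_t
round_up_pow2(size_t size, size_t align)
{
        /* "align" must be a power of two for the mask to be valid. */
        return ((size + align - 1) & ~(align - 1));
}

int
main(void)
{
        assert(round_up_pow2(1, 8) == 8);
        assert(round_up_pow2(8, 8) == 8);
        assert(round_up_pow2(9, 8) == 16);
        return (0);
}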
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include "libofw.h" #include "openfirm.h" -static void *heap_base = 0; +static void *heap_base = NULL; static unsigned int heap_size = 0; struct ofw_mapping { vm_offset_t va; int len; vm_offset_t pa; int mode; }; struct ofw_mapping2 { vm_offset_t va; int len; vm_offset_t pa_hi; vm_offset_t pa_lo; int mode; }; void ofw_memmap(int acells) { struct ofw_mapping *mapptr; struct ofw_mapping2 *mapptr2; phandle_t mmup; int nmapping, i; u_char mappings[256 * sizeof(struct ofw_mapping2)]; char lbuf[80]; mmup = OF_instance_to_package(mmu); bzero(mappings, sizeof(mappings)); nmapping = OF_getprop(mmup, "translations", mappings, sizeof(mappings)); if (nmapping == -1) { printf("Could not get memory map (%d)\n", nmapping); return; } pager_open(); if (acells == 1) { nmapping /= sizeof(struct ofw_mapping); mapptr = (struct ofw_mapping *) mappings; printf("%17s\t%17s\t%8s\t%6s\n", "Virtual Range", "Physical Range", "#Pages", "Mode"); for (i = 0; i < nmapping; i++) { sprintf(lbuf, "%08x-%08x\t%08x-%08x\t%8d\t%6x\n", mapptr[i].va, mapptr[i].va + mapptr[i].len, mapptr[i].pa, mapptr[i].pa + mapptr[i].len, mapptr[i].len / 0x1000, mapptr[i].mode); if (pager_output(lbuf)) break; } } else { nmapping /= sizeof(struct ofw_mapping2); mapptr2 = (struct ofw_mapping2 *) mappings; printf("%17s\t%17s\t%8s\t%6s\n", "Virtual Range", "Physical Range", "#Pages", "Mode"); for (i = 0; i < nmapping; i++) { sprintf(lbuf, "%08x-%08x\t%08x-%08x\t%8d\t%6x\n", mapptr2[i].va, mapptr2[i].va + mapptr2[i].len, mapptr2[i].pa_lo, mapptr2[i].pa_lo + mapptr2[i].len, mapptr2[i].len / 0x1000, mapptr2[i].mode); if (pager_output(lbuf)) break; } } pager_close(); } void * ofw_alloc_heap(unsigned int size) { phandle_t memoryp, root; cell_t available[4]; cell_t acells; root = OF_finddevice("/"); acells = 1; OF_getprop(root, "#address-cells", &acells, sizeof(acells)); memoryp = OF_instance_to_package(memory); OF_getprop(memoryp, "available", available, sizeof(available)); heap_base = OF_claim((void *)available[acells-1], size, sizeof(register_t)); if (heap_base != (void *)-1) { heap_size = size; } return (heap_base); } void ofw_release_heap(void) { OF_release(heap_base, heap_size); } Index: head/sys/boot/sparc64/loader/main.c =================================================================== --- head/sys/boot/sparc64/loader/main.c (revision 314067) +++ head/sys/boot/sparc64/loader/main.c (revision 314068) @@ -1,992 +1,992 @@ /*- * Initial implementation: * Copyright (c) 2001 Robert Drehmel * All rights reserved. * * As long as the above copyright statement and this notice remain * unchanged, you can do what ever you want with this file. */ /*- * Copyright (c) 2008 - 2012 Marius Strobl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * FreeBSD/sparc64 kernel loader - machine dependent part * * - implements copyin and readin functions that map kernel * pages on demand. The machine independent code does not * know the size of the kernel early enough to pre-enter * TTEs and install just one 4MB mapping seemed to limiting * to me. */ #include #include #include #include #include #include #ifdef LOADER_ZFS_SUPPORT #include #include "../zfs/libzfs.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bootstrap.h" #include "libofw.h" #include "dev_net.h" extern char bootprog_info[]; enum { HEAPVA = 0x800000, HEAPSZ = 0x1000000, LOADSZ = 0x1000000 /* for kernel and modules */ }; /* At least Sun Fire V1280 require page sized allocations to be claimed. */ CTASSERT(HEAPSZ % PAGE_SIZE == 0); static struct mmu_ops { void (*tlb_init)(void); int (*mmu_mapin)(vm_offset_t va, vm_size_t len); } *mmu_ops; typedef void kernel_entry_t(vm_offset_t mdp, u_long o1, u_long o2, u_long o3, void *openfirmware); static inline u_long dtlb_get_data_sun4u(u_int, u_int); static int dtlb_enter_sun4u(u_int, u_long data, vm_offset_t); static vm_offset_t dtlb_va_to_pa_sun4u(vm_offset_t); static inline u_long itlb_get_data_sun4u(u_int, u_int); static int itlb_enter_sun4u(u_int, u_long data, vm_offset_t); static vm_offset_t itlb_va_to_pa_sun4u(vm_offset_t); static void itlb_relocate_locked0_sun4u(void); extern vm_offset_t md_load(char *, vm_offset_t *, vm_offset_t *); static int sparc64_autoload(void); static ssize_t sparc64_readin(const int, vm_offset_t, const size_t); static ssize_t sparc64_copyin(const void *, vm_offset_t, size_t); static vm_offset_t claim_virt(vm_offset_t, size_t, int); static vm_offset_t alloc_phys(size_t, int); static int map_phys(int, size_t, vm_offset_t, vm_offset_t); static void release_phys(vm_offset_t, u_int); static int __elfN(exec)(struct preloaded_file *); static int mmu_mapin_sun4u(vm_offset_t, vm_size_t); static vm_offset_t init_heap(void); static phandle_t find_bsp_sun4u(phandle_t, uint32_t); const char *cpu_cpuid_prop_sun4u(void); uint32_t cpu_get_mid_sun4u(void); static void tlb_init_sun4u(void); #ifdef LOADER_DEBUG typedef u_int64_t tte_t; static void pmap_print_tlb_sun4u(void); static void pmap_print_tte_sun4u(tte_t, tte_t); #endif static struct mmu_ops mmu_ops_sun4u = { tlb_init_sun4u, mmu_mapin_sun4u }; /* sun4u */ struct tlb_entry *dtlb_store; struct tlb_entry *itlb_store; u_int dtlb_slot; u_int itlb_slot; static int cpu_impl; static u_int dtlb_slot_max; static u_int itlb_slot_max; static u_int tlb_locked; static vm_offset_t curkva = 0; static vm_offset_t heapva; static char bootpath[64]; static phandle_t root; #ifdef LOADER_ZFS_SUPPORT static struct zfs_devdesc zfs_currdev; #endif /* * Machine dependent structures that the machine independent * loader part uses. 
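/*
 * Illustrative aside, not part of the diff: mmu_ops above is a small
 * method table; the machine-independent loader code only calls through
 * the function pointers, and a platform is chosen by pointing mmu_ops at
 * one instance (mmu_ops_sun4u here). A toy version of the pattern, with
 * made-up names:
 */
#include <stdio.h>

struct demo_mmu_ops {
        void (*tlb_init)(void);
        int (*mmu_mapin)(unsigned long va, unsigned long len);
};

static void demo_tlb_init(void) { printf("tlb init\n"); }

static int
demo_mapin(unsigned long va, unsigned long len)
{
        printf("map va=%#lx len=%#lx\n", va, len);
        return (0);
}

static const struct demo_mmu_ops demo_ops_sun4u = { demo_tlb_init, demo_mapin };
static const struct demo_mmu_ops *demo_ops;

int
main(void)
{
        demo_ops = &demo_ops_sun4u;             /* platform selection */
        demo_ops->tlb_init();
        return (demo_ops->mmu_mapin(0x800000UL, 0x1000UL));
}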
*/ struct devsw *devsw[] = { #ifdef LOADER_DISK_SUPPORT &ofwdisk, #endif #ifdef LOADER_NET_SUPPORT &netdev, #endif #ifdef LOADER_ZFS_SUPPORT &zfs_dev, #endif NULL }; struct arch_switch archsw; static struct file_format sparc64_elf = { __elfN(loadfile), __elfN(exec) }; struct file_format *file_formats[] = { &sparc64_elf, NULL }; struct fs_ops *file_system[] = { #ifdef LOADER_ZFS_SUPPORT &zfs_fsops, #endif #ifdef LOADER_UFS_SUPPORT &ufs_fsops, #endif #ifdef LOADER_CD9660_SUPPORT &cd9660_fsops, #endif #ifdef LOADER_ZIP_SUPPORT &zipfs_fsops, #endif #ifdef LOADER_GZIP_SUPPORT &gzipfs_fsops, #endif #ifdef LOADER_BZIP2_SUPPORT &bzipfs_fsops, #endif #ifdef LOADER_NFS_SUPPORT &nfs_fsops, #endif #ifdef LOADER_TFTP_SUPPORT &tftp_fsops, #endif NULL }; struct netif_driver *netif_drivers[] = { #ifdef LOADER_NET_SUPPORT &ofwnet, #endif NULL }; extern struct console ofwconsole; struct console *consoles[] = { &ofwconsole, NULL }; #ifdef LOADER_DEBUG static int watch_phys_set_mask(vm_offset_t pa, u_long mask) { u_long lsucr; stxa(AA_DMMU_PWPR, ASI_DMMU, pa & (((2UL << 38) - 1) << 3)); lsucr = ldxa(0, ASI_LSU_CTL_REG); lsucr = ((lsucr | LSU_PW) & ~LSU_PM_MASK) | (mask << LSU_PM_SHIFT); stxa(0, ASI_LSU_CTL_REG, lsucr); return (0); } static int watch_phys_set(vm_offset_t pa, int sz) { u_long off; off = (u_long)pa & 7; /* Test for misaligned watch points. */ if (off + sz > 8) return (-1); return (watch_phys_set_mask(pa, ((1 << sz) - 1) << off)); } static int watch_virt_set_mask(vm_offset_t va, u_long mask) { u_long lsucr; stxa(AA_DMMU_VWPR, ASI_DMMU, va & (((2UL << 41) - 1) << 3)); lsucr = ldxa(0, ASI_LSU_CTL_REG); lsucr = ((lsucr | LSU_VW) & ~LSU_VM_MASK) | (mask << LSU_VM_SHIFT); stxa(0, ASI_LSU_CTL_REG, lsucr); return (0); } static int watch_virt_set(vm_offset_t va, int sz) { u_long off; off = (u_long)va & 7; /* Test for misaligned watch points. 
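/*
 * Illustrative aside, not from the repository: watch_phys_set() above
 * turns an (address, size) pair into a per-byte mask within an 8-byte
 * window and rejects ranges that would cross it. The same arithmetic,
 * standalone:
 */
#include <stdio.h>

static int
byte_mask(unsigned long addr, int sz, unsigned int *maskp)
{
        unsigned int off = addr & 7;

        if (off + sz > 8)               /* would cross the 8-byte window */
                return (-1);
        *maskp = ((1U << sz) - 1) << off;
        return (0);
}

int
main(void)
{
        unsigned int m;

        if (byte_mask(0x1003, 4, &m) == 0)
                printf("mask=%#x\n", m);        /* 0x78, bytes 3..6 */
        return (0);
}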
*/ if (off + sz > 8) return (-1); return (watch_virt_set_mask(va, ((1 << sz) - 1) << off)); } #endif /* * archsw functions */ static int sparc64_autoload(void) { return (0); } static ssize_t sparc64_readin(const int fd, vm_offset_t va, const size_t len) { mmu_ops->mmu_mapin(va, len); return (read(fd, (void *)va, len)); } static ssize_t sparc64_copyin(const void *src, vm_offset_t dest, size_t len) { mmu_ops->mmu_mapin(dest, len); memcpy((void *)dest, src, len); return (len); } /* * other MD functions */ static vm_offset_t claim_virt(vm_offset_t virt, size_t size, int align) { vm_offset_t mva; if (OF_call_method("claim", mmu, 3, 1, virt, size, align, &mva) == -1) return ((vm_offset_t)-1); return (mva); } static vm_offset_t alloc_phys(size_t size, int align) { cell_t phys_hi, phys_low; if (OF_call_method("claim", memory, 2, 2, size, align, &phys_low, &phys_hi) == -1) return ((vm_offset_t)-1); return ((vm_offset_t)phys_hi << 32 | phys_low); } static int map_phys(int mode, size_t size, vm_offset_t virt, vm_offset_t phys) { return (OF_call_method("map", mmu, 5, 0, (uint32_t)phys, (uint32_t)(phys >> 32), virt, size, mode)); } static void release_phys(vm_offset_t phys, u_int size) { (void)OF_call_method("release", memory, 3, 0, (uint32_t)phys, (uint32_t)(phys >> 32), size); } static int __elfN(exec)(struct preloaded_file *fp) { struct file_metadata *fmp; vm_offset_t mdp, dtbp; Elf_Addr entry; Elf_Ehdr *e; int error; if ((fmp = file_findmetadata(fp, MODINFOMD_ELFHDR)) == 0) return (EFTYPE); e = (Elf_Ehdr *)&fmp->md_data; if ((error = md_load(fp->f_args, &mdp, &dtbp)) != 0) return (error); printf("jumping to kernel entry at %#lx.\n", e->e_entry); #ifdef LOADER_DEBUG pmap_print_tlb_sun4u(); #endif dev_cleanup(); entry = e->e_entry; OF_release((void *)heapva, HEAPSZ); ((kernel_entry_t *)entry)(mdp, 0, 0, 0, openfirmware); panic("%s: exec returned", __func__); } static inline u_long dtlb_get_data_sun4u(u_int tlb, u_int slot) { u_long data, pstate; slot = TLB_DAR_SLOT(tlb, slot); /* * We read ASI_DTLB_DATA_ACCESS_REG twice back-to-back in order to * work around errata of USIII and beyond. */ pstate = rdpr(pstate); wrpr(pstate, pstate & ~PSTATE_IE, 0); (void)ldxa(slot, ASI_DTLB_DATA_ACCESS_REG); data = ldxa(slot, ASI_DTLB_DATA_ACCESS_REG); wrpr(pstate, pstate, 0); return (data); } static inline u_long itlb_get_data_sun4u(u_int tlb, u_int slot) { u_long data, pstate; slot = TLB_DAR_SLOT(tlb, slot); /* * We read ASI_DTLB_DATA_ACCESS_REG twice back-to-back in order to * work around errata of USIII and beyond. 
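/*
 * Illustrative aside, not part of the diff: alloc_phys() and map_phys()
 * above exchange 64-bit physical addresses with the firmware as two
 * 32-bit cells and rejoin them with a shift and an OR. The same packing
 * in plain standard C:
 */
#include <assert.h>
#include <stdint.h>

static uint64_t
cells_to_pa(uint32_t hi, uint32_t lo)
{
        return ((uint64_t)hi << 32 | lo);
}

static void
pa_to_cells(uint64_t pa, uint32_t *hi, uint32_t *lo)
{
        *hi = (uint32_t)(pa >> 32);
        *lo = (uint32_t)pa;
}

int
main(void)
{
        uint32_t hi, lo;

        pa_to_cells(0x123456789abcdef0ULL, &hi, &lo);
        assert(cells_to_pa(hi, lo) == 0x123456789abcdef0ULL);
        return (0);
}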
*/ pstate = rdpr(pstate); wrpr(pstate, pstate & ~PSTATE_IE, 0); (void)ldxa(slot, ASI_ITLB_DATA_ACCESS_REG); data = ldxa(slot, ASI_ITLB_DATA_ACCESS_REG); wrpr(pstate, pstate, 0); return (data); } static vm_offset_t dtlb_va_to_pa_sun4u(vm_offset_t va) { u_long pstate, reg; u_int i, tlb; pstate = rdpr(pstate); wrpr(pstate, pstate & ~PSTATE_IE, 0); for (i = 0; i < dtlb_slot_max; i++) { reg = ldxa(TLB_DAR_SLOT(tlb_locked, i), ASI_DTLB_TAG_READ_REG); if (TLB_TAR_VA(reg) != va) continue; reg = dtlb_get_data_sun4u(tlb_locked, i); wrpr(pstate, pstate, 0); reg >>= TD_PA_SHIFT; if (cpu_impl == CPU_IMPL_SPARC64V || cpu_impl >= CPU_IMPL_ULTRASPARCIII) return (reg & TD_PA_CH_MASK); return (reg & TD_PA_SF_MASK); } wrpr(pstate, pstate, 0); return (-1); } static vm_offset_t itlb_va_to_pa_sun4u(vm_offset_t va) { u_long pstate, reg; int i; pstate = rdpr(pstate); wrpr(pstate, pstate & ~PSTATE_IE, 0); for (i = 0; i < itlb_slot_max; i++) { reg = ldxa(TLB_DAR_SLOT(tlb_locked, i), ASI_ITLB_TAG_READ_REG); if (TLB_TAR_VA(reg) != va) continue; reg = itlb_get_data_sun4u(tlb_locked, i); wrpr(pstate, pstate, 0); reg >>= TD_PA_SHIFT; if (cpu_impl == CPU_IMPL_SPARC64V || cpu_impl >= CPU_IMPL_ULTRASPARCIII) return (reg & TD_PA_CH_MASK); return (reg & TD_PA_SF_MASK); } wrpr(pstate, pstate, 0); return (-1); } static int dtlb_enter_sun4u(u_int index, u_long data, vm_offset_t virt) { return (OF_call_method("SUNW,dtlb-load", mmu, 3, 0, index, data, virt)); } static int itlb_enter_sun4u(u_int index, u_long data, vm_offset_t virt) { if (cpu_impl == CPU_IMPL_ULTRASPARCIIIp && index == 0 && (data & TD_L) != 0) panic("%s: won't enter locked TLB entry at index 0 on USIII+", __func__); return (OF_call_method("SUNW,itlb-load", mmu, 3, 0, index, data, virt)); } static void itlb_relocate_locked0_sun4u(void) { u_long data, pstate, tag; int i; if (cpu_impl != CPU_IMPL_ULTRASPARCIIIp) return; pstate = rdpr(pstate); wrpr(pstate, pstate & ~PSTATE_IE, 0); data = itlb_get_data_sun4u(tlb_locked, 0); if ((data & (TD_V | TD_L)) != (TD_V | TD_L)) { wrpr(pstate, pstate, 0); return; } /* Flush the mapping of slot 0. */ tag = ldxa(TLB_DAR_SLOT(tlb_locked, 0), ASI_ITLB_TAG_READ_REG); stxa(TLB_DEMAP_VA(TLB_TAR_VA(tag)) | TLB_DEMAP_PRIMARY | TLB_DEMAP_PAGE, ASI_IMMU_DEMAP, 0); flush(0); /* The USIII-family ignores the address. */ /* * Search a replacement slot != 0 and enter the data and tag * that formerly were in slot 0. */ for (i = 1; i < itlb_slot_max; i++) { if ((itlb_get_data_sun4u(tlb_locked, i) & TD_V) != 0) continue; stxa(AA_IMMU_TAR, ASI_IMMU, tag); stxa(TLB_DAR_SLOT(tlb_locked, i), ASI_ITLB_DATA_ACCESS_REG, data); flush(0); /* The USIII-family ignores the address. */ break; } wrpr(pstate, pstate, 0); if (i == itlb_slot_max) panic("%s: could not find a replacement slot", __func__); } static int mmu_mapin_sun4u(vm_offset_t va, vm_size_t len) { vm_offset_t pa, mva; u_long data; u_int index; if (va + len > curkva) curkva = va + len; pa = (vm_offset_t)-1; len += va & PAGE_MASK_4M; va &= ~PAGE_MASK_4M; while (len) { if (dtlb_va_to_pa_sun4u(va) == (vm_offset_t)-1 || itlb_va_to_pa_sun4u(va) == (vm_offset_t)-1) { /* Allocate a physical page, claim the virtual area. */ if (pa == (vm_offset_t)-1) { pa = alloc_phys(PAGE_SIZE_4M, PAGE_SIZE_4M); if (pa == (vm_offset_t)-1) panic("%s: out of memory", __func__); mva = claim_virt(va, PAGE_SIZE_4M, 0); if (mva != va) panic("%s: can't claim virtual page " "(wanted %#lx, got %#lx)", __func__, va, mva); /* * The mappings may have changed, be paranoid. 
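/*
 * Illustrative aside, not from the repository: before entering TLB
 * slots, mmu_mapin_sun4u() above widens the requested range to whole
 * 4MB pages by pulling "va" back to a page boundary and growing "len"
 * by the amount that was cut off. The same rounding, standalone:
 */
#include <stdio.h>

#define DEMO_PAGE_SIZE_4M       0x400000UL
#define DEMO_PAGE_MASK_4M       (DEMO_PAGE_SIZE_4M - 1)

static void
round_to_4m(unsigned long *va, unsigned long *len)
{
        *len += *va & DEMO_PAGE_MASK_4M;
        *va &= ~DEMO_PAGE_MASK_4M;
}

int
main(void)
{
        unsigned long va = 0x40123456UL, len = 0x1000UL;

        round_to_4m(&va, &len);
        printf("va=%#lx len=%#lx\n", va, len);  /* va=0x40000000 len=0x124456 */
        return (0);
}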
*/ continue; } /* * Actually, we can only allocate two pages less at * most (depending on the kernel TSB size). */ if (dtlb_slot >= dtlb_slot_max) panic("%s: out of dtlb_slots", __func__); if (itlb_slot >= itlb_slot_max) panic("%s: out of itlb_slots", __func__); data = TD_V | TD_4M | TD_PA(pa) | TD_L | TD_CP | TD_CV | TD_P | TD_W; dtlb_store[dtlb_slot].te_pa = pa; dtlb_store[dtlb_slot].te_va = va; index = dtlb_slot_max - dtlb_slot - 1; if (dtlb_enter_sun4u(index, data, va) < 0) panic("%s: can't enter dTLB slot %d data " "%#lx va %#lx", __func__, index, data, va); dtlb_slot++; itlb_store[itlb_slot].te_pa = pa; itlb_store[itlb_slot].te_va = va; index = itlb_slot_max - itlb_slot - 1; if (itlb_enter_sun4u(index, data, va) < 0) panic("%s: can't enter iTLB slot %d data " "%#lx va %#lxd", __func__, index, data, va); itlb_slot++; pa = (vm_offset_t)-1; } len -= len > PAGE_SIZE_4M ? PAGE_SIZE_4M : len; va += PAGE_SIZE_4M; } if (pa != (vm_offset_t)-1) release_phys(pa, PAGE_SIZE_4M); return (0); } static vm_offset_t init_heap(void) { /* There is no need for continuous physical heap memory. */ heapva = (vm_offset_t)OF_claim((void *)HEAPVA, HEAPSZ, 32); return (heapva); } static phandle_t find_bsp_sun4u(phandle_t node, uint32_t bspid) { char type[sizeof("cpu")]; phandle_t child; uint32_t cpuid; for (; node > 0; node = OF_peer(node)) { child = OF_child(node); if (child > 0) { child = find_bsp_sun4u(child, bspid); if (child > 0) return (child); } else { if (OF_getprop(node, "device_type", type, sizeof(type)) <= 0) continue; if (strcmp(type, "cpu") != 0) continue; if (OF_getprop(node, cpu_cpuid_prop_sun4u(), &cpuid, sizeof(cpuid)) <= 0) continue; if (cpuid == bspid) return (node); } } return (0); } const char * cpu_cpuid_prop_sun4u(void) { switch (cpu_impl) { case CPU_IMPL_SPARC64: case CPU_IMPL_SPARC64V: case CPU_IMPL_ULTRASPARCI: case CPU_IMPL_ULTRASPARCII: case CPU_IMPL_ULTRASPARCIIi: case CPU_IMPL_ULTRASPARCIIe: return ("upa-portid"); case CPU_IMPL_ULTRASPARCIII: case CPU_IMPL_ULTRASPARCIIIp: case CPU_IMPL_ULTRASPARCIIIi: case CPU_IMPL_ULTRASPARCIIIip: return ("portid"); case CPU_IMPL_ULTRASPARCIV: case CPU_IMPL_ULTRASPARCIVp: return ("cpuid"); default: return (""); } } uint32_t cpu_get_mid_sun4u(void) { switch (cpu_impl) { case CPU_IMPL_SPARC64: case CPU_IMPL_SPARC64V: case CPU_IMPL_ULTRASPARCI: case CPU_IMPL_ULTRASPARCII: case CPU_IMPL_ULTRASPARCIIi: case CPU_IMPL_ULTRASPARCIIe: return (UPA_CR_GET_MID(ldxa(0, ASI_UPA_CONFIG_REG))); case CPU_IMPL_ULTRASPARCIII: case CPU_IMPL_ULTRASPARCIIIp: return (FIREPLANE_CR_GET_AID(ldxa(AA_FIREPLANE_CONFIG, ASI_FIREPLANE_CONFIG_REG))); case CPU_IMPL_ULTRASPARCIIIi: case CPU_IMPL_ULTRASPARCIIIip: return (JBUS_CR_GET_JID(ldxa(0, ASI_JBUS_CONFIG_REG))); case CPU_IMPL_ULTRASPARCIV: case CPU_IMPL_ULTRASPARCIVp: return (INTR_ID_GET_ID(ldxa(AA_INTR_ID, ASI_INTR_ID))); default: return (0); } } static void tlb_init_sun4u(void) { phandle_t bsp; cpu_impl = VER_IMPL(rdpr(ver)); switch (cpu_impl) { case CPU_IMPL_SPARC64: case CPU_IMPL_ULTRASPARCI: case CPU_IMPL_ULTRASPARCII: case CPU_IMPL_ULTRASPARCIIi: case CPU_IMPL_ULTRASPARCIIe: tlb_locked = TLB_DAR_T32; break; case CPU_IMPL_ULTRASPARCIII: case CPU_IMPL_ULTRASPARCIIIp: case CPU_IMPL_ULTRASPARCIIIi: case CPU_IMPL_ULTRASPARCIIIip: case CPU_IMPL_ULTRASPARCIV: case CPU_IMPL_ULTRASPARCIVp: tlb_locked = TLB_DAR_T16; break; case CPU_IMPL_SPARC64V: tlb_locked = TLB_DAR_FTLB; break; } bsp = find_bsp_sun4u(OF_child(root), cpu_get_mid_sun4u()); if (bsp == 0) panic("%s: no node for bootcpu?!?!", __func__); if (OF_getprop(bsp, 
"#dtlb-entries", &dtlb_slot_max, sizeof(dtlb_slot_max)) == -1 || OF_getprop(bsp, "#itlb-entries", &itlb_slot_max, sizeof(itlb_slot_max)) == -1) panic("%s: can't get TLB slot max.", __func__); if (cpu_impl == CPU_IMPL_ULTRASPARCIIIp) { #ifdef LOADER_DEBUG printf("pre fixup:\n"); pmap_print_tlb_sun4u(); #endif /* * Relocate the locked entry in it16 slot 0 (if existent) * as part of working around Cheetah+ erratum 34. */ itlb_relocate_locked0_sun4u(); #ifdef LOADER_DEBUG printf("post fixup:\n"); pmap_print_tlb_sun4u(); #endif } dtlb_store = malloc(dtlb_slot_max * sizeof(*dtlb_store)); itlb_store = malloc(itlb_slot_max * sizeof(*itlb_store)); if (dtlb_store == NULL || itlb_store == NULL) panic("%s: can't allocate TLB store", __func__); } #ifdef LOADER_ZFS_SUPPORT static void sparc64_zfs_probe(void) { struct vtoc8 vtoc; char alias[64], devname[sizeof(alias) + sizeof(":x") - 1]; char type[sizeof("device_type")]; char *bdev, *dev, *odev; uint64_t guid; int fd, len, part; phandle_t aliases, options; /* Get the GUID of the ZFS pool on the boot device. */ guid = 0; zfs_probe_dev(bootpath, &guid); /* * Get the GUIDs of the ZFS pools on any additional disks listed in * the boot-device environment variable. */ if ((aliases = OF_finddevice("/aliases")) == -1) goto out; options = OF_finddevice("/options"); len = OF_getproplen(options, "boot-device"); if (len <= 0) goto out; bdev = odev = malloc(len + 1); if (bdev == NULL) goto out; if (OF_getprop(options, "boot-device", bdev, len) <= 0) goto out; bdev[len] = '\0'; while ((dev = strsep(&bdev, " ")) != NULL) { if (*dev == '\0') continue; strcpy(alias, dev); (void)OF_getprop(aliases, dev, alias, sizeof(alias)); /* * Don't probe the boot disk twice. Note that bootpath * includes the partition specifier. */ if (strncmp(alias, bootpath, strlen(alias)) == 0) continue; if (OF_getprop(OF_finddevice(alias), "device_type", type, sizeof(type)) == -1) continue; if (strcmp(type, "block") != 0) continue; /* Find freebsd-zfs slices in the VTOC. */ fd = open(alias, O_RDONLY); if (fd == -1) continue; lseek(fd, 0, SEEK_SET); if (read(fd, &vtoc, sizeof(vtoc)) != sizeof(vtoc)) { close(fd); continue; } close(fd); for (part = 0; part < 8; part++) { if (part == 2 || vtoc.part[part].tag != VTOC_TAG_FREEBSD_ZFS) continue; (void)sprintf(devname, "%s:%c", alias, part + 'a'); if (zfs_probe_dev(devname, NULL) == ENXIO) break; } } free(odev); out: if (guid != 0) { zfs_currdev.pool_guid = guid; zfs_currdev.root_guid = 0; zfs_currdev.d_dev = &zfs_dev; zfs_currdev.d_type = zfs_currdev.d_dev->dv_type; } } #endif /* LOADER_ZFS_SUPPORT */ int main(int (*openfirm)(void *)) { char compatible[32]; struct devsw **dp; /* * Tell the Open Firmware functions where they find the OFW gate. */ OF_init(openfirm); archsw.arch_getdev = ofw_getdev; archsw.arch_copyin = sparc64_copyin; archsw.arch_copyout = ofw_copyout; archsw.arch_readin = sparc64_readin; archsw.arch_autoload = sparc64_autoload; #ifdef LOADER_ZFS_SUPPORT archsw.arch_zfs_probe = sparc64_zfs_probe; #endif if (init_heap() == (vm_offset_t)-1) OF_exit(); setheap((void *)heapva, (void *)(heapva + HEAPSZ)); /* * Probe for a console. */ cons_probe(); if ((root = OF_peer(0)) == -1) panic("%s: can't get root phandle", __func__); OF_getprop(root, "compatible", compatible, sizeof(compatible)); mmu_ops = &mmu_ops_sun4u; mmu_ops->tlb_init(); /* * Set up the current device. */ OF_getprop(chosen, "bootpath", bootpath, sizeof(bootpath)); /* * Initialize devices. 
*/ - for (dp = devsw; *dp != 0; dp++) + for (dp = devsw; *dp != NULL; dp++) if ((*dp)->dv_init != 0) (*dp)->dv_init(); #ifdef LOADER_ZFS_SUPPORT if (zfs_currdev.pool_guid != 0) { (void)strncpy(bootpath, zfs_fmtdev(&zfs_currdev), sizeof(bootpath) - 1); bootpath[sizeof(bootpath) - 1] = '\0'; } else #endif /* * Sun compatible bootable CD-ROMs have a disk label placed before * the ISO 9660 data, with the actual file system being in the first * partition, while the other partitions contain pseudo disk labels * with embedded boot blocks for different architectures, which may * be followed by UFS file systems. * The firmware will set the boot path to the partition it boots from * ('f' in the sun4u/sun4v case), but we want the kernel to be loaded * from the ISO 9660 file system ('a'), so the boot path needs to be * altered. */ if (bootpath[strlen(bootpath) - 2] == ':' && bootpath[strlen(bootpath) - 1] == 'f') bootpath[strlen(bootpath) - 1] = 'a'; env_setenv("currdev", EV_VOLATILE, bootpath, ofw_setcurrdev, env_nounset); env_setenv("loaddev", EV_VOLATILE, bootpath, env_noset, env_nounset); printf("\n%s", bootprog_info); printf("bootpath=\"%s\"\n", bootpath); /* Give control to the machine independent loader code. */ interact(NULL); return (1); } COMMAND_SET(heap, "heap", "show heap usage", command_heap); static int command_heap(int argc, char *argv[]) { mallocstats(); printf("heap base at %p, top at %p, upper limit at %p\n", heapva, sbrk(0), heapva + HEAPSZ); return(CMD_OK); } COMMAND_SET(reboot, "reboot", "reboot the system", command_reboot); static int command_reboot(int argc, char *argv[]) { int i; for (i = 0; devsw[i] != NULL; ++i) if (devsw[i]->dv_cleanup != NULL) (devsw[i]->dv_cleanup)(); printf("Rebooting...\n"); OF_exit(); } /* provide this for panic, as it's not in the startup code */ void exit(int code) { OF_exit(); } #ifdef LOADER_DEBUG static const char *const page_sizes[] = { " 8k", " 64k", "512k", " 4m" }; static void pmap_print_tte_sun4u(tte_t tag, tte_t tte) { printf("%s %s ", page_sizes[(tte >> TD_SIZE_SHIFT) & TD_SIZE_MASK], tag & TD_G ? "G" : " "); printf(tte & TD_W ? "W " : " "); printf(tte & TD_P ? "\e[33mP\e[0m " : " "); printf(tte & TD_E ? "E " : " "); printf(tte & TD_CV ? "CV " : " "); printf(tte & TD_CP ? "CP " : " "); printf(tte & TD_L ? "\e[32mL\e[0m " : " "); printf(tte & TD_IE ? "IE " : " "); printf(tte & TD_NFO ? "NFO " : " "); printf("pa=0x%lx va=0x%lx ctx=%ld\n", TD_PA(tte), TLB_TAR_VA(tag), TLB_TAR_CTX(tag)); } static void pmap_print_tlb_sun4u(void) { tte_t tag, tte; u_long pstate; int i; pstate = rdpr(pstate); for (i = 0; i < itlb_slot_max; i++) { wrpr(pstate, pstate & ~PSTATE_IE, 0); tte = itlb_get_data_sun4u(tlb_locked, i); wrpr(pstate, pstate, 0); if (!(tte & TD_V)) continue; tag = ldxa(TLB_DAR_SLOT(tlb_locked, i), ASI_ITLB_TAG_READ_REG); printf("iTLB-%2u: ", i); pmap_print_tte_sun4u(tag, tte); } for (i = 0; i < dtlb_slot_max; i++) { wrpr(pstate, pstate & ~PSTATE_IE, 0); tte = dtlb_get_data_sun4u(tlb_locked, i); wrpr(pstate, pstate, 0); if (!(tte & TD_V)) continue; tag = ldxa(TLB_DAR_SLOT(tlb_locked, i), ASI_DTLB_TAG_READ_REG); printf("dTLB-%2u: ", i); pmap_print_tte_sun4u(tag, tte); } } #endif Index: head/sys/boot/userboot/userboot/userboot_disk.c =================================================================== --- head/sys/boot/userboot/userboot/userboot_disk.c (revision 314067) +++ head/sys/boot/userboot/userboot/userboot_disk.c (revision 314068) @@ -1,237 +1,237 @@ /*- * Copyright (c) 2011 Google, Inc. * All rights reserved. 
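/*
 * Illustrative aside, not part of the diff: the Sun CD-ROM comment above
 * explains why a boot path ending in ":f" is redirected to partition 'a'
 * before it is used as currdev. The same suffix rewrite as a standalone
 * sketch:
 */
#include <stdio.h>
#include <string.h>

static void
fix_cd_bootpath(char *path)
{
        size_t n = strlen(path);

        if (n >= 2 && path[n - 2] == ':' && path[n - 1] == 'f')
                path[n - 1] = 'a';
}

int
main(void)
{
        char bootpath[] = "/pci@1f,0/ide@d/cdrom@0,0:f";

        fix_cd_bootpath(bootpath);
        printf("%s\n", bootpath);       /* now ends in ":a" */
        return (0);
}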
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Userboot disk image handling. */ #include #include #include #include #include "disk.h" #include "libuserboot.h" struct userdisk_info { uint64_t mediasize; uint16_t sectorsize; int ud_open; /* reference counter */ void *ud_bcache; /* buffer cache data */ }; int userboot_disk_maxunit = 0; static int userdisk_maxunit = 0; static struct userdisk_info *ud_info; static int userdisk_init(void); static void userdisk_cleanup(void); static int userdisk_strategy(void *devdata, int flag, daddr_t dblk, size_t size, char *buf, size_t *rsize); static int userdisk_realstrategy(void *devdata, int flag, daddr_t dblk, size_t size, char *buf, size_t *rsize); static int userdisk_open(struct open_file *f, ...); static int userdisk_close(struct open_file *f); static int userdisk_ioctl(struct open_file *f, u_long cmd, void *data); static int userdisk_print(int verbose); struct devsw userboot_disk = { "disk", DEVT_DISK, userdisk_init, userdisk_strategy, userdisk_open, userdisk_close, userdisk_ioctl, userdisk_print, userdisk_cleanup }; /* * Initialize userdisk_info structure for each disk. 
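/*
 * Illustrative aside, not from the repository: userboot_disk below is one
 * entry in the loader's NULL-terminated array of devsw method tables, the
 * same kind of array the "*dp != NULL" hunk in the sparc64 main() walks.
 * A toy version of that dispatch, with made-up names:
 */
#include <stdio.h>

struct demo_devsw {
        const char *dv_name;
        int (*dv_init)(void);
};

static int demo_disk_init(void) { printf("disk init\n"); return (0); }
static int demo_net_init(void) { printf("net init\n"); return (0); }

static struct demo_devsw demo_disk = { "disk", demo_disk_init };
static struct demo_devsw demo_net = { "net", demo_net_init };

static struct demo_devsw *demo_devsw[] = { &demo_disk, &demo_net, NULL };

int
main(void)
{
        struct demo_devsw **dp;

        for (dp = demo_devsw; *dp != NULL; dp++)
                if ((*dp)->dv_init != NULL)
                        (*dp)->dv_init();
        return (0);
}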
*/ static int userdisk_init(void) { off_t mediasize; u_int sectorsize; int i; userdisk_maxunit = userboot_disk_maxunit; if (userdisk_maxunit > 0) { ud_info = malloc(sizeof(*ud_info) * userdisk_maxunit); if (ud_info == NULL) return (ENOMEM); for (i = 0; i < userdisk_maxunit; i++) { if (CALLBACK(diskioctl, i, DIOCGSECTORSIZE, - &sectorsize) != 0 || CALLBACK(diskioctl, i, - DIOCGMEDIASIZE, &mediasize) != 0) + &sectorsize) != NULL || CALLBACK(diskioctl, i, + DIOCGMEDIASIZE, &mediasize) != NULL) return (ENXIO); ud_info[i].mediasize = mediasize; ud_info[i].sectorsize = sectorsize; ud_info[i].ud_open = 0; ud_info[i].ud_bcache = NULL; } } bcache_add_dev(userdisk_maxunit); return(0); } static void userdisk_cleanup(void) { if (userdisk_maxunit > 0) free(ud_info); disk_cleanup(&userboot_disk); } /* * Print information about disks */ static int userdisk_print(int verbose) { struct disk_devdesc dev; char line[80]; int i, ret = 0; if (userdisk_maxunit == 0) return (0); printf("%s devices:", userboot_disk.dv_name); if ((ret = pager_output("\n")) != 0) return (ret); for (i = 0; i < userdisk_maxunit; i++) { snprintf(line, sizeof(line), " disk%d: Guest drive image\n", i); ret = pager_output(line); if (ret != 0) break; dev.d_dev = &userboot_disk; dev.d_unit = i; dev.d_slice = -1; dev.d_partition = -1; if (disk_open(&dev, ud_info[i].mediasize, ud_info[i].sectorsize, 0) == 0) { snprintf(line, sizeof(line), " disk%d", i); ret = disk_print(&dev, line, verbose); disk_close(&dev); if (ret != 0) break; } } return (ret); } /* * Attempt to open the disk described by (dev) for use by (f). */ static int userdisk_open(struct open_file *f, ...) { va_list ap; struct disk_devdesc *dev; va_start(ap, f); dev = va_arg(ap, struct disk_devdesc *); va_end(ap); if (dev->d_unit < 0 || dev->d_unit >= userdisk_maxunit) return (EIO); ud_info[dev->d_unit].ud_open++; if (ud_info[dev->d_unit].ud_bcache == NULL) ud_info[dev->d_unit].ud_bcache = bcache_allocate(); return (disk_open(dev, ud_info[dev->d_unit].mediasize, ud_info[dev->d_unit].sectorsize, 0)); } static int userdisk_close(struct open_file *f) { struct disk_devdesc *dev; dev = (struct disk_devdesc *)f->f_devdata; ud_info[dev->d_unit].ud_open--; if (ud_info[dev->d_unit].ud_open == 0) { bcache_free(ud_info[dev->d_unit].ud_bcache); ud_info[dev->d_unit].ud_bcache = NULL; } return (disk_close(dev)); } static int userdisk_strategy(void *devdata, int rw, daddr_t dblk, size_t size, char *buf, size_t *rsize) { struct bcache_devdata bcd; struct disk_devdesc *dev; dev = (struct disk_devdesc *)devdata; bcd.dv_strategy = userdisk_realstrategy; bcd.dv_devdata = devdata; bcd.dv_cache = ud_info[dev->d_unit].ud_bcache; return (bcache_strategy(&bcd, rw, dblk + dev->d_offset, size, buf, rsize)); } static int userdisk_realstrategy(void *devdata, int rw, daddr_t dblk, size_t size, char *buf, size_t *rsize) { struct disk_devdesc *dev = devdata; uint64_t off; size_t resid; int rc; if (rw == F_WRITE) return (EROFS); if (rw != F_READ) return (EINVAL); if (rsize) *rsize = 0; off = dblk * ud_info[dev->d_unit].sectorsize; rc = CALLBACK(diskread, dev->d_unit, off, buf, size, &resid); if (rc) return (rc); if (rsize) *rsize = size - resid; return (0); } static int userdisk_ioctl(struct open_file *f, u_long cmd, void *data) { struct disk_devdesc *dev; dev = (struct disk_devdesc *)f->f_devdata; return (CALLBACK(diskioctl, dev->d_unit, cmd, data)); } Index: head/sys/boot/zfs/zfs.c =================================================================== --- head/sys/boot/zfs/zfs.c (revision 314067) +++ head/sys/boot/zfs/zfs.c
(revision 314068) @@ -1,912 +1,912 @@ /*- * Copyright (c) 2007 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); /* * Stand-alone file reading package. */ #include #include #include #include #include #include #include #include #include #include #include "libzfs.h" #include "zfsimpl.c" /* Define the range of indexes to be populated with ZFS Boot Environments */ #define ZFS_BE_FIRST 4 #define ZFS_BE_LAST 8 static int zfs_open(const char *path, struct open_file *f); static int zfs_write(struct open_file *f, void *buf, size_t size, size_t *resid); static int zfs_close(struct open_file *f); static int zfs_read(struct open_file *f, void *buf, size_t size, size_t *resid); static off_t zfs_seek(struct open_file *f, off_t offset, int where); static int zfs_stat(struct open_file *f, struct stat *sb); static int zfs_readdir(struct open_file *f, struct dirent *d); struct devsw zfs_dev; struct fs_ops zfs_fsops = { "zfs", zfs_open, zfs_close, zfs_read, zfs_write, zfs_seek, zfs_stat, zfs_readdir }; /* * In-core open file. */ struct file { off_t f_seekp; /* seek pointer */ dnode_phys_t f_dnode; uint64_t f_zap_type; /* zap type for readdir */ uint64_t f_num_leafs; /* number of fzap leaf blocks */ zap_leaf_phys_t *f_zap_leaf; /* zap leaf buffer */ }; static int zfs_env_index; static int zfs_env_count; SLIST_HEAD(zfs_be_list, zfs_be_entry) zfs_be_head = SLIST_HEAD_INITIALIZER(zfs_be_head); struct zfs_be_list *zfs_be_headp; struct zfs_be_entry { const char *name; SLIST_ENTRY(zfs_be_entry) entries; } *zfs_be, *zfs_be_tmp; /* * Open a file. 
*/ static int zfs_open(const char *upath, struct open_file *f) { struct zfsmount *mount = (struct zfsmount *)f->f_devdata; struct file *fp; int rc; if (f->f_dev != &zfs_dev) return (EINVAL); /* allocate file system specific data structure */ fp = malloc(sizeof(struct file)); bzero(fp, sizeof(struct file)); f->f_fsdata = (void *)fp; rc = zfs_lookup(mount, upath, &fp->f_dnode); fp->f_seekp = 0; if (rc) { f->f_fsdata = NULL; free(fp); } return (rc); } static int zfs_close(struct open_file *f) { struct file *fp = (struct file *)f->f_fsdata; - dnode_cache_obj = 0; + dnode_cache_obj = NULL; f->f_fsdata = (void *)0; if (fp == (struct file *)0) return (0); free(fp); return (0); } /* * Copy a portion of a file into kernel memory. * Cross block boundaries when necessary. */ static int zfs_read(struct open_file *f, void *start, size_t size, size_t *resid /* out */) { const spa_t *spa = ((struct zfsmount *)f->f_devdata)->spa; struct file *fp = (struct file *)f->f_fsdata; struct stat sb; size_t n; int rc; rc = zfs_stat(f, &sb); if (rc) return (rc); n = size; if (fp->f_seekp + n > sb.st_size) n = sb.st_size - fp->f_seekp; rc = dnode_read(spa, &fp->f_dnode, fp->f_seekp, start, n); if (rc) return (rc); if (0) { int i; for (i = 0; i < n; i++) putchar(((char*) start)[i]); } fp->f_seekp += n; if (resid) *resid = size - n; return (0); } /* * Don't be silly - the bootstrap has no business writing anything. */ static int zfs_write(struct open_file *f, void *start, size_t size, size_t *resid /* out */) { return (EROFS); } static off_t zfs_seek(struct open_file *f, off_t offset, int where) { struct file *fp = (struct file *)f->f_fsdata; switch (where) { case SEEK_SET: fp->f_seekp = offset; break; case SEEK_CUR: fp->f_seekp += offset; break; case SEEK_END: { struct stat sb; int error; error = zfs_stat(f, &sb); if (error != 0) { errno = error; return (-1); } fp->f_seekp = sb.st_size - offset; break; } default: errno = EINVAL; return (-1); } return (fp->f_seekp); } static int zfs_stat(struct open_file *f, struct stat *sb) { const spa_t *spa = ((struct zfsmount *)f->f_devdata)->spa; struct file *fp = (struct file *)f->f_fsdata; return (zfs_dnode_stat(spa, &fp->f_dnode, sb)); } static int zfs_readdir(struct open_file *f, struct dirent *d) { const spa_t *spa = ((struct zfsmount *)f->f_devdata)->spa; struct file *fp = (struct file *)f->f_fsdata; mzap_ent_phys_t mze; struct stat sb; size_t bsize = fp->f_dnode.dn_datablkszsec << SPA_MINBLOCKSHIFT; int rc; rc = zfs_stat(f, &sb); if (rc) return (rc); if (!S_ISDIR(sb.st_mode)) return (ENOTDIR); /* * If this is the first read, get the zap type. 
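/*
 * Illustrative aside, not part of the diff: zfs_read() above clamps the
 * request to what is left of the file, advances the seek pointer, and
 * reports the unread remainder through *resid. The same bookkeeping on
 * an in-memory "file", standalone:
 */
#include <stdio.h>
#include <string.h>

static size_t
clamped_read(const char *file, size_t filesize, size_t *seekp,
    char *buf, size_t size, size_t *resid)
{
        size_t n = size;

        if (*seekp + n > filesize)
                n = filesize - *seekp;
        memcpy(buf, file + *seekp, n);
        *seekp += n;
        if (resid != NULL)
                *resid = size - n;
        return (n);
}

int
main(void)
{
        char buf[16];
        size_t seekp = 7, resid, n;

        n = clamped_read("0123456789", 10, &seekp, buf, 8, &resid);
        printf("read %zu bytes, %zu not available\n", n, resid);        /* 3, 5 */
        return (0);
}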
*/ if (fp->f_seekp == 0) { rc = dnode_read(spa, &fp->f_dnode, 0, &fp->f_zap_type, sizeof(fp->f_zap_type)); if (rc) return (rc); if (fp->f_zap_type == ZBT_MICRO) { fp->f_seekp = offsetof(mzap_phys_t, mz_chunk); } else { rc = dnode_read(spa, &fp->f_dnode, offsetof(zap_phys_t, zap_num_leafs), &fp->f_num_leafs, sizeof(fp->f_num_leafs)); if (rc) return (rc); fp->f_seekp = bsize; fp->f_zap_leaf = (zap_leaf_phys_t *)malloc(bsize); rc = dnode_read(spa, &fp->f_dnode, fp->f_seekp, fp->f_zap_leaf, bsize); if (rc) return (rc); } } if (fp->f_zap_type == ZBT_MICRO) { mzap_next: if (fp->f_seekp >= bsize) return (ENOENT); rc = dnode_read(spa, &fp->f_dnode, fp->f_seekp, &mze, sizeof(mze)); if (rc) return (rc); fp->f_seekp += sizeof(mze); if (!mze.mze_name[0]) goto mzap_next; d->d_fileno = ZFS_DIRENT_OBJ(mze.mze_value); d->d_type = ZFS_DIRENT_TYPE(mze.mze_value); strcpy(d->d_name, mze.mze_name); d->d_namlen = strlen(d->d_name); return (0); } else { zap_leaf_t zl; zap_leaf_chunk_t *zc, *nc; int chunk; size_t namelen; char *p; uint64_t value; /* * Initialise this so we can use the ZAP size * calculating macros. */ zl.l_bs = ilog2(bsize); zl.l_phys = fp->f_zap_leaf; /* * Figure out which chunk we are currently looking at * and consider seeking to the next leaf. We use the * low bits of f_seekp as a simple chunk index. */ fzap_next: chunk = fp->f_seekp & (bsize - 1); if (chunk == ZAP_LEAF_NUMCHUNKS(&zl)) { fp->f_seekp = rounddown2(fp->f_seekp, bsize) + bsize; chunk = 0; /* * Check for EOF and read the new leaf. */ if (fp->f_seekp >= bsize * fp->f_num_leafs) return (ENOENT); rc = dnode_read(spa, &fp->f_dnode, fp->f_seekp, fp->f_zap_leaf, bsize); if (rc) return (rc); } zc = &ZAP_LEAF_CHUNK(&zl, chunk); fp->f_seekp++; if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY) goto fzap_next; namelen = zc->l_entry.le_name_numints; if (namelen > sizeof(d->d_name)) namelen = sizeof(d->d_name); /* * Paste the name back together. */ nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk); p = d->d_name; while (namelen > 0) { int len; len = namelen; if (len > ZAP_LEAF_ARRAY_BYTES) len = ZAP_LEAF_ARRAY_BYTES; memcpy(p, nc->l_array.la_array, len); p += len; namelen -= len; nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next); } d->d_name[sizeof(d->d_name) - 1] = 0; /* * Assume the first eight bytes of the value are * a uint64_t. 
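/*
 * Illustrative aside with hypothetical types, not from the repository:
 * the fat-ZAP branch above pastes a long entry name back together from
 * fixed-size leaf array chunks linked by a "next" index. A rough
 * standalone analogue (CHUNK_BYTES stands in for ZAP_LEAF_ARRAY_BYTES):
 */
#include <stdio.h>
#include <string.h>

#define CHUNK_BYTES     4

struct name_chunk {
        char bytes[CHUNK_BYTES];
        int next;                       /* -1 terminates the chain */
};

static void
paste_name(const struct name_chunk *chunks, int first, size_t namelen,
    char *dst, size_t dstsize)
{
        char *p = dst;
        int idx = first;

        if (namelen > dstsize - 1)
                namelen = dstsize - 1;
        while (namelen > 0 && idx != -1) {
                size_t len = namelen < CHUNK_BYTES ? namelen : CHUNK_BYTES;

                memcpy(p, chunks[idx].bytes, len);
                p += len;
                namelen -= len;
                idx = chunks[idx].next;
        }
        *p = '\0';
}

int
main(void)
{
        struct name_chunk chunks[] = {
                { { 'b', 'o', 'o', 't' }, 1 },
                { { 'e', 'n', 'v', 'X' }, -1 },
        };
        char name[16];

        paste_name(chunks, 0, 7, name, sizeof(name));
        printf("%s\n", name);           /* "bootenv" */
        return (0);
}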
*/ value = fzap_leaf_value(&zl, zc); d->d_fileno = ZFS_DIRENT_OBJ(value); d->d_type = ZFS_DIRENT_TYPE(value); d->d_namlen = strlen(d->d_name); return (0); } } static int vdev_read(vdev_t *vdev, void *priv, off_t offset, void *buf, size_t size) { int fd; fd = (uintptr_t) priv; lseek(fd, offset, SEEK_SET); if (read(fd, buf, size) == size) { return 0; } else { return (EIO); } } static int zfs_dev_init(void) { spa_t *spa; spa_t *next; spa_t *prev; zfs_init(); if (archsw.arch_zfs_probe == NULL) return (ENXIO); archsw.arch_zfs_probe(); prev = NULL; spa = STAILQ_FIRST(&zfs_pools); while (spa != NULL) { next = STAILQ_NEXT(spa, spa_link); if (zfs_spa_init(spa)) { if (prev == NULL) STAILQ_REMOVE_HEAD(&zfs_pools, spa_link); else STAILQ_REMOVE_AFTER(&zfs_pools, prev, spa_link); } else prev = spa; spa = next; } return (0); } struct zfs_probe_args { int fd; const char *devname; uint64_t *pool_guid; u_int secsz; }; static int zfs_diskread(void *arg, void *buf, size_t blocks, uint64_t offset) { struct zfs_probe_args *ppa; ppa = (struct zfs_probe_args *)arg; return (vdev_read(NULL, (void *)(uintptr_t)ppa->fd, offset * ppa->secsz, buf, blocks * ppa->secsz)); } static int zfs_probe(int fd, uint64_t *pool_guid) { spa_t *spa; int ret; ret = vdev_probe(vdev_read, (void *)(uintptr_t)fd, &spa); if (ret == 0 && pool_guid != NULL) *pool_guid = spa->spa_guid; return (ret); } static int zfs_probe_partition(void *arg, const char *partname, const struct ptable_entry *part) { struct zfs_probe_args *ppa, pa; struct ptable *table; char devname[32]; int ret; /* Probe only freebsd-zfs and freebsd partitions */ if (part->type != PART_FREEBSD && part->type != PART_FREEBSD_ZFS) return (0); ppa = (struct zfs_probe_args *)arg; strncpy(devname, ppa->devname, strlen(ppa->devname) - 1); devname[strlen(ppa->devname) - 1] = '\0'; sprintf(devname, "%s%s:", devname, partname); pa.fd = open(devname, O_RDONLY); if (pa.fd == -1) return (0); ret = zfs_probe(pa.fd, ppa->pool_guid); if (ret == 0) return (0); /* Do we have BSD label here? 
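/*
 * Illustrative aside, not part of the diff: zfs_probe_partition() above
 * derives a per-partition device name from the parent device's name and
 * the partition name before opening and probing it. A standalone sketch
 * that keeps source and destination buffers separate:
 */
#include <stdio.h>
#include <string.h>

static void
partition_devname(char *dst, size_t dstsize, const char *parent,
    const char *partname)
{
        size_t n = strlen(parent);

        if (n > 0 && parent[n - 1] == ':')      /* drop the trailing ':' */
                n--;
        snprintf(dst, dstsize, "%.*s%s:", (int)n, parent, partname);
}

int
main(void)
{
        char devname[32];

        partition_devname(devname, sizeof(devname), "disk2:", "s1a");
        printf("%s\n", devname);        /* disk2s1a: */
        return (0);
}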
*/ if (part->type == PART_FREEBSD) { pa.devname = devname; pa.pool_guid = ppa->pool_guid; pa.secsz = ppa->secsz; table = ptable_open(&pa, part->end - part->start + 1, ppa->secsz, zfs_diskread); if (table != NULL) { ptable_iterate(table, &pa, zfs_probe_partition); ptable_close(table); } } close(pa.fd); return (0); } int zfs_probe_dev(const char *devname, uint64_t *pool_guid) { struct ptable *table; struct zfs_probe_args pa; uint64_t mediasz; int ret; if (pool_guid) *pool_guid = 0; pa.fd = open(devname, O_RDONLY); if (pa.fd == -1) return (ENXIO); /* Probe the whole disk */ ret = zfs_probe(pa.fd, pool_guid); if (ret == 0) return (0); /* Probe each partition */ ret = ioctl(pa.fd, DIOCGMEDIASIZE, &mediasz); if (ret == 0) ret = ioctl(pa.fd, DIOCGSECTORSIZE, &pa.secsz); if (ret == 0) { pa.devname = devname; pa.pool_guid = pool_guid; table = ptable_open(&pa, mediasz / pa.secsz, pa.secsz, zfs_diskread); if (table != NULL) { ptable_iterate(table, &pa, zfs_probe_partition); ptable_close(table); } } close(pa.fd); if (pool_guid && *pool_guid == 0) ret = ENXIO; return (ret); } /* * Print information about ZFS pools */ static int zfs_dev_print(int verbose) { spa_t *spa; char line[80]; int ret = 0; if (STAILQ_EMPTY(&zfs_pools)) return (0); printf("%s devices:", zfs_dev.dv_name); if ((ret = pager_output("\n")) != 0) return (ret); if (verbose) { return (spa_all_status()); } STAILQ_FOREACH(spa, &zfs_pools, spa_link) { snprintf(line, sizeof(line), " zfs:%s\n", spa->spa_name); ret = pager_output(line); if (ret != 0) break; } return (ret); } /* * Attempt to open the pool described by (dev) for use by (f). */ static int zfs_dev_open(struct open_file *f, ...) { va_list args; struct zfs_devdesc *dev; struct zfsmount *mount; spa_t *spa; int rv; va_start(args, f); dev = va_arg(args, struct zfs_devdesc *); va_end(args); if (dev->pool_guid == 0) spa = STAILQ_FIRST(&zfs_pools); else spa = spa_find_by_guid(dev->pool_guid); if (!spa) return (ENXIO); mount = malloc(sizeof(*mount)); rv = zfs_mount(spa, dev->root_guid, mount); if (rv != 0) { free(mount); return (rv); } if (mount->objset.os_type != DMU_OST_ZFS) { printf("Unexpected object set type %ju\n", (uintmax_t)mount->objset.os_type); free(mount); return (EIO); } f->f_devdata = mount; free(dev); return (0); } static int zfs_dev_close(struct open_file *f) { free(f->f_devdata); f->f_devdata = NULL; return (0); } static int zfs_dev_strategy(void *devdata, int rw, daddr_t dblk, size_t size, char *buf, size_t *rsize) { return (ENOSYS); } struct devsw zfs_dev = { .dv_name = "zfs", .dv_type = DEVT_ZFS, .dv_init = zfs_dev_init, .dv_strategy = zfs_dev_strategy, .dv_open = zfs_dev_open, .dv_close = zfs_dev_close, .dv_ioctl = noioctl, .dv_print = zfs_dev_print, .dv_cleanup = NULL }; int zfs_parsedev(struct zfs_devdesc *dev, const char *devspec, const char **path) { static char rootname[ZFS_MAXNAMELEN]; static char poolname[ZFS_MAXNAMELEN]; spa_t *spa; const char *end; const char *np; const char *sep; int rv; np = devspec; if (*np != ':') return (EINVAL); np++; end = strchr(np, ':'); if (end == NULL) return (EINVAL); sep = strchr(np, '/'); if (sep == NULL || sep >= end) sep = end; memcpy(poolname, np, sep - np); poolname[sep - np] = '\0'; if (sep < end) { sep++; memcpy(rootname, sep, end - sep); rootname[end - sep] = '\0'; } else rootname[0] = '\0'; spa = spa_find_by_name(poolname); if (!spa) return (ENXIO); dev->pool_guid = spa->spa_guid; rv = zfs_lookup_dataset(spa, rootname, &dev->root_guid); if (rv != 0) return (rv); if (path != NULL) *path = (*end == '\0') ? 
end : end + 1; dev->d_dev = &zfs_dev; dev->d_type = zfs_dev.dv_type; return (0); } char * zfs_fmtdev(void *vdev) { static char rootname[ZFS_MAXNAMELEN]; static char buf[2 * ZFS_MAXNAMELEN + 8]; struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev; spa_t *spa; buf[0] = '\0'; if (dev->d_type != DEVT_ZFS) return (buf); if (dev->pool_guid == 0) { spa = STAILQ_FIRST(&zfs_pools); dev->pool_guid = spa->spa_guid; } else spa = spa_find_by_guid(dev->pool_guid); if (spa == NULL) { printf("ZFS: can't find pool by guid\n"); return (buf); } if (dev->root_guid == 0 && zfs_get_root(spa, &dev->root_guid)) { printf("ZFS: can't find root filesystem\n"); return (buf); } if (zfs_rlookup(spa, dev->root_guid, rootname)) { printf("ZFS: can't find filesystem by guid\n"); return (buf); } if (rootname[0] == '\0') sprintf(buf, "%s:%s:", dev->d_dev->dv_name, spa->spa_name); else sprintf(buf, "%s:%s/%s:", dev->d_dev->dv_name, spa->spa_name, rootname); return (buf); } int zfs_list(const char *name) { static char poolname[ZFS_MAXNAMELEN]; uint64_t objid; spa_t *spa; const char *dsname; int len; int rv; len = strlen(name); dsname = strchr(name, '/'); if (dsname != NULL) { len = dsname - name; dsname++; } else dsname = ""; memcpy(poolname, name, len); poolname[len] = '\0'; spa = spa_find_by_name(poolname); if (!spa) return (ENXIO); rv = zfs_lookup_dataset(spa, dsname, &objid); if (rv != 0) return (rv); return (zfs_list_dataset(spa, objid)); } void init_zfs_bootenv(char *currdev) { char *beroot; if (strlen(currdev) == 0) return; if(strncmp(currdev, "zfs:", 4) != 0) return; /* Remove the trailing : */ currdev[strlen(currdev) - 1] = '\0'; setenv("zfs_be_active", currdev, 1); setenv("zfs_be_currpage", "1", 1); /* Forward past zfs: */ currdev = strchr(currdev, ':'); currdev++; /* Remove the last element (current bootenv) */ beroot = strrchr(currdev, '/'); if (beroot != NULL) beroot[0] = '\0'; beroot = currdev; setenv("zfs_be_root", beroot, 1); } int zfs_bootenv(const char *name) { static char poolname[ZFS_MAXNAMELEN], *dsname, *root; char becount[4]; uint64_t objid; spa_t *spa; int len, rv, pages, perpage, currpage; if (name == NULL) return (EINVAL); if ((root = getenv("zfs_be_root")) == NULL) return (EINVAL); if (strcmp(name, root) != 0) { if (setenv("zfs_be_root", name, 1) != 0) return (ENOMEM); } SLIST_INIT(&zfs_be_head); zfs_env_count = 0; len = strlen(name); dsname = strchr(name, '/'); if (dsname != NULL) { len = dsname - name; dsname++; } else dsname = ""; memcpy(poolname, name, len); poolname[len] = '\0'; spa = spa_find_by_name(poolname); if (!spa) return (ENXIO); rv = zfs_lookup_dataset(spa, dsname, &objid); if (rv != 0) return (rv); rv = zfs_callback_dataset(spa, objid, zfs_belist_add); /* Calculate and store the number of pages of BEs */ perpage = (ZFS_BE_LAST - ZFS_BE_FIRST + 1); pages = (zfs_env_count / perpage) + ((zfs_env_count % perpage) > 0 ? 
1 : 0); snprintf(becount, 4, "%d", pages); if (setenv("zfs_be_pages", becount, 1) != 0) return (ENOMEM); /* Roll over the page counter if it has exceeded the maximum */ currpage = strtol(getenv("zfs_be_currpage"), NULL, 10); if (currpage > pages) { if (setenv("zfs_be_currpage", "1", 1) != 0) return (ENOMEM); } /* Populate the menu environment variables */ zfs_set_env(); /* Clean up the SLIST of ZFS BEs */ while (!SLIST_EMPTY(&zfs_be_head)) { zfs_be = SLIST_FIRST(&zfs_be_head); SLIST_REMOVE_HEAD(&zfs_be_head, entries); free(zfs_be); } return (rv); } int zfs_belist_add(const char *name, uint64_t value __unused) { /* Skip special datasets that start with a $ character */ if (strncmp(name, "$", 1) == 0) { return (0); } /* Add the boot environment to the head of the SLIST */ zfs_be = malloc(sizeof(struct zfs_be_entry)); if (zfs_be == NULL) { return (ENOMEM); } zfs_be->name = name; SLIST_INSERT_HEAD(&zfs_be_head, zfs_be, entries); zfs_env_count++; return (0); } int zfs_set_env(void) { char envname[32], envval[256]; char *beroot, *pagenum; int rv, page, ctr; beroot = getenv("zfs_be_root"); if (beroot == NULL) { return (1); } pagenum = getenv("zfs_be_currpage"); if (pagenum != NULL) { page = strtol(pagenum, NULL, 10); } else { page = 1; } ctr = 1; rv = 0; zfs_env_index = ZFS_BE_FIRST; SLIST_FOREACH_SAFE(zfs_be, &zfs_be_head, entries, zfs_be_tmp) { /* Skip to the requested page number */ if (ctr <= ((ZFS_BE_LAST - ZFS_BE_FIRST + 1) * (page - 1))) { ctr++; continue; } snprintf(envname, sizeof(envname), "bootenvmenu_caption[%d]", zfs_env_index); snprintf(envval, sizeof(envval), "%s", zfs_be->name); rv = setenv(envname, envval, 1); if (rv != 0) { break; } snprintf(envname, sizeof(envname), "bootenvansi_caption[%d]", zfs_env_index); rv = setenv(envname, envval, 1); if (rv != 0){ break; } snprintf(envname, sizeof(envname), "bootenvmenu_command[%d]", zfs_env_index); rv = setenv(envname, "set_bootenv", 1); if (rv != 0){ break; } snprintf(envname, sizeof(envname), "bootenv_root[%d]", zfs_env_index); snprintf(envval, sizeof(envval), "zfs:%s/%s", beroot, zfs_be->name); rv = setenv(envname, envval, 1); if (rv != 0){ break; } zfs_env_index++; if (zfs_env_index > ZFS_BE_LAST) { break; } } for (; zfs_env_index <= ZFS_BE_LAST; zfs_env_index++) { snprintf(envname, sizeof(envname), "bootenvmenu_caption[%d]", zfs_env_index); (void)unsetenv(envname); snprintf(envname, sizeof(envname), "bootenvansi_caption[%d]", zfs_env_index); (void)unsetenv(envname); snprintf(envname, sizeof(envname), "bootenvmenu_command[%d]", zfs_env_index); (void)unsetenv(envname); snprintf(envname, sizeof(envname), "bootenv_root[%d]", zfs_env_index); (void)unsetenv(envname); } return (rv); } Index: head/sys/boot/zfs/zfsimpl.c =================================================================== --- head/sys/boot/zfs/zfsimpl.c (revision 314067) +++ head/sys/boot/zfs/zfsimpl.c (revision 314068) @@ -1,2388 +1,2388 @@ /*- * Copyright (c) 2007 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
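/*
 * Illustrative aside, not from the repository: the page count computed
 * by zfs_bootenv() above is a plain ceiling division; with menu slots
 * ZFS_BE_FIRST..ZFS_BE_LAST (4..8) there are five entries per page, and
 * any remainder adds one more page. Standalone:
 */
#include <assert.h>

static int
menu_pages(int count, int perpage)
{
        return (count / perpage + (count % perpage > 0 ? 1 : 0));
}

int
main(void)
{
        assert(menu_pages(0, 5) == 0);
        assert(menu_pages(5, 5) == 1);
        assert(menu_pages(6, 5) == 2);
        assert(menu_pages(11, 5) == 3);
        return (0);
}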
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Stand-alone ZFS file reader. */ #include #include #include "zfsimpl.h" #include "zfssubr.c" struct zfsmount { const spa_t *spa; objset_phys_t objset; uint64_t rootobj; }; /* * List of all vdevs, chained through v_alllink. */ static vdev_list_t zfs_vdevs; /* * List of ZFS features supported for read */ static const char *features_for_read[] = { "org.illumos:lz4_compress", "com.delphix:hole_birth", "com.delphix:extensible_dataset", "com.delphix:embedded_data", "org.open-zfs:large_blocks", "org.illumos:sha512", "org.illumos:skein", NULL }; /* * List of all pools, chained through spa_link. */ static spa_list_t zfs_pools; static uint64_t zfs_crc64_table[256]; -static const dnode_phys_t *dnode_cache_obj = 0; +static const dnode_phys_t *dnode_cache_obj = NULL; static uint64_t dnode_cache_bn; static char *dnode_cache_buf; static char *zap_scratch; static char *zfs_temp_buf, *zfs_temp_end, *zfs_temp_ptr; #define TEMP_SIZE (1024 * 1024) static int zio_read(const spa_t *spa, const blkptr_t *bp, void *buf); static int zfs_get_root(const spa_t *spa, uint64_t *objid); static int zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result); static int zap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t integer_size, uint64_t num_integers, void *value); static void zfs_init(void) { STAILQ_INIT(&zfs_vdevs); STAILQ_INIT(&zfs_pools); zfs_temp_buf = malloc(TEMP_SIZE); zfs_temp_end = zfs_temp_buf + TEMP_SIZE; zfs_temp_ptr = zfs_temp_buf; dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE); zap_scratch = malloc(SPA_MAXBLOCKSIZE); zfs_init_crc(); } static void * zfs_alloc(size_t size) { char *ptr; if (zfs_temp_ptr + size > zfs_temp_end) { printf("ZFS: out of temporary buffer space\n"); for (;;) ; } ptr = zfs_temp_ptr; zfs_temp_ptr += size; return (ptr); } static void zfs_free(void *ptr, size_t size) { zfs_temp_ptr -= size; if (zfs_temp_ptr != ptr) { printf("ZFS: zfs_alloc()/zfs_free() mismatch\n"); for (;;) ; } } static int xdr_int(const unsigned char **xdr, int *ip) { *ip = ((*xdr)[0] << 24) | ((*xdr)[1] << 16) | ((*xdr)[2] << 8) | ((*xdr)[3] << 0); (*xdr) += 4; return (0); } static int xdr_u_int(const unsigned char **xdr, u_int *ip) { *ip = ((*xdr)[0] << 24) | ((*xdr)[1] << 16) | ((*xdr)[2] << 8) | ((*xdr)[3] << 0); (*xdr) += 4; return (0); } static int xdr_uint64_t(const unsigned char **xdr, uint64_t *lp) { u_int hi, lo; xdr_u_int(xdr, &hi); xdr_u_int(xdr, &lo); *lp = (((uint64_t) hi) << 32) | lo; return (0); } static int nvlist_find(const unsigned char *nvlist, const char *name, int type, int* elementsp, void *valuep) { const unsigned char *p, *pair; int junk; int encoded_size, decoded_size; p = nvlist; xdr_int(&p, &junk); xdr_int(&p, &junk); pair = p; xdr_int(&p, &encoded_size); xdr_int(&p, 
&decoded_size); while (encoded_size && decoded_size) { int namelen, pairtype, elements; const char *pairname; xdr_int(&p, &namelen); pairname = (const char*) p; p += roundup(namelen, 4); xdr_int(&p, &pairtype); if (!memcmp(name, pairname, namelen) && type == pairtype) { xdr_int(&p, &elements); if (elementsp) *elementsp = elements; if (type == DATA_TYPE_UINT64) { xdr_uint64_t(&p, (uint64_t *) valuep); return (0); } else if (type == DATA_TYPE_STRING) { int len; xdr_int(&p, &len); (*(const char**) valuep) = (const char*) p; return (0); } else if (type == DATA_TYPE_NVLIST || type == DATA_TYPE_NVLIST_ARRAY) { (*(const unsigned char**) valuep) = (const unsigned char*) p; return (0); } else { return (EIO); } } else { /* * Not the pair we are looking for, skip to the next one. */ p = pair + encoded_size; } pair = p; xdr_int(&p, &encoded_size); xdr_int(&p, &decoded_size); } return (EIO); } static int nvlist_check_features_for_read(const unsigned char *nvlist) { const unsigned char *p, *pair; int junk; int encoded_size, decoded_size; int rc; rc = 0; p = nvlist; xdr_int(&p, &junk); xdr_int(&p, &junk); pair = p; xdr_int(&p, &encoded_size); xdr_int(&p, &decoded_size); while (encoded_size && decoded_size) { int namelen, pairtype; const char *pairname; int i, found; found = 0; xdr_int(&p, &namelen); pairname = (const char*) p; p += roundup(namelen, 4); xdr_int(&p, &pairtype); for (i = 0; features_for_read[i] != NULL; i++) { if (!memcmp(pairname, features_for_read[i], namelen)) { found = 1; break; } } if (!found) { printf("ZFS: unsupported feature: %s\n", pairname); rc = EIO; } p = pair + encoded_size; pair = p; xdr_int(&p, &encoded_size); xdr_int(&p, &decoded_size); } return (rc); } /* * Return the next nvlist in an nvlist array. */ static const unsigned char * nvlist_next(const unsigned char *nvlist) { const unsigned char *p, *pair; int junk; int encoded_size, decoded_size; p = nvlist; xdr_int(&p, &junk); xdr_int(&p, &junk); pair = p; xdr_int(&p, &encoded_size); xdr_int(&p, &decoded_size); while (encoded_size && decoded_size) { p = pair + encoded_size; pair = p; xdr_int(&p, &encoded_size); xdr_int(&p, &decoded_size); } return p; } #ifdef TEST static const unsigned char * nvlist_print(const unsigned char *nvlist, unsigned int indent) { static const char* typenames[] = { "DATA_TYPE_UNKNOWN", "DATA_TYPE_BOOLEAN", "DATA_TYPE_BYTE", "DATA_TYPE_INT16", "DATA_TYPE_UINT16", "DATA_TYPE_INT32", "DATA_TYPE_UINT32", "DATA_TYPE_INT64", "DATA_TYPE_UINT64", "DATA_TYPE_STRING", "DATA_TYPE_BYTE_ARRAY", "DATA_TYPE_INT16_ARRAY", "DATA_TYPE_UINT16_ARRAY", "DATA_TYPE_INT32_ARRAY", "DATA_TYPE_UINT32_ARRAY", "DATA_TYPE_INT64_ARRAY", "DATA_TYPE_UINT64_ARRAY", "DATA_TYPE_STRING_ARRAY", "DATA_TYPE_HRTIME", "DATA_TYPE_NVLIST", "DATA_TYPE_NVLIST_ARRAY", "DATA_TYPE_BOOLEAN_VALUE", "DATA_TYPE_INT8", "DATA_TYPE_UINT8", "DATA_TYPE_BOOLEAN_ARRAY", "DATA_TYPE_INT8_ARRAY", "DATA_TYPE_UINT8_ARRAY" }; unsigned int i, j; const unsigned char *p, *pair; int junk; int encoded_size, decoded_size; p = nvlist; xdr_int(&p, &junk); xdr_int(&p, &junk); pair = p; xdr_int(&p, &encoded_size); xdr_int(&p, &decoded_size); while (encoded_size && decoded_size) { int namelen, pairtype, elements; const char *pairname; xdr_int(&p, &namelen); pairname = (const char*) p; p += roundup(namelen, 4); xdr_int(&p, &pairtype); for (i = 0; i < indent; i++) printf(" "); printf("%s %s", typenames[pairtype], pairname); xdr_int(&p, &elements); switch (pairtype) { case DATA_TYPE_UINT64: { uint64_t val; xdr_uint64_t(&p, &val); printf(" = 0x%jx\n", (uintmax_t)val); break; } 
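		/*
		 * Layout sketch (simplified, for orientation only): every
		 * reader in this file -- nvlist_find(), nvlist_next() and
		 * this printer -- walks the same XDR pair framing:
		 *
		 *	int	encoded_size	size of the whole pair
		 *	int	decoded_size	nominal decoded size (not
		 *				used for navigation here)
		 *	int	namelen
		 *	char	name[roundup(namelen, 4)]
		 *	int	pairtype	DATA_TYPE_* constant
		 *	int	elements	element count
		 *	...	value		layout depends on pairtype
		 *
		 * A pair with encoded_size == 0 && decoded_size == 0 ends
		 * the list, and "p = pair + encoded_size" skips to the next
		 * pair without decoding the value.
		 */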
case DATA_TYPE_STRING: { int len; xdr_int(&p, &len); printf(" = \"%s\"\n", p); break; } case DATA_TYPE_NVLIST: printf("\n"); nvlist_print(p, indent + 1); break; case DATA_TYPE_NVLIST_ARRAY: for (j = 0; j < elements; j++) { printf("[%d]\n", j); p = nvlist_print(p, indent + 1); if (j != elements - 1) { for (i = 0; i < indent; i++) printf(" "); printf("%s %s", typenames[pairtype], pairname); } } break; default: printf("\n"); } p = pair + encoded_size; pair = p; xdr_int(&p, &encoded_size); xdr_int(&p, &decoded_size); } return p; } #endif static int vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf, off_t offset, size_t size) { size_t psize; int rc; if (!vdev->v_phys_read) return (EIO); if (bp) { psize = BP_GET_PSIZE(bp); } else { psize = size; } /*printf("ZFS: reading %d bytes at 0x%jx to %p\n", psize, (uintmax_t)offset, buf);*/ rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize); if (rc) return (rc); if (bp && zio_checksum_verify(vdev->spa, bp, buf)) return (EIO); return (0); } static int vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf, off_t offset, size_t bytes) { return (vdev_read_phys(vdev, bp, buf, offset + VDEV_LABEL_START_SIZE, bytes)); } static int vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf, off_t offset, size_t bytes) { vdev_t *kid; int rc; rc = EIO; STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) { if (kid->v_state != VDEV_STATE_HEALTHY) continue; rc = kid->v_read(kid, bp, buf, offset, bytes); if (!rc) return (0); } return (rc); } static int vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf, off_t offset, size_t bytes) { vdev_t *kid; /* * Here we should have two kids: * First one which is the one we are replacing and we can trust * only this one to have valid data, but it might not be present. * Second one is that one we are replacing with. It is most likely * healthy, but we can't trust it has needed data, so we won't use it. 
*/ kid = STAILQ_FIRST(&vdev->v_children); if (kid == NULL) return (EIO); if (kid->v_state != VDEV_STATE_HEALTHY) return (EIO); return (kid->v_read(kid, bp, buf, offset, bytes)); } static vdev_t * vdev_find(uint64_t guid) { vdev_t *vdev; STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink) if (vdev->v_guid == guid) return (vdev); return (0); } static vdev_t * vdev_create(uint64_t guid, vdev_read_t *read) { vdev_t *vdev; vdev = malloc(sizeof(vdev_t)); memset(vdev, 0, sizeof(vdev_t)); STAILQ_INIT(&vdev->v_children); vdev->v_guid = guid; vdev->v_state = VDEV_STATE_OFFLINE; vdev->v_read = read; vdev->v_phys_read = 0; vdev->v_read_priv = 0; STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink); return (vdev); } static int vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev, vdev_t **vdevp, int is_newer) { int rc; uint64_t guid, id, ashift, nparity; const char *type; const char *path; vdev_t *vdev, *kid; const unsigned char *kids; int nkids, i, is_new; uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present; if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64, 0, &guid) || nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, 0, &id) || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING, 0, &type)) { printf("ZFS: can't find vdev details\n"); return (ENOENT); } if (strcmp(type, VDEV_TYPE_MIRROR) && strcmp(type, VDEV_TYPE_DISK) #ifdef ZFS_TEST && strcmp(type, VDEV_TYPE_FILE) #endif && strcmp(type, VDEV_TYPE_RAIDZ) && strcmp(type, VDEV_TYPE_REPLACING)) { printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n"); return (EIO); } is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0; nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, 0, &is_offline); nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, 0, &is_removed); nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, 0, &is_faulted); nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, 0, &is_degraded); nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, 0, &isnt_present); vdev = vdev_find(guid); if (!vdev) { is_new = 1; if (!strcmp(type, VDEV_TYPE_MIRROR)) vdev = vdev_create(guid, vdev_mirror_read); else if (!strcmp(type, VDEV_TYPE_RAIDZ)) vdev = vdev_create(guid, vdev_raidz_read); else if (!strcmp(type, VDEV_TYPE_REPLACING)) vdev = vdev_create(guid, vdev_replacing_read); else vdev = vdev_create(guid, vdev_disk_read); vdev->v_id = id; vdev->v_top = pvdev != NULL ? pvdev : vdev; if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT, DATA_TYPE_UINT64, 0, &ashift) == 0) vdev->v_ashift = ashift; else vdev->v_ashift = 0; if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY, DATA_TYPE_UINT64, 0, &nparity) == 0) vdev->v_nparity = nparity; else vdev->v_nparity = 0; if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH, DATA_TYPE_STRING, 0, &path) == 0) { if (strncmp(path, "/dev/", 5) == 0) path += 5; vdev->v_name = strdup(path); } else { if (!strcmp(type, "raidz")) { if (vdev->v_nparity == 1) vdev->v_name = "raidz1"; else if (vdev->v_nparity == 2) vdev->v_name = "raidz2"; else if (vdev->v_nparity == 3) vdev->v_name = "raidz3"; else { printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n"); return (EIO); } } else { vdev->v_name = strdup(type); } } } else { is_new = 0; } if (is_new || is_newer) { /* * This is either new vdev or we've already seen this vdev, * but from an older vdev label, so let's refresh its state * from the newer label. 
*/ if (is_offline) vdev->v_state = VDEV_STATE_OFFLINE; else if (is_removed) vdev->v_state = VDEV_STATE_REMOVED; else if (is_faulted) vdev->v_state = VDEV_STATE_FAULTED; else if (is_degraded) vdev->v_state = VDEV_STATE_DEGRADED; else if (isnt_present) vdev->v_state = VDEV_STATE_CANT_OPEN; } rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY, &nkids, &kids); /* * Its ok if we don't have any kids. */ if (rc == 0) { vdev->v_nchildren = nkids; for (i = 0; i < nkids; i++) { rc = vdev_init_from_nvlist(kids, vdev, &kid, is_newer); if (rc) return (rc); if (is_new) STAILQ_INSERT_TAIL(&vdev->v_children, kid, v_childlink); kids = nvlist_next(kids); } } else { vdev->v_nchildren = 0; } if (vdevp) *vdevp = vdev; return (0); } static void vdev_set_state(vdev_t *vdev) { vdev_t *kid; int good_kids; int bad_kids; /* * A mirror or raidz is healthy if all its kids are healthy. A * mirror is degraded if any of its kids is healthy; a raidz * is degraded if at most nparity kids are offline. */ if (STAILQ_FIRST(&vdev->v_children)) { good_kids = 0; bad_kids = 0; STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) { if (kid->v_state == VDEV_STATE_HEALTHY) good_kids++; else bad_kids++; } if (bad_kids == 0) { vdev->v_state = VDEV_STATE_HEALTHY; } else { if (vdev->v_read == vdev_mirror_read) { if (good_kids) { vdev->v_state = VDEV_STATE_DEGRADED; } else { vdev->v_state = VDEV_STATE_OFFLINE; } } else if (vdev->v_read == vdev_raidz_read) { if (bad_kids > vdev->v_nparity) { vdev->v_state = VDEV_STATE_OFFLINE; } else { vdev->v_state = VDEV_STATE_DEGRADED; } } } } } static spa_t * spa_find_by_guid(uint64_t guid) { spa_t *spa; STAILQ_FOREACH(spa, &zfs_pools, spa_link) if (spa->spa_guid == guid) return (spa); return (0); } static spa_t * spa_find_by_name(const char *name) { spa_t *spa; STAILQ_FOREACH(spa, &zfs_pools, spa_link) if (!strcmp(spa->spa_name, name)) return (spa); return (0); } #ifdef BOOT2 static spa_t * spa_get_primary(void) { return (STAILQ_FIRST(&zfs_pools)); } static vdev_t * spa_get_primary_vdev(const spa_t *spa) { vdev_t *vdev; vdev_t *kid; if (spa == NULL) spa = spa_get_primary(); if (spa == NULL) return (NULL); vdev = STAILQ_FIRST(&spa->spa_vdevs); if (vdev == NULL) return (NULL); for (kid = STAILQ_FIRST(&vdev->v_children); kid != NULL; kid = STAILQ_FIRST(&vdev->v_children)) vdev = kid; return (vdev); } #endif static spa_t * spa_create(uint64_t guid) { spa_t *spa; spa = malloc(sizeof(spa_t)); memset(spa, 0, sizeof(spa_t)); STAILQ_INIT(&spa->spa_vdevs); spa->spa_guid = guid; STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link); return (spa); } static const char * state_name(vdev_state_t state) { static const char* names[] = { "UNKNOWN", "CLOSED", "OFFLINE", "REMOVED", "CANT_OPEN", "FAULTED", "DEGRADED", "ONLINE" }; return names[state]; } #ifdef BOOT2 #define pager_printf printf #else static int pager_printf(const char *fmt, ...) 
{ char line[80]; va_list args; va_start(args, fmt); vsprintf(line, fmt, args); va_end(args); return (pager_output(line)); } #endif #define STATUS_FORMAT " %s %s\n" static int print_state(int indent, const char *name, vdev_state_t state) { int i; char buf[512]; buf[0] = 0; for (i = 0; i < indent; i++) strcat(buf, " "); strcat(buf, name); return (pager_printf(STATUS_FORMAT, buf, state_name(state))); } static int vdev_status(vdev_t *vdev, int indent) { vdev_t *kid; int ret; ret = print_state(indent, vdev->v_name, vdev->v_state); if (ret != 0) return (ret); STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) { ret = vdev_status(kid, indent + 1); if (ret != 0) return (ret); } return (ret); } static int spa_status(spa_t *spa) { static char bootfs[ZFS_MAXNAMELEN]; uint64_t rootid; vdev_t *vdev; int good_kids, bad_kids, degraded_kids, ret; vdev_state_t state; ret = pager_printf(" pool: %s\n", spa->spa_name); if (ret != 0) return (ret); if (zfs_get_root(spa, &rootid) == 0 && zfs_rlookup(spa, rootid, bootfs) == 0) { if (bootfs[0] == '\0') ret = pager_printf("bootfs: %s\n", spa->spa_name); else ret = pager_printf("bootfs: %s/%s\n", spa->spa_name, bootfs); if (ret != 0) return (ret); } ret = pager_printf("config:\n\n"); if (ret != 0) return (ret); ret = pager_printf(STATUS_FORMAT, "NAME", "STATE"); if (ret != 0) return (ret); good_kids = 0; degraded_kids = 0; bad_kids = 0; STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) { if (vdev->v_state == VDEV_STATE_HEALTHY) good_kids++; else if (vdev->v_state == VDEV_STATE_DEGRADED) degraded_kids++; else bad_kids++; } state = VDEV_STATE_CLOSED; if (good_kids > 0 && (degraded_kids + bad_kids) == 0) state = VDEV_STATE_HEALTHY; else if ((good_kids + degraded_kids) > 0) state = VDEV_STATE_DEGRADED; ret = print_state(0, spa->spa_name, state); if (ret != 0) return (ret); STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) { ret = vdev_status(vdev, 1); if (ret != 0) return (ret); } return (ret); } static int spa_all_status(void) { spa_t *spa; int first = 1, ret = 0; STAILQ_FOREACH(spa, &zfs_pools, spa_link) { if (!first) { ret = pager_printf("\n"); if (ret != 0) return (ret); } first = 0; ret = spa_status(spa); if (ret != 0) return (ret); } return (ret); } static int vdev_probe(vdev_phys_read_t *read, void *read_priv, spa_t **spap) { vdev_t vtmp; vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch; spa_t *spa; vdev_t *vdev, *top_vdev, *pool_vdev; off_t off; blkptr_t bp; const unsigned char *nvlist; uint64_t val; uint64_t guid; uint64_t pool_txg, pool_guid; uint64_t is_log; const char *pool_name; const unsigned char *vdevs; const unsigned char *features; int i, rc, is_newer; char *upbuf; const struct uberblock *up; /* * Load the vdev label and figure out which * uberblock is most current. 
*/ memset(&vtmp, 0, sizeof(vtmp)); vtmp.v_phys_read = read; vtmp.v_read_priv = read_priv; off = offsetof(vdev_label_t, vl_vdev_phys); BP_ZERO(&bp); BP_SET_LSIZE(&bp, sizeof(vdev_phys_t)); BP_SET_PSIZE(&bp, sizeof(vdev_phys_t)); BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL); BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); DVA_SET_OFFSET(BP_IDENTITY(&bp), off); ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0); if (vdev_read_phys(&vtmp, &bp, vdev_label, off, 0)) return (EIO); if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) { return (EIO); } nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4; if (nvlist_find(nvlist, ZPOOL_CONFIG_VERSION, DATA_TYPE_UINT64, 0, &val)) { return (EIO); } if (!SPA_VERSION_IS_SUPPORTED(val)) { printf("ZFS: unsupported ZFS version %u (should be %u)\n", (unsigned) val, (unsigned) SPA_VERSION); return (EIO); } /* Check ZFS features for read */ if (nvlist_find(nvlist, ZPOOL_CONFIG_FEATURES_FOR_READ, DATA_TYPE_NVLIST, 0, &features) == 0 && nvlist_check_features_for_read(features) != 0) return (EIO); if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_STATE, DATA_TYPE_UINT64, 0, &val)) { return (EIO); } if (val == POOL_STATE_DESTROYED) { /* We don't boot only from destroyed pools. */ return (EIO); } if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64, 0, &pool_txg) || nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64, 0, &pool_guid) || nvlist_find(nvlist, ZPOOL_CONFIG_POOL_NAME, DATA_TYPE_STRING, 0, &pool_name)) { /* * Cache and spare devices end up here - just ignore * them. */ /*printf("ZFS: can't find pool details\n");*/ return (EIO); } is_log = 0; (void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, 0, &is_log); if (is_log) return (EIO); /* * Create the pool if this is the first time we've seen it. */ spa = spa_find_by_guid(pool_guid); if (!spa) { spa = spa_create(pool_guid); spa->spa_name = strdup(pool_name); } if (pool_txg > spa->spa_txg) { spa->spa_txg = pool_txg; is_newer = 1; } else is_newer = 0; /* * Get the vdev tree and create our in-core copy of it. * If we already have a vdev with this guid, this must * be some kind of alias (overlapping slices, dangerously dedicated * disks etc). */ if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64, 0, &guid)) { return (EIO); } vdev = vdev_find(guid); if (vdev && vdev->v_phys_read) /* Has this vdev already been inited? */ return (EIO); if (nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST, 0, &vdevs)) { return (EIO); } rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer); if (rc) return (rc); /* * Add the toplevel vdev to the pool if its not already there. */ STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink) if (top_vdev == pool_vdev) break; if (!pool_vdev && top_vdev) { top_vdev->spa = spa; STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink); } /* * We should already have created an incomplete vdev for this * vdev. Find it and initialise it with our read proc. */ vdev = vdev_find(guid); if (vdev) { vdev->v_phys_read = read; vdev->v_read_priv = read_priv; vdev->v_state = VDEV_STATE_HEALTHY; } else { printf("ZFS: inconsistent nvlist contents\n"); return (EIO); } /* * Re-evaluate top-level vdev state. */ vdev_set_state(top_vdev); /* * Ok, we are happy with the pool so far. Lets find * the best uberblock and then we can actually access * the contents of the pool. 
*/ upbuf = zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev)); up = (const struct uberblock *)upbuf; for (i = 0; i < VDEV_UBERBLOCK_COUNT(vdev); i++) { off = VDEV_UBERBLOCK_OFFSET(vdev, i); BP_ZERO(&bp); DVA_SET_OFFSET(&bp.blk_dva[0], off); BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev)); BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev)); BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL); BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0); if (vdev_read_phys(vdev, &bp, upbuf, off, 0)) continue; if (up->ub_magic != UBERBLOCK_MAGIC) continue; if (up->ub_txg < spa->spa_txg) continue; if (up->ub_txg > spa->spa_uberblock.ub_txg) { spa->spa_uberblock = *up; } else if (up->ub_txg == spa->spa_uberblock.ub_txg) { if (up->ub_timestamp > spa->spa_uberblock.ub_timestamp) spa->spa_uberblock = *up; } } zfs_free(upbuf, VDEV_UBERBLOCK_SIZE(vdev)); vdev->spa = spa; if (spap) *spap = spa; return (0); } static int ilog2(int n) { int v; for (v = 0; v < 32; v++) if (n == (1 << v)) return v; return -1; } static int zio_read_gang(const spa_t *spa, const blkptr_t *bp, void *buf) { blkptr_t gbh_bp; zio_gbh_phys_t zio_gb; char *pbuf; int i; /* Artificial BP for gang block header. */ gbh_bp = *bp; BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE); BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE); BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER); BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF); for (i = 0; i < SPA_DVAS_PER_BP; i++) DVA_SET_GANG(&gbh_bp.blk_dva[i], 0); /* Read gang header block using the artificial BP. */ if (zio_read(spa, &gbh_bp, &zio_gb)) return (EIO); pbuf = buf; for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { blkptr_t *gbp = &zio_gb.zg_blkptr[i]; if (BP_IS_HOLE(gbp)) continue; if (zio_read(spa, gbp, pbuf)) return (EIO); pbuf += BP_GET_PSIZE(gbp); } if (zio_checksum_verify(spa, bp, buf)) return (EIO); return (0); } static int zio_read(const spa_t *spa, const blkptr_t *bp, void *buf) { int cpfunc = BP_GET_COMPRESS(bp); uint64_t align, size; void *pbuf; int i, error; /* * Process data embedded in block pointer */ if (BP_IS_EMBEDDED(bp)) { ASSERT(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); size = BPE_GET_PSIZE(bp); ASSERT(size <= BPE_PAYLOAD_SIZE); if (cpfunc != ZIO_COMPRESS_OFF) pbuf = zfs_alloc(size); else pbuf = buf; decode_embedded_bp_compressed(bp, pbuf); error = 0; if (cpfunc != ZIO_COMPRESS_OFF) { error = zio_decompress_data(cpfunc, pbuf, size, buf, BP_GET_LSIZE(bp)); zfs_free(pbuf, size); } if (error != 0) printf("ZFS: i/o error - unable to decompress block pointer data, error %d\n", error); return (error); } error = EIO; for (i = 0; i < SPA_DVAS_PER_BP; i++) { const dva_t *dva = &bp->blk_dva[i]; vdev_t *vdev; int vdevid; off_t offset; if (!dva->dva_word[0] && !dva->dva_word[1]) continue; vdevid = DVA_GET_VDEV(dva); offset = DVA_GET_OFFSET(dva); STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) { if (vdev->v_id == vdevid) break; } if (!vdev || !vdev->v_read) continue; size = BP_GET_PSIZE(bp); if (vdev->v_read == vdev_raidz_read) { align = 1ULL << vdev->v_top->v_ashift; if (P2PHASE(size, align) != 0) size = P2ROUNDUP(size, align); } if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF) pbuf = zfs_alloc(size); else pbuf = buf; if (DVA_GET_GANG(dva)) error = zio_read_gang(spa, bp, pbuf); else error = vdev->v_read(vdev, bp, pbuf, offset, size); if (error == 0) { if (cpfunc != ZIO_COMPRESS_OFF) error = zio_decompress_data(cpfunc, pbuf, BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp)); else if (size != BP_GET_PSIZE(bp)) bcopy(pbuf, buf, BP_GET_PSIZE(bp)); } if (buf != pbuf) zfs_free(pbuf, size); if (error == 0) break; } 
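	/*
	 * Sizing sketch (illustrative): for raidz children the loop above
	 * pads the physical read size out to the top-level vdev's
	 * allocation unit before picking a bounce buffer, e.g. assuming
	 * ashift = 12 (4 KiB sectors):
	 *
	 *	align = 1ULL << 12;		(0x1000)
	 *	P2ROUNDUP(0x600, align) == 0x1000
	 *
	 * so a 0x600-byte psize is read as a full 0x1000 bytes into pbuf,
	 * and only BP_GET_PSIZE(bp) bytes (or the decompressed
	 * BP_GET_LSIZE(bp) bytes) are copied back to the caller's buf.
	 * Each DVA is tried in turn; the first copy that reads and
	 * checksums cleanly breaks out of the loop.
	 */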
if (error != 0) printf("ZFS: i/o error - all block copies unavailable\n"); return (error); } static int dnode_read(const spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf, size_t buflen) { int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT; int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; int nlevels = dnode->dn_nlevels; int i, rc; if (bsize > SPA_MAXBLOCKSIZE) { printf("ZFS: I/O error - blocks larger than %llu are not " "supported\n", SPA_MAXBLOCKSIZE); return (EIO); } /* * Note: bsize may not be a power of two here so we need to do an * actual divide rather than a bitshift. */ while (buflen > 0) { uint64_t bn = offset / bsize; int boff = offset % bsize; int ibn; const blkptr_t *indbp; blkptr_t bp; if (bn > dnode->dn_maxblkid) return (EIO); if (dnode == dnode_cache_obj && bn == dnode_cache_bn) goto cached; indbp = dnode->dn_blkptr; for (i = 0; i < nlevels; i++) { /* * Copy the bp from the indirect array so that * we can re-use the scratch buffer for multi-level * objects. */ ibn = bn >> ((nlevels - i - 1) * ibshift); ibn &= ((1 << ibshift) - 1); bp = indbp[ibn]; if (BP_IS_HOLE(&bp)) { memset(dnode_cache_buf, 0, bsize); break; } rc = zio_read(spa, &bp, dnode_cache_buf); if (rc) return (rc); indbp = (const blkptr_t *) dnode_cache_buf; } dnode_cache_obj = dnode; dnode_cache_bn = bn; cached: /* * The buffer contains our data block. Copy what we * need from it and loop. */ i = bsize - boff; if (i > buflen) i = buflen; memcpy(buf, &dnode_cache_buf[boff], i); buf = ((char*) buf) + i; offset += i; buflen -= i; } return (0); } /* * Lookup a value in a microzap directory. Assumes that the zap * scratch buffer contains the directory contents. */ static int mzap_lookup(const dnode_phys_t *dnode, const char *name, uint64_t *value) { const mzap_phys_t *mz; const mzap_ent_phys_t *mze; size_t size; int chunks, i; /* * Microzap objects use exactly one block. Read the whole * thing. */ size = dnode->dn_datablkszsec * 512; mz = (const mzap_phys_t *) zap_scratch; chunks = size / MZAP_ENT_LEN - 1; for (i = 0; i < chunks; i++) { mze = &mz->mz_chunk[i]; if (!strcmp(mze->mze_name, name)) { *value = mze->mze_value; return (0); } } return (ENOENT); } /* * Compare a name with a zap leaf entry. Return non-zero if the name * matches. */ static int fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *name) { size_t namelen; const zap_leaf_chunk_t *nc; const char *p; namelen = zc->l_entry.le_name_numints; nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk); p = name; while (namelen > 0) { size_t len; len = namelen; if (len > ZAP_LEAF_ARRAY_BYTES) len = ZAP_LEAF_ARRAY_BYTES; if (memcmp(p, nc->l_array.la_array, len)) return (0); p += len; namelen -= len; nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next); } return 1; } /* * Extract a uint64_t value from a zap leaf entry. */ static uint64_t fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc) { const zap_leaf_chunk_t *vc; int i; uint64_t value; const uint8_t *p; vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk); for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) { value = (value << 8) | p[i]; } return value; } static void stv(int len, void *addr, uint64_t value) { switch (len) { case 1: *(uint8_t *)addr = value; return; case 2: *(uint16_t *)addr = value; return; case 4: *(uint32_t *)addr = value; return; case 8: *(uint64_t *)addr = value; return; } } /* * Extract a array from a zap leaf entry. 
*/ static void fzap_leaf_array(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, uint64_t integer_size, uint64_t num_integers, void *buf) { uint64_t array_int_len = zc->l_entry.le_value_intlen; uint64_t value = 0; uint64_t *u64 = buf; char *p = buf; int len = MIN(zc->l_entry.le_value_numints, num_integers); int chunk = zc->l_entry.le_value_chunk; int byten = 0; if (integer_size == 8 && len == 1) { *u64 = fzap_leaf_value(zl, zc); return; } while (len > 0) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(zl, chunk).l_array; int i; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(zl)); for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) { value = (value << 8) | la->la_array[i]; byten++; if (byten == array_int_len) { stv(integer_size, p, value); byten = 0; len--; if (len == 0) return; p += integer_size; } } chunk = la->la_next; } } /* * Lookup a value in a fatzap directory. Assumes that the zap scratch * buffer contains the directory header. */ static int fzap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t integer_size, uint64_t num_integers, void *value) { int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; zap_phys_t zh = *(zap_phys_t *) zap_scratch; fat_zap_t z; uint64_t *ptrtbl; uint64_t hash; int rc; if (zh.zap_magic != ZAP_MAGIC) return (EIO); z.zap_block_shift = ilog2(bsize); z.zap_phys = (zap_phys_t *) zap_scratch; /* * Figure out where the pointer table is and read it in if necessary. */ if (zh.zap_ptrtbl.zt_blk) { rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize, zap_scratch, bsize); if (rc) return (rc); ptrtbl = (uint64_t *) zap_scratch; } else { ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0); } hash = zap_hash(zh.zap_salt, name); zap_leaf_t zl; zl.l_bs = z.zap_block_shift; off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs; zap_leaf_chunk_t *zc; rc = dnode_read(spa, dnode, off, zap_scratch, bsize); if (rc) return (rc); zl.l_phys = (zap_leaf_phys_t *) zap_scratch; /* * Make sure this chunk matches our hash. */ if (zl.l_phys->l_hdr.lh_prefix_len > 0 && zl.l_phys->l_hdr.lh_prefix != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len)) return (ENOENT); /* * Hash within the chunk to find our entry. */ int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len); int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1); h = zl.l_phys->l_hash[h]; if (h == 0xffff) return (ENOENT); zc = &ZAP_LEAF_CHUNK(&zl, h); while (zc->l_entry.le_hash != hash) { if (zc->l_entry.le_next == 0xffff) { - zc = 0; + zc = NULL; break; } zc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_next); } if (fzap_name_equal(&zl, zc, name)) { if (zc->l_entry.le_value_intlen * zc->l_entry.le_value_numints > integer_size * num_integers) return (E2BIG); fzap_leaf_array(&zl, zc, integer_size, num_integers, value); return (0); } return (ENOENT); } /* * Lookup a name in a zap object and return its value as a uint64_t. */ static int zap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t integer_size, uint64_t num_integers, void *value) { int rc; uint64_t zap_type; size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; rc = dnode_read(spa, dnode, 0, zap_scratch, size); if (rc) return (rc); zap_type = *(uint64_t *) zap_scratch; if (zap_type == ZBT_MICRO) return mzap_lookup(dnode, name, value); else if (zap_type == ZBT_HEADER) { return fzap_lookup(spa, dnode, name, integer_size, num_integers, value); } printf("ZFS: invalid zap_type=%d\n", (int)zap_type); return (EIO); } /* * List a microzap directory. 
Assumes that the zap scratch buffer contains * the directory contents. */ static int mzap_list(const dnode_phys_t *dnode, int (*callback)(const char *, uint64_t)) { const mzap_phys_t *mz; const mzap_ent_phys_t *mze; size_t size; int chunks, i, rc; /* * Microzap objects use exactly one block. Read the whole * thing. */ size = dnode->dn_datablkszsec * 512; mz = (const mzap_phys_t *) zap_scratch; chunks = size / MZAP_ENT_LEN - 1; for (i = 0; i < chunks; i++) { mze = &mz->mz_chunk[i]; if (mze->mze_name[0]) { rc = callback(mze->mze_name, mze->mze_value); if (rc != 0) return (rc); } } return (0); } /* * List a fatzap directory. Assumes that the zap scratch buffer contains * the directory header. */ static int fzap_list(const spa_t *spa, const dnode_phys_t *dnode, int (*callback)(const char *, uint64_t)) { int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; zap_phys_t zh = *(zap_phys_t *) zap_scratch; fat_zap_t z; int i, j, rc; if (zh.zap_magic != ZAP_MAGIC) return (EIO); z.zap_block_shift = ilog2(bsize); z.zap_phys = (zap_phys_t *) zap_scratch; /* * This assumes that the leaf blocks start at block 1. The * documentation isn't exactly clear on this. */ zap_leaf_t zl; zl.l_bs = z.zap_block_shift; for (i = 0; i < zh.zap_num_leafs; i++) { off_t off = (i + 1) << zl.l_bs; char name[256], *p; uint64_t value; if (dnode_read(spa, dnode, off, zap_scratch, bsize)) return (EIO); zl.l_phys = (zap_leaf_phys_t *) zap_scratch; for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) { zap_leaf_chunk_t *zc, *nc; int namelen; zc = &ZAP_LEAF_CHUNK(&zl, j); if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY) continue; namelen = zc->l_entry.le_name_numints; if (namelen > sizeof(name)) namelen = sizeof(name); /* * Paste the name back together. */ nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk); p = name; while (namelen > 0) { int len; len = namelen; if (len > ZAP_LEAF_ARRAY_BYTES) len = ZAP_LEAF_ARRAY_BYTES; memcpy(p, nc->l_array.la_array, len); p += len; namelen -= len; nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next); } /* * Assume the first eight bytes of the value are * a uint64_t. */ value = fzap_leaf_value(&zl, zc); //printf("%s 0x%jx\n", name, (uintmax_t)value); rc = callback((const char *)name, value); if (rc != 0) return (rc); } } return (0); } static int zfs_printf(const char *name, uint64_t value __unused) { printf("%s\n", name); return (0); } /* * List a zap directory. */ static int zap_list(const spa_t *spa, const dnode_phys_t *dnode) { uint64_t zap_type; size_t size = dnode->dn_datablkszsec * 512; if (dnode_read(spa, dnode, 0, zap_scratch, size)) return (EIO); zap_type = *(uint64_t *) zap_scratch; if (zap_type == ZBT_MICRO) return mzap_list(dnode, zfs_printf); else return fzap_list(spa, dnode, zfs_printf); } static int objset_get_dnode(const spa_t *spa, const objset_phys_t *os, uint64_t objnum, dnode_phys_t *dnode) { off_t offset; offset = objnum * sizeof(dnode_phys_t); return dnode_read(spa, &os->os_meta_dnode, offset, dnode, sizeof(dnode_phys_t)); } static int mzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value) { const mzap_phys_t *mz; const mzap_ent_phys_t *mze; size_t size; int chunks, i; /* * Microzap objects use exactly one block. Read the whole * thing. 
*/ size = dnode->dn_datablkszsec * 512; mz = (const mzap_phys_t *) zap_scratch; chunks = size / MZAP_ENT_LEN - 1; for (i = 0; i < chunks; i++) { mze = &mz->mz_chunk[i]; if (value == mze->mze_value) { strcpy(name, mze->mze_name); return (0); } } return (ENOENT); } static void fzap_name_copy(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, char *name) { size_t namelen; const zap_leaf_chunk_t *nc; char *p; namelen = zc->l_entry.le_name_numints; nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk); p = name; while (namelen > 0) { size_t len; len = namelen; if (len > ZAP_LEAF_ARRAY_BYTES) len = ZAP_LEAF_ARRAY_BYTES; memcpy(p, nc->l_array.la_array, len); p += len; namelen -= len; nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next); } *p = '\0'; } static int fzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value) { int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; zap_phys_t zh = *(zap_phys_t *) zap_scratch; fat_zap_t z; int i, j; if (zh.zap_magic != ZAP_MAGIC) return (EIO); z.zap_block_shift = ilog2(bsize); z.zap_phys = (zap_phys_t *) zap_scratch; /* * This assumes that the leaf blocks start at block 1. The * documentation isn't exactly clear on this. */ zap_leaf_t zl; zl.l_bs = z.zap_block_shift; for (i = 0; i < zh.zap_num_leafs; i++) { off_t off = (i + 1) << zl.l_bs; if (dnode_read(spa, dnode, off, zap_scratch, bsize)) return (EIO); zl.l_phys = (zap_leaf_phys_t *) zap_scratch; for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) { zap_leaf_chunk_t *zc; zc = &ZAP_LEAF_CHUNK(&zl, j); if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY) continue; if (zc->l_entry.le_value_intlen != 8 || zc->l_entry.le_value_numints != 1) continue; if (fzap_leaf_value(&zl, zc) == value) { fzap_name_copy(&zl, zc, name); return (0); } } } return (ENOENT); } static int zap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value) { int rc; uint64_t zap_type; size_t size = dnode->dn_datablkszsec * 512; rc = dnode_read(spa, dnode, 0, zap_scratch, size); if (rc) return (rc); zap_type = *(uint64_t *) zap_scratch; if (zap_type == ZBT_MICRO) return mzap_rlookup(spa, dnode, name, value); else return fzap_rlookup(spa, dnode, name, value); } static int zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result) { char name[256]; char component[256]; uint64_t dir_obj, parent_obj, child_dir_zapobj; dnode_phys_t child_dir_zap, dataset, dir, parent; dsl_dir_phys_t *dd; dsl_dataset_phys_t *ds; char *p; int len; p = &name[sizeof(name) - 1]; *p = '\0'; if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) { printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum); return (EIO); } ds = (dsl_dataset_phys_t *)&dataset.dn_bonus; dir_obj = ds->ds_dir_obj; for (;;) { if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir) != 0) return (EIO); dd = (dsl_dir_phys_t *)&dir.dn_bonus; /* Actual loop condition. */ parent_obj = dd->dd_parent_obj; if (parent_obj == 0) break; if (objset_get_dnode(spa, &spa->spa_mos, parent_obj, &parent) != 0) return (EIO); dd = (dsl_dir_phys_t *)&parent.dn_bonus; child_dir_zapobj = dd->dd_child_dir_zapobj; if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0) return (EIO); if (zap_rlookup(spa, &child_dir_zap, component, dir_obj) != 0) return (EIO); len = strlen(component); p -= len; memcpy(p, component, len); --p; *p = '/'; /* Actual loop iteration. 
*/ dir_obj = parent_obj; } if (*p != '\0') ++p; strcpy(result, p); return (0); } static int zfs_lookup_dataset(const spa_t *spa, const char *name, uint64_t *objnum) { char element[256]; uint64_t dir_obj, child_dir_zapobj; dnode_phys_t child_dir_zap, dir; dsl_dir_phys_t *dd; const char *p, *q; if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) return (EIO); if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, sizeof (dir_obj), 1, &dir_obj)) return (EIO); p = name; for (;;) { if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir)) return (EIO); dd = (dsl_dir_phys_t *)&dir.dn_bonus; while (*p == '/') p++; /* Actual loop condition #1. */ if (*p == '\0') break; q = strchr(p, '/'); if (q) { memcpy(element, p, q - p); element[q - p] = '\0'; p = q + 1; } else { strcpy(element, p); p += strlen(p); } child_dir_zapobj = dd->dd_child_dir_zapobj; if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0) return (EIO); /* Actual loop condition #2. */ if (zap_lookup(spa, &child_dir_zap, element, sizeof (dir_obj), 1, &dir_obj) != 0) return (ENOENT); } *objnum = dd->dd_head_dataset_obj; return (0); } #ifndef BOOT2 static int zfs_list_dataset(const spa_t *spa, uint64_t objnum/*, int pos, char *entry*/) { uint64_t dir_obj, child_dir_zapobj; dnode_phys_t child_dir_zap, dir, dataset; dsl_dataset_phys_t *ds; dsl_dir_phys_t *dd; if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) { printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum); return (EIO); } ds = (dsl_dataset_phys_t *) &dataset.dn_bonus; dir_obj = ds->ds_dir_obj; if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir)) { printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj); return (EIO); } dd = (dsl_dir_phys_t *)&dir.dn_bonus; child_dir_zapobj = dd->dd_child_dir_zapobj; if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0) { printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj); return (EIO); } return (zap_list(spa, &child_dir_zap) != 0); } int zfs_callback_dataset(const spa_t *spa, uint64_t objnum, int (*callback)(const char *, uint64_t)) { uint64_t dir_obj, child_dir_zapobj, zap_type; dnode_phys_t child_dir_zap, dir, dataset; dsl_dataset_phys_t *ds; dsl_dir_phys_t *dd; int err; err = objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset); if (err != 0) { printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum); return (err); } ds = (dsl_dataset_phys_t *) &dataset.dn_bonus; dir_obj = ds->ds_dir_obj; err = objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir); if (err != 0) { printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj); return (err); } dd = (dsl_dir_phys_t *)&dir.dn_bonus; child_dir_zapobj = dd->dd_child_dir_zapobj; err = objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap); if (err != 0) { printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj); return (err); } err = dnode_read(spa, &child_dir_zap, 0, zap_scratch, child_dir_zap.dn_datablkszsec * 512); if (err != 0) return (err); zap_type = *(uint64_t *) zap_scratch; if (zap_type == ZBT_MICRO) return mzap_list(&child_dir_zap, callback); else return fzap_list(spa, &child_dir_zap, callback); } #endif /* * Find the object set given the object number of its dataset object * and return its details in *objset */ static int zfs_mount_dataset(const spa_t *spa, uint64_t objnum, objset_phys_t *objset) { dnode_phys_t dataset; dsl_dataset_phys_t *ds; if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) { printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum); return 
(EIO); } ds = (dsl_dataset_phys_t *) &dataset.dn_bonus; if (zio_read(spa, &ds->ds_bp, objset)) { printf("ZFS: can't read object set for dataset %ju\n", (uintmax_t)objnum); return (EIO); } return (0); } /* * Find the object set pointed to by the BOOTFS property or the root * dataset if there is none and return its details in *objset */ static int zfs_get_root(const spa_t *spa, uint64_t *objid) { dnode_phys_t dir, propdir; uint64_t props, bootfs, root; *objid = 0; /* * Start with the MOS directory object. */ if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) { printf("ZFS: can't read MOS object directory\n"); return (EIO); } /* * Lookup the pool_props and see if we can find a bootfs. */ if (zap_lookup(spa, &dir, DMU_POOL_PROPS, sizeof (props), 1, &props) == 0 && objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0 && zap_lookup(spa, &propdir, "bootfs", sizeof (bootfs), 1, &bootfs) == 0 && bootfs != 0) { *objid = bootfs; return (0); } /* * Lookup the root dataset directory */ if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, sizeof (root), 1, &root) || objset_get_dnode(spa, &spa->spa_mos, root, &dir)) { printf("ZFS: can't find root dsl_dir\n"); return (EIO); } /* * Use the information from the dataset directory's bonus buffer * to find the dataset object and from that the object set itself. */ dsl_dir_phys_t *dd = (dsl_dir_phys_t *) &dir.dn_bonus; *objid = dd->dd_head_dataset_obj; return (0); } static int zfs_mount(const spa_t *spa, uint64_t rootobj, struct zfsmount *mount) { mount->spa = spa; /* * Find the root object set if not explicitly provided */ if (rootobj == 0 && zfs_get_root(spa, &rootobj)) { printf("ZFS: can't find root filesystem\n"); return (EIO); } if (zfs_mount_dataset(spa, rootobj, &mount->objset)) { printf("ZFS: can't open root filesystem\n"); return (EIO); } mount->rootobj = rootobj; return (0); } /* * callback function for feature name checks. */ static int check_feature(const char *name, uint64_t value) { int i; if (value == 0) return (0); if (name[0] == '\0') return (0); for (i = 0; features_for_read[i] != NULL; i++) { if (strcmp(name, features_for_read[i]) == 0) return (0); } printf("ZFS: unsupported feature: %s\n", name); return (EIO); } /* * Checks whether the MOS features that are active are supported. */ static int check_mos_features(const spa_t *spa) { dnode_phys_t dir; uint64_t objnum, zap_type; size_t size; int rc; if ((rc = objset_get_dnode(spa, &spa->spa_mos, DMU_OT_OBJECT_DIRECTORY, &dir)) != 0) return (rc); if ((rc = zap_lookup(spa, &dir, DMU_POOL_FEATURES_FOR_READ, sizeof (objnum), 1, &objnum)) != 0) { /* * It is older pool without features. As we have already * tested the label, just return without raising the error. 
*/ return (0); } if ((rc = objset_get_dnode(spa, &spa->spa_mos, objnum, &dir)) != 0) return (rc); if (dir.dn_type != DMU_OTN_ZAP_METADATA) return (EIO); size = dir.dn_datablkszsec * 512; if (dnode_read(spa, &dir, 0, zap_scratch, size)) return (EIO); zap_type = *(uint64_t *) zap_scratch; if (zap_type == ZBT_MICRO) rc = mzap_list(&dir, check_feature); else rc = fzap_list(spa, &dir, check_feature); return (rc); } static int zfs_spa_init(spa_t *spa) { dnode_phys_t dir; int rc; if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) { printf("ZFS: can't read MOS of pool %s\n", spa->spa_name); return (EIO); } if (spa->spa_mos.os_type != DMU_OST_META) { printf("ZFS: corrupted MOS of pool %s\n", spa->spa_name); return (EIO); } if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) { printf("ZFS: failed to read pool %s directory object\n", spa->spa_name); return (EIO); } /* this is allowed to fail, older pools do not have salt */ rc = zap_lookup(spa, &dir, DMU_POOL_CHECKSUM_SALT, 1, sizeof (spa->spa_cksum_salt.zcs_bytes), spa->spa_cksum_salt.zcs_bytes); rc = check_mos_features(spa); if (rc != 0) { printf("ZFS: pool %s is not supported\n", spa->spa_name); } return (rc); } static int zfs_dnode_stat(const spa_t *spa, dnode_phys_t *dn, struct stat *sb) { if (dn->dn_bonustype != DMU_OT_SA) { znode_phys_t *zp = (znode_phys_t *)dn->dn_bonus; sb->st_mode = zp->zp_mode; sb->st_uid = zp->zp_uid; sb->st_gid = zp->zp_gid; sb->st_size = zp->zp_size; } else { sa_hdr_phys_t *sahdrp; int hdrsize; size_t size = 0; void *buf = NULL; if (dn->dn_bonuslen != 0) sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn); else { if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) { blkptr_t *bp = &dn->dn_spill; int error; size = BP_GET_LSIZE(bp); buf = zfs_alloc(size); error = zio_read(spa, bp, buf); if (error != 0) { zfs_free(buf, size); return (error); } sahdrp = buf; } else { return (EIO); } } hdrsize = SA_HDR_SIZE(sahdrp); sb->st_mode = *(uint64_t *)((char *)sahdrp + hdrsize + SA_MODE_OFFSET); sb->st_uid = *(uint64_t *)((char *)sahdrp + hdrsize + SA_UID_OFFSET); sb->st_gid = *(uint64_t *)((char *)sahdrp + hdrsize + SA_GID_OFFSET); sb->st_size = *(uint64_t *)((char *)sahdrp + hdrsize + SA_SIZE_OFFSET); if (buf != NULL) zfs_free(buf, size); } return (0); } /* * Lookup a file and return its dnode. */ static int zfs_lookup(const struct zfsmount *mount, const char *upath, dnode_phys_t *dnode) { int rc; uint64_t objnum, rootnum, parentnum; const spa_t *spa; dnode_phys_t dn; const char *p, *q; char element[256]; char path[1024]; int symlinks_followed = 0; struct stat sb; spa = mount->spa; if (mount->objset.os_type != DMU_OST_ZFS) { printf("ZFS: unexpected object set type %ju\n", (uintmax_t)mount->objset.os_type); return (EIO); } /* * Get the root directory dnode. 
*/ rc = objset_get_dnode(spa, &mount->objset, MASTER_NODE_OBJ, &dn); if (rc) return (rc); rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, sizeof (rootnum), 1, &rootnum); if (rc) return (rc); rc = objset_get_dnode(spa, &mount->objset, rootnum, &dn); if (rc) return (rc); objnum = rootnum; p = upath; while (p && *p) { while (*p == '/') p++; if (!*p) break; q = strchr(p, '/'); if (q) { memcpy(element, p, q - p); element[q - p] = 0; p = q; } else { strcpy(element, p); - p = 0; + p = NULL; } rc = zfs_dnode_stat(spa, &dn, &sb); if (rc) return (rc); if (!S_ISDIR(sb.st_mode)) return (ENOTDIR); parentnum = objnum; rc = zap_lookup(spa, &dn, element, sizeof (objnum), 1, &objnum); if (rc) return (rc); objnum = ZFS_DIRENT_OBJ(objnum); rc = objset_get_dnode(spa, &mount->objset, objnum, &dn); if (rc) return (rc); /* * Check for symlink. */ rc = zfs_dnode_stat(spa, &dn, &sb); if (rc) return (rc); if (S_ISLNK(sb.st_mode)) { if (symlinks_followed > 10) return (EMLINK); symlinks_followed++; /* * Read the link value and copy the tail of our * current path onto the end. */ if (p) strcpy(&path[sb.st_size], p); else path[sb.st_size] = 0; /* * Second test is purely to silence bogus compiler * warning about accessing past the end of dn_bonus. */ if (sb.st_size + sizeof(znode_phys_t) <= dn.dn_bonuslen && sizeof(znode_phys_t) <= sizeof(dn.dn_bonus)) { memcpy(path, &dn.dn_bonus[sizeof(znode_phys_t)], sb.st_size); } else { rc = dnode_read(spa, &dn, 0, path, sb.st_size); if (rc) return (rc); } /* * Restart with the new path, starting either at * the root or at the parent depending whether or * not the link is relative. */ p = path; if (*p == '/') objnum = rootnum; else objnum = parentnum; objset_get_dnode(spa, &mount->objset, objnum, &dn); } } *dnode = dn; return (0); } Index: head/sys/libkern/iconv_xlat16.c =================================================================== --- head/sys/libkern/iconv_xlat16.c (revision 314067) +++ head/sys/libkern/iconv_xlat16.c (revision 314068) @@ -1,363 +1,363 @@ /*- * Copyright (c) 2003, 2005 Ryuichiro Imura * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include "iconv_converter_if.h" /* * "XLAT16" converter */ #ifdef MODULE_DEPEND MODULE_DEPEND(iconv_xlat16, libiconv, 2, 2, 2); #endif #define C2I1(c) ((c) & 0x8000 ? ((c) & 0xff) | 0x100 : (c) & 0xff) #define C2I2(c) ((c) & 0x8000 ? ((c) >> 8) & 0x7f : ((c) >> 8) & 0xff) /* * XLAT16 converter instance */ struct iconv_xlat16 { KOBJ_FIELDS; uint32_t * d_table[0x200]; void * f_ctp; void * t_ctp; struct iconv_cspair * d_csp; }; static int iconv_xlat16_open(struct iconv_converter_class *dcp, struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp) { struct iconv_xlat16 *dp; uint32_t *headp, **idxp; int i; dp = (struct iconv_xlat16 *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK); headp = (uint32_t *)((caddr_t)csp->cp_data + sizeof(dp->d_table)); idxp = (uint32_t **)csp->cp_data; for (i = 0 ; i < 0x200 ; i++) { if (*idxp) { dp->d_table[i] = headp; headp += 0x80; } else { dp->d_table[i] = NULL; } idxp++; } if (strcmp(csp->cp_to, KICONV_WCTYPE_NAME) != 0) { if (iconv_open(KICONV_WCTYPE_NAME, csp->cp_from, &dp->f_ctp) != 0) dp->f_ctp = NULL; if (iconv_open(KICONV_WCTYPE_NAME, csp->cp_to, &dp->t_ctp) != 0) dp->t_ctp = NULL; } else { dp->f_ctp = dp->t_ctp = dp; } dp->d_csp = csp; csp->cp_refcount++; *dpp = (void*)dp; return (0); } static int iconv_xlat16_close(void *data) { struct iconv_xlat16 *dp = data; if (dp->f_ctp && dp->f_ctp != data) iconv_close(dp->f_ctp); if (dp->t_ctp && dp->t_ctp != data) iconv_close(dp->t_ctp); dp->d_csp->cp_refcount--; kobj_delete((struct kobj*)data, M_ICONV); return (0); } static int iconv_xlat16_conv(void *d2p, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, int convchar, int casetype) { struct iconv_xlat16 *dp = (struct iconv_xlat16*)d2p; const char *src; char *dst; int nullin, ret = 0; size_t in, on, ir, or, inlen; uint32_t code; u_char u, l; uint16_t c1, c2, ctmp; if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL) return (0); ir = in = *inbytesleft; or = on = *outbytesleft; src = *inbuf; dst = *outbuf; while(ir > 0 && or > 0) { inlen = 0; code = 0; c1 = ir > 1 ? *(src+1) & 0xff : 0; c2 = *src & 0xff; ctmp = 0; c1 = c2 & 0x80 ? c1 | 0x100 : c1; c2 = c2 & 0x80 ? c2 & 0x7f : c2; if (ir > 1 && dp->d_table[c1] && dp->d_table[c1][c2]) { /* * inbuf char is a double byte char */ inlen = 2; /* toupper,tolower */ if (casetype == KICONV_FROM_LOWER && dp->f_ctp) ctmp = towlower(((u_char)*src << 8) | (u_char)*(src + 1), dp->f_ctp); else if (casetype == KICONV_FROM_UPPER && dp->f_ctp) ctmp = towupper(((u_char)*src << 8) | (u_char)*(src + 1), dp->f_ctp); if (ctmp) { c1 = C2I1(ctmp); c2 = C2I2(ctmp); } } if (inlen == 0) { c1 &= 0xff00; if (!dp->d_table[c1]) { ret = -1; break; } /* * inbuf char is a single byte char */ inlen = 1; if (casetype & (KICONV_FROM_LOWER|KICONV_FROM_UPPER)) code = dp->d_table[c1][c2]; if (casetype == KICONV_FROM_LOWER) { if (dp->f_ctp) ctmp = towlower((u_char)*src, dp->f_ctp); else if (code & XLAT16_HAS_FROM_LOWER_CASE) ctmp = (u_char)(code >> 16); } else if (casetype == KICONV_FROM_UPPER) { if (dp->f_ctp) ctmp = towupper((u_char)*src, dp->f_ctp); else if (code & XLAT16_HAS_FROM_UPPER_CASE) ctmp = (u_char)(code >> 16); } if (ctmp) { c1 = C2I1(ctmp << 8); c2 = C2I2(ctmp << 8); } } code = dp->d_table[c1][c2]; if (!code) { ret = -1; break; } nullin = (code & XLAT16_ACCEPT_NULL_IN) ? 
1 : 0; if (inlen == 1 && nullin) { /* * XLAT16_ACCEPT_NULL_IN requires inbuf has 2byte */ ret = -1; break; } /* * now start translation */ u = (u_char)(code >> 8); l = (u_char)code; #ifdef XLAT16_ACCEPT_3BYTE_CHR if (code & XLAT16_IS_3BYTE_CHR) { if (or < 3) { ret = -1; break; } *dst++ = u; *dst++ = l; *dst++ = (u_char)(code >> 16); or -= 3; } else #endif if (u || code & XLAT16_ACCEPT_NULL_OUT) { if (or < 2) { ret = -1; break; } /* toupper,tolower */ if (casetype == KICONV_LOWER && dp->t_ctp) { code = towlower((uint16_t)code, dp->t_ctp); u = (u_char)(code >> 8); l = (u_char)code; } if (casetype == KICONV_UPPER && dp->t_ctp) { code = towupper((uint16_t)code, dp->t_ctp); u = (u_char)(code >> 8); l = (u_char)code; } *dst++ = u; *dst++ = l; or -= 2; } else { /* toupper,tolower */ if (casetype == KICONV_LOWER) { if (dp->t_ctp) l = (u_char)towlower(l, dp->t_ctp); else if (code & XLAT16_HAS_LOWER_CASE) l = (u_char)(code >> 16); } if (casetype == KICONV_UPPER) { if (dp->t_ctp) l = (u_char)towupper(l, dp->t_ctp); else if (code & XLAT16_HAS_UPPER_CASE) l = (u_char)(code >> 16); } *dst++ = l; or--; } if (inlen == 2) { /* * there is a case that inbuf char is a single * byte char while inlen == 2 */ - if ((u_char)*(src+1) == 0 && !nullin ) { + if ((u_char)*(src+1) == '\0' && !nullin ) { src++; ir--; } else { src += 2; ir -= 2; } } else { src++; ir--; } if (convchar == 1) break; } *inbuf += in - ir; *outbuf += on - or; *inbytesleft -= in - ir; *outbytesleft -= on - or; return (ret); } static const char * iconv_xlat16_name(struct iconv_converter_class *dcp) { return ("xlat16"); } static int iconv_xlat16_tolower(void *d2p, int c) { struct iconv_xlat16 *dp = (struct iconv_xlat16*)d2p; int c1, c2, out; if (c < 0x100) { c1 = C2I1(c << 8); c2 = C2I2(c << 8); } else if (c < 0x10000) { c1 = C2I1(c); c2 = C2I2(c); } else return (c); if (dp->d_table[c1] && dp->d_table[c1][c2] & XLAT16_HAS_LOWER_CASE) { /*return (int)(dp->d_table[c1][c2] & 0xffff);*/ out = dp->d_table[c1][c2] & 0xffff; if ((out & 0xff) == 0) out = (out >> 8) & 0xff; return (out); } else return (c); } static int iconv_xlat16_toupper(void *d2p, int c) { struct iconv_xlat16 *dp = (struct iconv_xlat16*)d2p; int c1, c2, out; if (c < 0x100) { c1 = C2I1(c << 8); c2 = C2I2(c << 8); } else if (c < 0x10000) { c1 = C2I1(c); c2 = C2I2(c); } else return (c); if (dp->d_table[c1] && dp->d_table[c1][c2] & XLAT16_HAS_UPPER_CASE) { out = dp->d_table[c1][c2] & 0xffff; if ((out & 0xff) == 0) out = (out >> 8) & 0xff; return (out); } else return (c); } static kobj_method_t iconv_xlat16_methods[] = { KOBJMETHOD(iconv_converter_open, iconv_xlat16_open), KOBJMETHOD(iconv_converter_close, iconv_xlat16_close), KOBJMETHOD(iconv_converter_conv, iconv_xlat16_conv), #if 0 KOBJMETHOD(iconv_converter_init, iconv_xlat16_init), KOBJMETHOD(iconv_converter_done, iconv_xlat16_done), #endif KOBJMETHOD(iconv_converter_name, iconv_xlat16_name), KOBJMETHOD(iconv_converter_tolower, iconv_xlat16_tolower), KOBJMETHOD(iconv_converter_toupper, iconv_xlat16_toupper), {0, 0} }; KICONV_CONVERTER(xlat16, sizeof(struct iconv_xlat16)); Index: head/sys/mips/atheros/ar531x/apb.c =================================================================== --- head/sys/mips/atheros/ar531x/apb.c (revision 314067) +++ head/sys/mips/atheros/ar531x/apb.c (revision 314068) @@ -1,756 +1,756 @@ /*- * Copyright (c) 2016, Hiroki Mori * Copyright (c) 2009, Oleksandr Tymoshenko * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_platform.h" #include "opt_ar531x.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INTRNG #include #else #include #endif #ifdef INTRNG #include "pic_if.h" #define PIC_INTR_ISRC(sc, irq) (&(sc)->pic_irqs[(irq)].isrc) #endif #include #include #include #include #ifdef AR531X_APB_DEBUG #define dprintf printf #else #define dprintf(x, arg...) #endif /* AR531X_APB_DEBUG */ static int apb_activate_resource(device_t, device_t, int, int, struct resource *); static device_t apb_add_child(device_t, u_int, const char *, int); static struct resource * apb_alloc_resource(device_t, device_t, int, int *, rman_res_t, rman_res_t, rman_res_t, u_int); static int apb_attach(device_t); static int apb_deactivate_resource(device_t, device_t, int, int, struct resource *); static struct resource_list * apb_get_resource_list(device_t, device_t); static void apb_hinted_child(device_t, const char *, int); static int apb_filter(void *); static int apb_probe(device_t); static int apb_release_resource(device_t, device_t, int, int, struct resource *); #ifndef INTRNG static int apb_setup_intr(device_t, device_t, struct resource *, int, driver_filter_t *, driver_intr_t *, void *, void **); static int apb_teardown_intr(device_t, device_t, struct resource *, void *); #endif static void apb_mask_irq(void *source) { unsigned int irq = (unsigned int)source; uint32_t reg; if(ar531x_soc >= AR531X_SOC_AR5315) { reg = ATH_READ_REG(AR5315_SYSREG_BASE + AR5315_SYSREG_MISC_INTMASK); ATH_WRITE_REG(AR5315_SYSREG_BASE + AR5315_SYSREG_MISC_INTMASK, reg & ~(1 << irq)); } else { reg = ATH_READ_REG(AR5312_SYSREG_BASE + AR5312_SYSREG_MISC_INTMASK); ATH_WRITE_REG(AR5312_SYSREG_BASE + AR5312_SYSREG_MISC_INTMASK, reg & ~(1 << irq)); } } static void apb_unmask_irq(void *source) { uint32_t reg; unsigned int irq = (unsigned int)source; if(ar531x_soc >= AR531X_SOC_AR5315) { reg = ATH_READ_REG(AR5315_SYSREG_BASE + AR5315_SYSREG_MISC_INTMASK); ATH_WRITE_REG(AR5315_SYSREG_BASE + AR5315_SYSREG_MISC_INTMASK, reg | (1 << irq)); } else { reg = ATH_READ_REG(AR5312_SYSREG_BASE + AR5312_SYSREG_MISC_INTMASK); ATH_WRITE_REG(AR5312_SYSREG_BASE + AR5312_SYSREG_MISC_INTMASK, reg | (1 << irq)); } } 
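[Editor's note: apb_mask_irq() and apb_unmask_irq() above differ only in whether the per-IRQ bit is cleared or set in the misc interrupt mask register. The sketch below is illustrative only and is not part of this commit; apb_set_irq_mask_bit() is a hypothetical name, while the SoC test, the register constants, and the ATH_READ_REG/ATH_WRITE_REG accessors are the ones already used by this driver.]

/*
 * Hypothetical helper (editor's sketch, not in r314068): the shared
 * read-modify-write idiom of apb_mask_irq()/apb_unmask_irq(), factored
 * into one routine for illustration.
 */
static void
apb_set_irq_mask_bit(unsigned int irq, int enable)
{
	uintptr_t maskreg;
	uint32_t reg;

	/* Select the misc interrupt mask register for this SoC generation. */
	if (ar531x_soc >= AR531X_SOC_AR5315)
		maskreg = AR5315_SYSREG_BASE + AR5315_SYSREG_MISC_INTMASK;
	else
		maskreg = AR5312_SYSREG_BASE + AR5312_SYSREG_MISC_INTMASK;

	/* Set or clear only the requested bit; leave the other lines alone. */
	reg = ATH_READ_REG(maskreg);
	if (enable)
		reg |= (1 << irq);
	else
		reg &= ~(1 << irq);
	ATH_WRITE_REG(maskreg, reg);
}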
#ifdef INTRNG static int apb_pic_register_isrcs(struct apb_softc *sc) { int error; uint32_t irq; struct intr_irqsrc *isrc; const char *name; name = device_get_nameunit(sc->apb_dev); for (irq = 0; irq < APB_NIRQS; irq++) { sc->pic_irqs[irq].irq = irq; isrc = PIC_INTR_ISRC(sc, irq); error = intr_isrc_register(isrc, sc->apb_dev, 0, "%s", name); if (error != 0) { /* XXX call intr_isrc_deregister */ device_printf(sc->apb_dev, "%s failed", __func__); return (error); } } return (0); } static inline intptr_t pic_xref(device_t dev) { return (0); } #endif static int apb_probe(device_t dev) { #ifdef INTRNG device_set_desc(dev, "APB Bus bridge INTRNG"); #else device_set_desc(dev, "APB Bus bridge"); #endif return (0); } static int apb_attach(device_t dev) { struct apb_softc *sc = device_get_softc(dev); #ifdef INTRNG intptr_t xref = pic_xref(dev); int miscirq; #else int rid = 0; #endif sc->apb_dev = dev; sc->apb_mem_rman.rm_type = RMAN_ARRAY; sc->apb_mem_rman.rm_descr = "APB memory window"; if(ar531x_soc >= AR531X_SOC_AR5315) { if (rman_init(&sc->apb_mem_rman) != 0 || rman_manage_region(&sc->apb_mem_rman, AR5315_APB_BASE, AR5315_APB_BASE + AR5315_APB_SIZE - 1) != 0) panic("apb_attach: failed to set up memory rman"); } else { if (rman_init(&sc->apb_mem_rman) != 0 || rman_manage_region(&sc->apb_mem_rman, AR5312_APB_BASE, AR5312_APB_BASE + AR5312_APB_SIZE - 1) != 0) panic("apb_attach: failed to set up memory rman"); } sc->apb_irq_rman.rm_type = RMAN_ARRAY; sc->apb_irq_rman.rm_descr = "APB IRQ"; if (rman_init(&sc->apb_irq_rman) != 0 || rman_manage_region(&sc->apb_irq_rman, APB_IRQ_BASE, APB_IRQ_END) != 0) panic("apb_attach: failed to set up IRQ rman"); #ifndef INTRNG if ((sc->sc_misc_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE)) == NULL) { device_printf(dev, "unable to allocate IRQ resource\n"); return (ENXIO); } if ((bus_setup_intr(dev, sc->sc_misc_irq, INTR_TYPE_MISC, apb_filter, NULL, sc, &sc->sc_misc_ih))) { device_printf(dev, "WARNING: unable to register interrupt handler\n"); return (ENXIO); } #else /* Register the interrupts */ if (apb_pic_register_isrcs(sc) != 0) { device_printf(dev, "could not register PIC ISRCs\n"); return (ENXIO); } /* * Now, when everything is initialized, it's right time to * register interrupt controller to interrupt framefork. 
*/ if (intr_pic_register(dev, xref) == NULL) { device_printf(dev, "could not register PIC\n"); return (ENXIO); } if(ar531x_soc >= AR531X_SOC_AR5315) { miscirq = AR5315_CPU_IRQ_MISC; } else { miscirq = AR5312_IRQ_MISC; } cpu_establish_hardintr("aric", apb_filter, NULL, sc, miscirq, INTR_TYPE_MISC, NULL); #endif /* mask all misc interrupt */ if(ar531x_soc >= AR531X_SOC_AR5315) { ATH_WRITE_REG(AR5315_SYSREG_BASE + AR5315_SYSREG_MISC_INTMASK, 0); } else { ATH_WRITE_REG(AR5312_SYSREG_BASE + AR5312_SYSREG_MISC_INTMASK, 0); } bus_generic_probe(dev); bus_enumerate_hinted_children(dev); bus_generic_attach(dev); return (0); } static struct resource * apb_alloc_resource(device_t bus, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct apb_softc *sc = device_get_softc(bus); struct apb_ivar *ivar = device_get_ivars(child); struct resource *rv; struct resource_list_entry *rle; struct rman *rm; int isdefault, needactivate, passthrough; isdefault = (RMAN_IS_DEFAULT_RANGE(start, end)); needactivate = flags & RF_ACTIVE; /* * Pass memory requests to nexus device */ passthrough = (device_get_parent(child) != bus); rle = NULL; dprintf("%s: entry (%p, %p, %d, %d, %p, %p, %jd, %d)\n", __func__, bus, child, type, *rid, (void *)(intptr_t)start, (void *)(intptr_t)end, count, flags); if (passthrough) return (BUS_ALLOC_RESOURCE(device_get_parent(bus), child, type, rid, start, end, count, flags)); /* * If this is an allocation of the "default" range for a given RID, * and we know what the resources for this device are (ie. they aren't * maintained by a child bus), then work out the start/end values. */ if (isdefault) { rle = resource_list_find(&ivar->resources, type, *rid); if (rle == NULL) { return (NULL); } if (rle->res != NULL) { panic("%s: resource entry is busy", __func__); } start = rle->start; end = rle->end; count = rle->count; dprintf("%s: default resource (%p, %p, %jd)\n", __func__, (void *)(intptr_t)start, (void *)(intptr_t)end, count); } switch (type) { case SYS_RES_IRQ: rm = &sc->apb_irq_rman; break; case SYS_RES_MEMORY: rm = &sc->apb_mem_rman; break; default: printf("%s: unknown resource type %d\n", __func__, type); return (0); } rv = rman_reserve_resource(rm, start, end, count, flags, child); - if (rv == 0) { + if (rv == NULL) { printf("%s: could not reserve resource %d\n", __func__, type); return (0); } rman_set_rid(rv, *rid); if (needactivate) { if (bus_activate_resource(child, type, *rid, rv)) { printf("%s: could not activate resource\n", __func__); rman_release_resource(rv); return (0); } } return (rv); } static int apb_activate_resource(device_t bus, device_t child, int type, int rid, struct resource *r) { /* XXX: should we mask/unmask IRQ here? */ return (BUS_ACTIVATE_RESOURCE(device_get_parent(bus), child, type, rid, r)); } static int apb_deactivate_resource(device_t bus, device_t child, int type, int rid, struct resource *r) { /* XXX: should we mask/unmask IRQ here? 
*/ return (BUS_DEACTIVATE_RESOURCE(device_get_parent(bus), child, type, rid, r)); } static int apb_release_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { struct resource_list *rl; struct resource_list_entry *rle; rl = apb_get_resource_list(dev, child); if (rl == NULL) return (EINVAL); rle = resource_list_find(rl, type, rid); if (rle == NULL) return (EINVAL); rman_release_resource(r); rle->res = NULL; return (0); } static int apb_setup_intr(device_t bus, device_t child, struct resource *ires, int flags, driver_filter_t *filt, driver_intr_t *handler, void *arg, void **cookiep) { struct apb_softc *sc = device_get_softc(bus); int error; int irq; #ifndef INTRNG struct intr_event *event; #endif #ifdef INTRNG struct intr_irqsrc *isrc; const char *name; if ((rman_get_flags(ires) & RF_SHAREABLE) == 0) flags |= INTR_EXCL; irq = rman_get_start(ires); isrc = PIC_INTR_ISRC(sc, irq); if(isrc->isrc_event == 0) { error = intr_event_create(&isrc->isrc_event, (void *)irq, 0, irq, apb_mask_irq, apb_unmask_irq, NULL, NULL, "apb intr%d:", irq); if(error != 0) return(error); } name = device_get_nameunit(child); error = intr_event_add_handler(isrc->isrc_event, name, filt, handler, arg, intr_priority(flags), flags, cookiep); return(error); #else irq = rman_get_start(ires); if (irq > APB_IRQ_END) panic("%s: bad irq %d", __func__, irq); event = sc->sc_eventstab[irq]; if (event == NULL) { error = intr_event_create(&event, (void *)irq, 0, irq, apb_mask_irq, apb_unmask_irq, NULL, NULL, "apb intr%d:", irq); if (error == 0) { sc->sc_eventstab[irq] = event; sc->sc_intr_counter[irq] = mips_intrcnt_create(event->ie_name); } else return (error); } intr_event_add_handler(event, device_get_nameunit(child), filt, handler, arg, intr_priority(flags), flags, cookiep); mips_intrcnt_setname(sc->sc_intr_counter[irq], event->ie_fullname); apb_unmask_irq((void*)irq); return (0); #endif } #ifndef INTRNG static int apb_teardown_intr(device_t dev, device_t child, struct resource *ires, void *cookie) { #ifdef INTRNG return (intr_teardown_irq(child, ires, cookie)); #else struct apb_softc *sc = device_get_softc(dev); int irq, result; irq = rman_get_start(ires); if (irq > APB_IRQ_END) panic("%s: bad irq %d", __func__, irq); if (sc->sc_eventstab[irq] == NULL) panic("Trying to teardown unoccupied IRQ"); apb_mask_irq((void*)irq); result = intr_event_remove_handler(cookie); if (!result) sc->sc_eventstab[irq] = NULL; return (result); #endif } static int apb_filter(void *arg) { struct apb_softc *sc = arg; struct intr_event *event; uint32_t reg, irq; if(ar531x_soc >= AR531X_SOC_AR5315) reg = ATH_READ_REG(AR5315_SYSREG_BASE + AR5315_SYSREG_MISC_INTSTAT); else reg = ATH_READ_REG(AR5312_SYSREG_BASE + AR5312_SYSREG_MISC_INTSTAT); for (irq = 0; irq < APB_NIRQS; irq++) { if (reg & (1 << irq)) { if(ar531x_soc >= AR531X_SOC_AR5315) { ATH_WRITE_REG(AR5315_SYSREG_BASE + AR5315_SYSREG_MISC_INTSTAT, reg & ~(1 << irq)); } else { ATH_WRITE_REG(AR5312_SYSREG_BASE + AR5312_SYSREG_MISC_INTSTAT, reg & ~(1 << irq)); } event = sc->sc_eventstab[irq]; if (!event || TAILQ_EMPTY(&event->ie_handlers)) { if(irq == 1 && ar531x_soc < AR531X_SOC_AR5315) { ATH_READ_REG(AR5312_SYSREG_BASE + AR5312_SYSREG_AHBPERR); ATH_READ_REG(AR5312_SYSREG_BASE + AR5312_SYSREG_AHBDMAE); } /* Ignore non handle interrupts */ if (irq != 0 && irq != 6) printf("Stray APB IRQ %d\n", irq); continue; } intr_event_handle(event, PCPU_GET(curthread)->td_intr_frame); mips_intrcnt_inc(sc->sc_intr_counter[irq]); } } return (FILTER_HANDLED); } #else static int apb_filter(void 
*arg) { struct apb_softc *sc = arg; struct thread *td; uint32_t i, intr; td = curthread; /* Workaround: do not inflate intr nesting level */ td->td_intr_nesting_level--; if(ar531x_soc >= AR531X_SOC_AR5315) intr = ATH_READ_REG(AR5315_SYSREG_BASE + AR5315_SYSREG_MISC_INTSTAT); else intr = ATH_READ_REG(AR5312_SYSREG_BASE + AR5312_SYSREG_MISC_INTSTAT); while ((i = fls(intr)) != 0) { i--; intr &= ~(1u << i); if(i == 1 && ar531x_soc < AR531X_SOC_AR5315) { ATH_READ_REG(AR5312_SYSREG_BASE + AR5312_SYSREG_AHBPERR); ATH_READ_REG(AR5312_SYSREG_BASE + AR5312_SYSREG_AHBDMAE); } if (intr_isrc_dispatch(PIC_INTR_ISRC(sc, i), curthread->td_intr_frame) != 0) { device_printf(sc->apb_dev, "Stray interrupt %u detected\n", i); apb_mask_irq((void*)i); continue; } } KASSERT(i == 0, ("all interrupts handled")); td->td_intr_nesting_level++; return (FILTER_HANDLED); } #endif static void apb_hinted_child(device_t bus, const char *dname, int dunit) { device_t child; long maddr; int msize; int irq; int result; int mem_hints_count; child = BUS_ADD_CHILD(bus, 0, dname, dunit); /* * Set hard-wired resources for hinted child using * specific RIDs. */ mem_hints_count = 0; if (resource_long_value(dname, dunit, "maddr", &maddr) == 0) mem_hints_count++; if (resource_int_value(dname, dunit, "msize", &msize) == 0) mem_hints_count++; /* check if all info for mem resource has been provided */ if ((mem_hints_count > 0) && (mem_hints_count < 2)) { printf("Either maddr or msize hint is missing for %s%d\n", dname, dunit); } else if (mem_hints_count) { result = bus_set_resource(child, SYS_RES_MEMORY, 0, maddr, msize); if (result != 0) device_printf(bus, "warning: bus_set_resource() failed\n"); } if (resource_int_value(dname, dunit, "irq", &irq) == 0) { result = bus_set_resource(child, SYS_RES_IRQ, 0, irq, 1); if (result != 0) device_printf(bus, "warning: bus_set_resource() failed\n"); } } static device_t apb_add_child(device_t bus, u_int order, const char *name, int unit) { device_t child; struct apb_ivar *ivar; ivar = malloc(sizeof(struct apb_ivar), M_DEVBUF, M_WAITOK | M_ZERO); if (ivar == NULL) { printf("Failed to allocate ivar\n"); return (0); } resource_list_init(&ivar->resources); child = device_add_child_ordered(bus, order, name, unit); if (child == NULL) { printf("Can't add child %s%d ordered\n", name, unit); return (0); } device_set_ivars(child, ivar); return (child); } /* * Helper routine for bus_generic_rl_get_resource/bus_generic_rl_set_resource * Provides pointer to resource_list for these routines */ static struct resource_list * apb_get_resource_list(device_t dev, device_t child) { struct apb_ivar *ivar; ivar = device_get_ivars(child); return (&(ivar->resources)); } #ifdef INTRNG static void apb_pic_enable_intr(device_t dev, struct intr_irqsrc *isrc) { u_int irq; irq = ((struct apb_pic_irqsrc *)isrc)->irq; apb_unmask_irq((void*)irq); } static void apb_pic_disable_intr(device_t dev, struct intr_irqsrc *isrc) { u_int irq; irq = ((struct apb_pic_irqsrc *)isrc)->irq; apb_mask_irq((void*)irq); } static void apb_pic_pre_ithread(device_t dev, struct intr_irqsrc *isrc) { apb_pic_disable_intr(dev, isrc); } static void apb_pic_post_ithread(device_t dev, struct intr_irqsrc *isrc) { apb_pic_enable_intr(dev, isrc); } static void apb_pic_post_filter(device_t dev, struct intr_irqsrc *isrc) { uint32_t reg, irq; irq = ((struct apb_pic_irqsrc *)isrc)->irq; if(ar531x_soc >= AR531X_SOC_AR5315) { reg = ATH_READ_REG(AR5315_SYSREG_BASE + AR5315_SYSREG_MISC_INTSTAT); ATH_WRITE_REG(AR5315_SYSREG_BASE + AR5315_SYSREG_MISC_INTSTAT, reg & ~(1 << 
irq)); } else { reg = ATH_READ_REG(AR5312_SYSREG_BASE + AR5312_SYSREG_MISC_INTSTAT); ATH_WRITE_REG(AR5312_SYSREG_BASE + AR5312_SYSREG_MISC_INTSTAT, reg & ~(1 << irq)); } } static int apb_pic_map_intr(device_t dev, struct intr_map_data *data, struct intr_irqsrc **isrcp) { return (ENOTSUP); } #endif static device_method_t apb_methods[] = { DEVMETHOD(bus_activate_resource, apb_activate_resource), DEVMETHOD(bus_add_child, apb_add_child), DEVMETHOD(bus_alloc_resource, apb_alloc_resource), DEVMETHOD(bus_deactivate_resource, apb_deactivate_resource), DEVMETHOD(bus_get_resource_list, apb_get_resource_list), DEVMETHOD(bus_hinted_child, apb_hinted_child), DEVMETHOD(bus_release_resource, apb_release_resource), DEVMETHOD(device_attach, apb_attach), DEVMETHOD(device_probe, apb_probe), DEVMETHOD(bus_get_resource, bus_generic_rl_get_resource), DEVMETHOD(bus_set_resource, bus_generic_rl_set_resource), #ifdef INTRNG DEVMETHOD(pic_disable_intr, apb_pic_disable_intr), DEVMETHOD(pic_enable_intr, apb_pic_enable_intr), DEVMETHOD(pic_map_intr, apb_pic_map_intr), DEVMETHOD(pic_post_filter, apb_pic_post_filter), DEVMETHOD(pic_post_ithread, apb_pic_post_ithread), DEVMETHOD(pic_pre_ithread, apb_pic_pre_ithread), // DEVMETHOD(bus_setup_intr, bus_generic_setup_intr), #else DEVMETHOD(bus_teardown_intr, apb_teardown_intr), #endif DEVMETHOD(bus_setup_intr, apb_setup_intr), DEVMETHOD_END }; static driver_t apb_driver = { "apb", apb_methods, sizeof(struct apb_softc), }; static devclass_t apb_devclass; EARLY_DRIVER_MODULE(apb, nexus, apb_driver, apb_devclass, 0, 0, BUS_PASS_INTERRUPT + BUS_PASS_ORDER_MIDDLE); Index: head/sys/net/if_fddisubr.c =================================================================== --- head/sys/net/if_fddisubr.c (revision 314067) +++ head/sys/net/if_fddisubr.c (revision 314068) @@ -1,670 +1,670 @@ /*- * Copyright (c) 1995, 1996 * Matt Thomas . All rights reserved. * Copyright (c) 1982, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: if_ethersubr.c,v 1.5 1994/12/13 22:31:45 wollman Exp * $FreeBSD$ */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #endif #ifdef INET6 #include #endif #ifdef DECNET #include #endif #include static const u_char fddibroadcastaddr[FDDI_ADDR_LEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; static int fddi_resolvemulti(struct ifnet *, struct sockaddr **, struct sockaddr *); static int fddi_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static void fddi_input(struct ifnet *ifp, struct mbuf *m); #define senderr(e) do { error = (e); goto bad; } while (0) /* * FDDI output routine. * Encapsulate a packet of type family for the local net. * Use trailer local net encapsulation if enough data in first * packet leaves a multiple of 512 bytes of data in remainder. */ static int fddi_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { u_int16_t type; int loop_copy = 0, error = 0, hdrcmplt = 0; u_char esrc[FDDI_ADDR_LEN], edst[FDDI_ADDR_LEN]; struct fddi_header *fh; #if defined(INET) || defined(INET6) int is_gw = 0; #endif #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) senderr(error); #endif if (ifp->if_flags & IFF_MONITOR) senderr(ENETDOWN); if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) senderr(ENETDOWN); getmicrotime(&ifp->if_lastchange); #if defined(INET) || defined(INET6) if (ro != NULL) is_gw = (ro->ro_flags & RT_HAS_GW) != 0; #endif switch (dst->sa_family) { #ifdef INET case AF_INET: { error = arpresolve(ifp, is_gw, m, dst, edst, NULL, NULL); if (error) return (error == EWOULDBLOCK ? 0 : error); type = htons(ETHERTYPE_IP); break; } case AF_ARP: { struct arphdr *ah; ah = mtod(m, struct arphdr *); ah->ar_hrd = htons(ARPHRD_ETHER); loop_copy = -1; /* if this is for us, don't do it */ switch (ntohs(ah->ar_op)) { case ARPOP_REVREQUEST: case ARPOP_REVREPLY: type = htons(ETHERTYPE_REVARP); break; case ARPOP_REQUEST: case ARPOP_REPLY: default: type = htons(ETHERTYPE_ARP); break; } if (m->m_flags & M_BCAST) bcopy(ifp->if_broadcastaddr, edst, FDDI_ADDR_LEN); else bcopy(ar_tha(ah), edst, FDDI_ADDR_LEN); } break; #endif /* INET */ #ifdef INET6 case AF_INET6: error = nd6_resolve(ifp, is_gw, m, dst, edst, NULL, NULL); if (error) return (error == EWOULDBLOCK ? 
0 : error); type = htons(ETHERTYPE_IPV6); break; #endif /* INET6 */ case pseudo_AF_HDRCMPLT: { const struct ether_header *eh; hdrcmplt = 1; eh = (const struct ether_header *)dst->sa_data; bcopy(eh->ether_shost, esrc, FDDI_ADDR_LEN); /* FALLTHROUGH */ } case AF_UNSPEC: { const struct ether_header *eh; loop_copy = -1; eh = (const struct ether_header *)dst->sa_data; bcopy(eh->ether_dhost, edst, FDDI_ADDR_LEN); if (*edst & 1) m->m_flags |= (M_BCAST|M_MCAST); type = eh->ether_type; break; } case AF_IMPLINK: { fh = mtod(m, struct fddi_header *); error = EPROTONOSUPPORT; switch (fh->fddi_fc & (FDDIFC_C|FDDIFC_L|FDDIFC_F)) { case FDDIFC_LLC_ASYNC: { /* legal priorities are 0 through 7 */ if ((fh->fddi_fc & FDDIFC_Z) > 7) goto bad; break; } case FDDIFC_LLC_SYNC: { /* FDDIFC_Z bits reserved, must be zero */ if (fh->fddi_fc & FDDIFC_Z) goto bad; break; } case FDDIFC_SMT: { /* FDDIFC_Z bits must be non zero */ if ((fh->fddi_fc & FDDIFC_Z) == 0) goto bad; break; } default: { /* anything else is too dangerous */ goto bad; } } error = 0; if (fh->fddi_dhost[0] & 1) m->m_flags |= (M_BCAST|M_MCAST); goto queue_it; } default: if_printf(ifp, "can't handle af%d\n", dst->sa_family); senderr(EAFNOSUPPORT); } /* * Add LLC header. */ if (type != 0) { struct llc *l; M_PREPEND(m, LLC_SNAPFRAMELEN, M_NOWAIT); if (m == NULL) senderr(ENOBUFS); l = mtod(m, struct llc *); l->llc_control = LLC_UI; l->llc_dsap = l->llc_ssap = LLC_SNAP_LSAP; l->llc_snap.org_code[0] = l->llc_snap.org_code[1] = l->llc_snap.org_code[2] = 0; l->llc_snap.ether_type = htons(type); } /* * Add local net header. If no space in first mbuf, * allocate another. */ M_PREPEND(m, FDDI_HDR_LEN, M_NOWAIT); if (m == NULL) senderr(ENOBUFS); fh = mtod(m, struct fddi_header *); fh->fddi_fc = FDDIFC_LLC_ASYNC|FDDIFC_LLC_PRIO4; bcopy((caddr_t)edst, (caddr_t)fh->fddi_dhost, FDDI_ADDR_LEN); queue_it: if (hdrcmplt) bcopy((caddr_t)esrc, (caddr_t)fh->fddi_shost, FDDI_ADDR_LEN); else bcopy(IF_LLADDR(ifp), (caddr_t)fh->fddi_shost, FDDI_ADDR_LEN); /* * If a simplex interface, and the packet is being sent to our * Ethernet address or a broadcast address, loopback a copy. * XXX To make a simplex device behave exactly like a duplex * device, we should copy in the case of sending to our own * ethernet address (thus letting the original actually appear * on the wire). However, we don't do that here for security * reasons and compatibility with the original behavior. */ if ((ifp->if_flags & IFF_SIMPLEX) && (loop_copy != -1)) { if ((m->m_flags & M_BCAST) || (loop_copy > 0)) { struct mbuf *n; n = m_copym(m, 0, M_COPYALL, M_NOWAIT); (void) if_simloop(ifp, n, dst->sa_family, FDDI_HDR_LEN); } else if (bcmp(fh->fddi_dhost, fh->fddi_shost, FDDI_ADDR_LEN) == 0) { (void) if_simloop(ifp, m, dst->sa_family, FDDI_HDR_LEN); return (0); /* XXX */ } } error = (ifp->if_transmit)(ifp, m); if (error) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (error); bad: if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); if (m) m_freem(m); return (error); } /* * Process a received FDDI packet. */ static void fddi_input(ifp, m) struct ifnet *ifp; struct mbuf *m; { int isr; struct llc *l; struct fddi_header *fh; /* * Do consistency checks to verify assumptions * made by code past this point. 
*/ if ((m->m_flags & M_PKTHDR) == 0) { if_printf(ifp, "discard frame w/o packet header\n"); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } if (m->m_pkthdr.rcvif == NULL) { if_printf(ifp, "discard frame w/o interface pointer\n"); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } m = m_pullup(m, FDDI_HDR_LEN); if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); goto dropanyway; } fh = mtod(m, struct fddi_header *); /* * Discard packet if interface is not up. */ if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) goto dropanyway; /* * Give bpf a chance at the packet. */ BPF_MTAP(ifp, m); /* * Interface marked for monitoring; discard packet. */ if (ifp->if_flags & IFF_MONITOR) { m_freem(m); return; } #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif /* * Update interface statistics. */ if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); getmicrotime(&ifp->if_lastchange); /* * Discard non local unicast packets when interface * is in promiscuous mode. */ if ((ifp->if_flags & IFF_PROMISC) && ((fh->fddi_dhost[0] & 1) == 0) && (bcmp(IF_LLADDR(ifp), (caddr_t)fh->fddi_dhost, FDDI_ADDR_LEN) != 0)) goto dropanyway; /* * Set mbuf flags for bcast/mcast. */ if (fh->fddi_dhost[0] & 1) { if (bcmp(ifp->if_broadcastaddr, fh->fddi_dhost, FDDI_ADDR_LEN) == 0) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } #ifdef M_LINK0 /* * If this has a LLC priority of 0, then mark it so upper * layers have a hint that it really came via a FDDI/Ethernet * bridge. */ if ((fh->fddi_fc & FDDIFC_LLC_PRIO7) == FDDIFC_LLC_PRIO0) m->m_flags |= M_LINK0; #endif /* Strip off FDDI header. */ m_adj(m, FDDI_HDR_LEN); m = m_pullup(m, LLC_SNAPFRAMELEN); - if (m == 0) { + if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); goto dropanyway; } l = mtod(m, struct llc *); switch (l->llc_dsap) { case LLC_SNAP_LSAP: { u_int16_t type; if ((l->llc_control != LLC_UI) || (l->llc_ssap != LLC_SNAP_LSAP)) { if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; } if (l->llc_snap.org_code[0] != 0 || l->llc_snap.org_code[1] != 0 || l->llc_snap.org_code[2] != 0) { if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; } type = ntohs(l->llc_snap.ether_type); m_adj(m, LLC_SNAPFRAMELEN); switch (type) { #ifdef INET case ETHERTYPE_IP: isr = NETISR_IP; break; case ETHERTYPE_ARP: if (ifp->if_flags & IFF_NOARP) goto dropanyway; isr = NETISR_ARP; break; #endif #ifdef INET6 case ETHERTYPE_IPV6: isr = NETISR_IPV6; break; #endif #ifdef DECNET case ETHERTYPE_DECNET: isr = NETISR_DECNET; break; #endif default: /* printf("fddi_input: unknown protocol 0x%x\n", type); */ if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; } break; } default: /* printf("fddi_input: unknown dsap 0x%x\n", l->llc_dsap); */ if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; } M_SETFIB(m, ifp->if_fib); netisr_dispatch(isr, m); return; dropanyway: if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); if (m) m_freem(m); return; } /* * Perform common duties while attaching to interface list */ void fddi_ifattach(ifp, lla, bpf) struct ifnet *ifp; const u_int8_t *lla; int bpf; { struct ifaddr *ifa; struct sockaddr_dl *sdl; ifp->if_type = IFT_FDDI; ifp->if_addrlen = FDDI_ADDR_LEN; ifp->if_hdrlen = 21; if_attach(ifp); /* Must be called before additional assignments */ ifp->if_mtu = FDDIMTU; ifp->if_output = fddi_output; ifp->if_input = fddi_input; ifp->if_resolvemulti = fddi_resolvemulti; ifp->if_broadcastaddr = fddibroadcastaddr; ifp->if_baudrate = 100000000; 
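	/*
	 * Editor's note (comment added for exposition, not present in the
	 * original source): the remainder of this routine stores the
	 * caller-supplied link-level address in the interface's AF_LINK
	 * ifaddr (sockaddr_dl) and, if requested, attaches a BPF tap with
	 * DLT_FDDI so captured frames include the FDDI header.
	 */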
#ifdef IFF_NOTRAILERS ifp->if_flags |= IFF_NOTRAILERS; #endif ifa = ifp->if_addr; KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_type = IFT_FDDI; sdl->sdl_alen = ifp->if_addrlen; bcopy(lla, LLADDR(sdl), ifp->if_addrlen); if (bpf) bpfattach(ifp, DLT_FDDI, FDDI_HDR_LEN); return; } void fddi_ifdetach(ifp, bpf) struct ifnet *ifp; int bpf; { if (bpf) bpfdetach(ifp); if_detach(ifp); return; } int fddi_ioctl (ifp, command, data) struct ifnet *ifp; u_long command; caddr_t data; { struct ifaddr *ifa; struct ifreq *ifr; int error; ifa = (struct ifaddr *) data; ifr = (struct ifreq *) data; error = 0; switch (command) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: /* before arpwhohas */ ifp->if_init(ifp->if_softc); arp_ifinit(ifp, ifa); break; #endif default: ifp->if_init(ifp->if_softc); break; } break; case SIOCGIFADDR: { struct sockaddr *sa; sa = (struct sockaddr *) & ifr->ifr_data; bcopy(IF_LLADDR(ifp), (caddr_t) sa->sa_data, FDDI_ADDR_LEN); } break; case SIOCSIFMTU: /* * Set the interface MTU. */ if (ifr->ifr_mtu > FDDIMTU) { error = EINVAL; } else { ifp->if_mtu = ifr->ifr_mtu; } break; default: error = EINVAL; break; } return (error); } static int fddi_resolvemulti(ifp, llsa, sa) struct ifnet *ifp; struct sockaddr **llsa; struct sockaddr *sa; { struct sockaddr_dl *sdl; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif u_char *e_addr; switch(sa->sa_family) { case AF_LINK: /* * No mapping needed. Just check that it's a valid MC address. */ sdl = (struct sockaddr_dl *)sa; e_addr = LLADDR(sdl); if ((e_addr[0] & 1) != 1) return (EADDRNOTAVAIL); *llsa = NULL; return (0); #ifdef INET case AF_INET: sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return (EADDRNOTAVAIL); sdl = link_init_sdl(ifp, *llsa, IFT_FDDI); sdl->sdl_nlen = 0; sdl->sdl_alen = FDDI_ADDR_LEN; sdl->sdl_slen = 0; e_addr = LLADDR(sdl); ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr); *llsa = (struct sockaddr *)sdl; return (0); #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* * An IP6 address of 0 means listen to all * of the Ethernet multicast address used for IP6. * (This is used for multicast routers.) */ ifp->if_flags |= IFF_ALLMULTI; *llsa = NULL; return (0); } if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return (EADDRNOTAVAIL); sdl = link_init_sdl(ifp, *llsa, IFT_FDDI); sdl->sdl_nlen = 0; sdl->sdl_alen = FDDI_ADDR_LEN; sdl->sdl_slen = 0; e_addr = LLADDR(sdl); ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr); *llsa = (struct sockaddr *)sdl; return (0); #endif default: /* * Well, the text isn't quite right, but it's the name * that counts... */ return (EAFNOSUPPORT); } return (0); } static moduledata_t fddi_mod = { "fddi", /* module name */ NULL, /* event handler */ 0 /* extra data */ }; DECLARE_MODULE(fddi, fddi_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(fddi, 1); Index: head/sys/net/if_iso88025subr.c =================================================================== --- head/sys/net/if_iso88025subr.c (revision 314067) +++ head/sys/net/if_iso88025subr.c (revision 314068) @@ -1,697 +1,697 @@ /*- * Copyright (c) 1998, Larry Lile * All rights reserved. * * For latest sources and information on this driver, please * go to http://anarchy.stdio.com. * * Questions, comments or suggestions should be directed to * Larry Lile . 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * */ /* * * General ISO 802.5 (Token Ring) support routines * */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #endif #ifdef INET6 #include #endif #include static const u_char iso88025_broadcastaddr[ISO88025_ADDR_LEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; static int iso88025_resolvemulti (struct ifnet *, struct sockaddr **, struct sockaddr *); #define senderr(e) do { error = (e); goto bad; } while (0) /* * Perform common duties while attaching to interface list */ void iso88025_ifattach(struct ifnet *ifp, const u_int8_t *lla, int bpf) { struct ifaddr *ifa; struct sockaddr_dl *sdl; ifa = NULL; ifp->if_type = IFT_ISO88025; ifp->if_addrlen = ISO88025_ADDR_LEN; ifp->if_hdrlen = ISO88025_HDR_LEN; if_attach(ifp); /* Must be called before additional assignments */ ifp->if_output = iso88025_output; ifp->if_input = iso88025_input; ifp->if_resolvemulti = iso88025_resolvemulti; ifp->if_broadcastaddr = iso88025_broadcastaddr; if (ifp->if_baudrate == 0) ifp->if_baudrate = TR_16MBPS; /* 16Mbit should be a safe default */ if (ifp->if_mtu == 0) ifp->if_mtu = ISO88025_DEFAULT_MTU; ifa = ifp->if_addr; KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_type = IFT_ISO88025; sdl->sdl_alen = ifp->if_addrlen; bcopy(lla, LLADDR(sdl), ifp->if_addrlen); if (bpf) bpfattach(ifp, DLT_IEEE802, ISO88025_HDR_LEN); return; } /* * Perform common duties while detaching a Token Ring interface */ void iso88025_ifdetach(ifp, bpf) struct ifnet *ifp; int bpf; { if (bpf) bpfdetach(ifp); if_detach(ifp); return; } int iso88025_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ifaddr *ifa; struct ifreq *ifr; int error; ifa = (struct ifaddr *) data; ifr = (struct ifreq *) data; error = 0; switch (command) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifp->if_init(ifp->if_softc); /* before arpwhohas */ arp_ifinit(ifp, ifa); break; #endif /* INET */ default: ifp->if_init(ifp->if_softc); break; } break; case 
SIOCGIFADDR: { struct sockaddr *sa; sa = (struct sockaddr *) & ifr->ifr_data; bcopy(IF_LLADDR(ifp), (caddr_t) sa->sa_data, ISO88025_ADDR_LEN); } break; case SIOCSIFMTU: /* * Set the interface MTU. */ if (ifr->ifr_mtu > ISO88025_MAX_MTU) { error = EINVAL; } else { ifp->if_mtu = ifr->ifr_mtu; } break; default: error = EINVAL; /* XXX netbsd has ENOTTY??? */ break; } return (error); } /* * ISO88025 encapsulation */ int iso88025_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { u_int16_t snap_type = 0; int loop_copy = 0, error = 0, rif_len = 0; u_char edst[ISO88025_ADDR_LEN]; struct iso88025_header *th; struct iso88025_header gen_th; struct sockaddr_dl *sdl = NULL; struct rtentry *rt0 = NULL; int is_gw = 0; if (ro != NULL) is_gw = (ro->ro_flags & RT_HAS_GW) != 0; #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) senderr(error); #endif if (ifp->if_flags & IFF_MONITOR) senderr(ENETDOWN); if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) senderr(ENETDOWN); getmicrotime(&ifp->if_lastchange); /* Calculate routing info length based on arp table entry */ /* XXX any better way to do this ? */ if (rt0 && (sdl = (struct sockaddr_dl *)rt0->rt_gateway)) if (SDL_ISO88025(sdl)->trld_rcf != 0) rif_len = TR_RCF_RIFLEN(SDL_ISO88025(sdl)->trld_rcf); /* Generate a generic 802.5 header for the packet */ gen_th.ac = TR_AC; gen_th.fc = TR_LLC_FRAME; (void)memcpy((caddr_t)gen_th.iso88025_shost, IF_LLADDR(ifp), ISO88025_ADDR_LEN); if (rif_len) { gen_th.iso88025_shost[0] |= TR_RII; if (rif_len > 2) { gen_th.rcf = SDL_ISO88025(sdl)->trld_rcf; (void)memcpy((caddr_t)gen_th.rd, (caddr_t)SDL_ISO88025(sdl)->trld_route, rif_len - 2); } } switch (dst->sa_family) { #ifdef INET case AF_INET: error = arpresolve(ifp, is_gw, m, dst, edst, NULL, NULL); if (error) return (error == EWOULDBLOCK ? 0 : error); snap_type = ETHERTYPE_IP; break; case AF_ARP: { struct arphdr *ah; ah = mtod(m, struct arphdr *); ah->ar_hrd = htons(ARPHRD_IEEE802); loop_copy = -1; /* if this is for us, don't do it */ switch(ntohs(ah->ar_op)) { case ARPOP_REVREQUEST: case ARPOP_REVREPLY: snap_type = ETHERTYPE_REVARP; break; case ARPOP_REQUEST: case ARPOP_REPLY: default: snap_type = ETHERTYPE_ARP; break; } if (m->m_flags & M_BCAST) bcopy(ifp->if_broadcastaddr, edst, ISO88025_ADDR_LEN); else bcopy(ar_tha(ah), edst, ISO88025_ADDR_LEN); } break; #endif /* INET */ #ifdef INET6 case AF_INET6: error = nd6_resolve(ifp, is_gw, m, dst, edst, NULL, NULL); if (error) return (error == EWOULDBLOCK ? 0 : error); snap_type = ETHERTYPE_IPV6; break; #endif /* INET6 */ case AF_UNSPEC: { const struct iso88025_sockaddr_data *sd; /* * For AF_UNSPEC sockaddr.sa_data must contain all of the * mac information needed to send the packet. This allows * full mac, llc, and source routing function to be controlled. * llc and source routing information must already be in the * mbuf provided, ac/fc are set in sa_data. sockaddr.sa_data * should be an iso88025_sockaddr_data structure see iso88025.h */ loop_copy = -1; sd = (const struct iso88025_sockaddr_data *)dst->sa_data; gen_th.ac = sd->ac; gen_th.fc = sd->fc; (void)memcpy(edst, sd->ether_dhost, ISO88025_ADDR_LEN); (void)memcpy(gen_th.iso88025_shost, sd->ether_shost, ISO88025_ADDR_LEN); rif_len = 0; break; } default: if_printf(ifp, "can't handle af%d\n", dst->sa_family); senderr(EAFNOSUPPORT); break; } /* * Add LLC header. 
*/ if (snap_type != 0) { struct llc *l; M_PREPEND(m, LLC_SNAPFRAMELEN, M_NOWAIT); if (m == NULL) senderr(ENOBUFS); l = mtod(m, struct llc *); l->llc_control = LLC_UI; l->llc_dsap = l->llc_ssap = LLC_SNAP_LSAP; l->llc_snap.org_code[0] = l->llc_snap.org_code[1] = l->llc_snap.org_code[2] = 0; l->llc_snap.ether_type = htons(snap_type); } /* * Add local net header. If no space in first mbuf, * allocate another. */ M_PREPEND(m, ISO88025_HDR_LEN + rif_len, M_NOWAIT); if (m == NULL) senderr(ENOBUFS); th = mtod(m, struct iso88025_header *); bcopy((caddr_t)edst, (caddr_t)&gen_th.iso88025_dhost, ISO88025_ADDR_LEN); /* Copy as much of the generic header as is needed into the mbuf */ memcpy(th, &gen_th, ISO88025_HDR_LEN + rif_len); /* * If a simplex interface, and the packet is being sent to our * Ethernet address or a broadcast address, loopback a copy. * XXX To make a simplex device behave exactly like a duplex * device, we should copy in the case of sending to our own * ethernet address (thus letting the original actually appear * on the wire). However, we don't do that here for security * reasons and compatibility with the original behavior. */ if ((ifp->if_flags & IFF_SIMPLEX) && (loop_copy != -1)) { if ((m->m_flags & M_BCAST) || (loop_copy > 0)) { struct mbuf *n; n = m_copym(m, 0, M_COPYALL, M_NOWAIT); (void) if_simloop(ifp, n, dst->sa_family, ISO88025_HDR_LEN); } else if (bcmp(th->iso88025_dhost, th->iso88025_shost, ETHER_ADDR_LEN) == 0) { (void) if_simloop(ifp, m, dst->sa_family, ISO88025_HDR_LEN); return(0); /* XXX */ } } IFQ_HANDOFF_ADJ(ifp, m, ISO88025_HDR_LEN + LLC_SNAPFRAMELEN, error); if (error) { printf("iso88025_output: packet dropped QFULL.\n"); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } return (error); bad: if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); if (m) m_freem(m); return (error); } /* * ISO 88025 de-encapsulation */ void iso88025_input(ifp, m) struct ifnet *ifp; struct mbuf *m; { struct iso88025_header *th; struct llc *l; int isr; int mac_hdr_len; /* * Do consistency checks to verify assumptions * made by code past this point. */ if ((m->m_flags & M_PKTHDR) == 0) { if_printf(ifp, "discard frame w/o packet header\n"); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } if (m->m_pkthdr.rcvif == NULL) { if_printf(ifp, "discard frame w/o interface pointer\n"); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } m = m_pullup(m, ISO88025_HDR_LEN); if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); goto dropanyway; } th = mtod(m, struct iso88025_header *); /* * Discard packet if interface is not up. */ if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) goto dropanyway; /* * Give bpf a chance at the packet. */ BPF_MTAP(ifp, m); /* * Interface marked for monitoring; discard packet. */ if (ifp->if_flags & IFF_MONITOR) { m_freem(m); return; } #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif /* * Update interface statistics. */ if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); getmicrotime(&ifp->if_lastchange); /* * Discard non local unicast packets when interface * is in promiscuous mode. */ if ((ifp->if_flags & IFF_PROMISC) && ((th->iso88025_dhost[0] & 1) == 0) && (bcmp(IF_LLADDR(ifp), (caddr_t) th->iso88025_dhost, ISO88025_ADDR_LEN) != 0)) goto dropanyway; /* * Set mbuf flags for bcast/mcast. 
*/ if (th->iso88025_dhost[0] & 1) { if (bcmp(iso88025_broadcastaddr, th->iso88025_dhost, ISO88025_ADDR_LEN) == 0) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } mac_hdr_len = ISO88025_HDR_LEN; /* Check for source routing info */ if (th->iso88025_shost[0] & TR_RII) mac_hdr_len += TR_RCF_RIFLEN(th->rcf); /* Strip off ISO88025 header. */ m_adj(m, mac_hdr_len); m = m_pullup(m, LLC_SNAPFRAMELEN); - if (m == 0) { + if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); goto dropanyway; } l = mtod(m, struct llc *); switch (l->llc_dsap) { case LLC_SNAP_LSAP: { u_int16_t type; if ((l->llc_control != LLC_UI) || (l->llc_ssap != LLC_SNAP_LSAP)) { if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; } if (l->llc_snap.org_code[0] != 0 || l->llc_snap.org_code[1] != 0 || l->llc_snap.org_code[2] != 0) { if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; } type = ntohs(l->llc_snap.ether_type); m_adj(m, LLC_SNAPFRAMELEN); switch (type) { #ifdef INET case ETHERTYPE_IP: th->iso88025_shost[0] &= ~(TR_RII); isr = NETISR_IP; break; case ETHERTYPE_ARP: if (ifp->if_flags & IFF_NOARP) goto dropanyway; isr = NETISR_ARP; break; #endif /* INET */ #ifdef INET6 case ETHERTYPE_IPV6: th->iso88025_shost[0] &= ~(TR_RII); isr = NETISR_IPV6; break; #endif /* INET6 */ default: printf("iso88025_input: unexpected llc_snap ether_type 0x%02x\n", type); if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; } break; } #ifdef ISO case LLC_ISO_LSAP: switch (l->llc_control) { case LLC_UI: if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; break; case LLC_XID: case LLC_XID_P: if(m->m_len < ISO88025_ADDR_LEN) goto dropanyway; l->llc_window = 0; l->llc_fid = 9; l->llc_class = 1; l->llc_dsap = l->llc_ssap = 0; /* Fall through to */ case LLC_TEST: case LLC_TEST_P: { struct sockaddr sa; struct iso88025_sockaddr_data *th2; int i; u_char c; c = l->llc_dsap; if (th->iso88025_shost[0] & TR_RII) { /* XXX */ printf("iso88025_input: dropping source routed LLC_TEST\n"); goto dropanyway; } l->llc_dsap = l->llc_ssap; l->llc_ssap = c; if (m->m_flags & (M_BCAST | M_MCAST)) bcopy((caddr_t)IF_LLADDR(ifp), (caddr_t)th->iso88025_dhost, ISO88025_ADDR_LEN); sa.sa_family = AF_UNSPEC; sa.sa_len = sizeof(sa); th2 = (struct iso88025_sockaddr_data *)sa.sa_data; for (i = 0; i < ISO88025_ADDR_LEN; i++) { th2->ether_shost[i] = c = th->iso88025_dhost[i]; th2->ether_dhost[i] = th->iso88025_dhost[i] = th->iso88025_shost[i]; th->iso88025_shost[i] = c; } th2->ac = TR_AC; th2->fc = TR_LLC_FRAME; ifp->if_output(ifp, m, &sa, NULL); return; } default: printf("iso88025_input: unexpected llc control 0x%02x\n", l->llc_control); if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; break; } break; #endif /* ISO */ default: printf("iso88025_input: unknown dsap 0x%x\n", l->llc_dsap); if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; break; } M_SETFIB(m, ifp->if_fib); netisr_dispatch(isr, m); return; dropanyway: if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); if (m) m_freem(m); return; } static int iso88025_resolvemulti (ifp, llsa, sa) struct ifnet *ifp; struct sockaddr **llsa; struct sockaddr *sa; { struct sockaddr_dl *sdl; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif u_char *e_addr; switch(sa->sa_family) { case AF_LINK: /* * No mapping needed. Just check that it's a valid MC address. 
*/ sdl = (struct sockaddr_dl *)sa; e_addr = LLADDR(sdl); if ((e_addr[0] & 1) != 1) { return (EADDRNOTAVAIL); } *llsa = NULL; return (0); #ifdef INET case AF_INET: sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { return (EADDRNOTAVAIL); } sdl = link_init_sdl(ifp, *llsa, IFT_ISO88025); sdl->sdl_alen = ISO88025_ADDR_LEN; e_addr = LLADDR(sdl); ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr); *llsa = (struct sockaddr *)sdl; return (0); #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* * An IP6 address of 0 means listen to all * of the Ethernet multicast address used for IP6. * (This is used for multicast routers.) */ ifp->if_flags |= IFF_ALLMULTI; *llsa = NULL; return (0); } if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { return (EADDRNOTAVAIL); } sdl = link_init_sdl(ifp, *llsa, IFT_ISO88025); sdl->sdl_alen = ISO88025_ADDR_LEN; e_addr = LLADDR(sdl); ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr); *llsa = (struct sockaddr *)sdl; return (0); #endif default: /* * Well, the text isn't quite right, but it's the name * that counts... */ return (EAFNOSUPPORT); } return (0); } static moduledata_t iso88025_mod = { .name = "iso88025", }; DECLARE_MODULE(iso88025, iso88025_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(iso88025, 1); Index: head/sys/net/iflib.c =================================================================== --- head/sys/net/iflib.c (revision 314067) +++ head/sys/net/iflib.c (revision 314068) @@ -1,5244 +1,5244 @@ /*- * Copyright (c) 2014-2017, Matthew Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Neither the name of Matthew Macy nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_acpi.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "ifdi_if.h" #if defined(__i386__) || defined(__amd64__) #include #include #include #include #include #include #endif /* * enable accounting of every mbuf as it comes in to and goes out of iflib's software descriptor references */ #define MEMORY_LOGGING 0 /* * Enable mbuf vectors for compressing long mbuf chains */ /* * NB: * - Prefetching in tx cleaning should perhaps be a tunable. The distance ahead * we prefetch needs to be determined by the time spent in m_free vis a vis * the cost of a prefetch. This will of course vary based on the workload: * - NFLX's m_free path is dominated by vm-based M_EXT manipulation which * is quite expensive, thus suggesting very little prefetch. * - small packet forwarding which is just returning a single mbuf to * UMA will typically be very fast vis a vis the cost of a memory * access. */ /* * File organization: * - private structures * - iflib private utility functions * - ifnet functions * - vlan registry and other exported functions * - iflib public core functions * * */ static MALLOC_DEFINE(M_IFLIB, "iflib", "ifnet library"); struct iflib_txq; typedef struct iflib_txq *iflib_txq_t; struct iflib_rxq; typedef struct iflib_rxq *iflib_rxq_t; struct iflib_fl; typedef struct iflib_fl *iflib_fl_t; struct iflib_ctx; typedef struct iflib_filter_info { driver_filter_t *ifi_filter; void *ifi_filter_arg; struct grouptask *ifi_task; struct iflib_ctx *ifi_ctx; } *iflib_filter_info_t; struct iflib_ctx { KOBJ_FIELDS; /* * Pointer to hardware driver's softc */ void *ifc_softc; device_t ifc_dev; if_t ifc_ifp; cpuset_t ifc_cpus; if_shared_ctx_t ifc_sctx; struct if_softc_ctx ifc_softc_ctx; struct mtx ifc_mtx; uint16_t ifc_nhwtxqs; uint16_t ifc_nhwrxqs; iflib_txq_t ifc_txqs; iflib_rxq_t ifc_rxqs; uint32_t ifc_if_flags; uint32_t ifc_flags; uint32_t ifc_max_fl_buf_size; int ifc_in_detach; int ifc_link_state; int ifc_link_irq; int ifc_pause_frames; int ifc_watchdog_events; struct cdev *ifc_led_dev; struct resource *ifc_msix_mem; struct if_irq ifc_legacy_irq; struct grouptask ifc_admin_task; struct grouptask ifc_vflr_task; struct iflib_filter_info ifc_filter_info; struct ifmedia ifc_media; struct sysctl_oid *ifc_sysctl_node; uint16_t ifc_sysctl_ntxqs; uint16_t ifc_sysctl_nrxqs; uint16_t ifc_sysctl_qs_eq_override; uint16_t ifc_sysctl_ntxds[8]; uint16_t ifc_sysctl_nrxds[8]; struct if_txrx ifc_txrx; #define isc_txd_encap ifc_txrx.ift_txd_encap #define isc_txd_flush ifc_txrx.ift_txd_flush #define isc_txd_credits_update ifc_txrx.ift_txd_credits_update #define isc_rxd_available ifc_txrx.ift_rxd_available #define isc_rxd_pkt_get ifc_txrx.ift_rxd_pkt_get #define isc_rxd_refill ifc_txrx.ift_rxd_refill #define isc_rxd_flush ifc_txrx.ift_rxd_flush #define isc_rxd_refill ifc_txrx.ift_rxd_refill #define isc_rxd_refill ifc_txrx.ift_rxd_refill #define isc_legacy_intr ifc_txrx.ift_legacy_intr eventhandler_tag ifc_vlan_attach_event; eventhandler_tag ifc_vlan_detach_event; uint8_t ifc_mac[ETHER_ADDR_LEN]; char ifc_mtx_name[16]; }; void * iflib_get_softc(if_ctx_t ctx) { return (ctx->ifc_softc); } device_t 
iflib_get_dev(if_ctx_t ctx) { return (ctx->ifc_dev); } if_t iflib_get_ifp(if_ctx_t ctx) { return (ctx->ifc_ifp); } struct ifmedia * iflib_get_media(if_ctx_t ctx) { return (&ctx->ifc_media); } void iflib_set_mac(if_ctx_t ctx, uint8_t mac[ETHER_ADDR_LEN]) { bcopy(mac, ctx->ifc_mac, ETHER_ADDR_LEN); } if_softc_ctx_t iflib_get_softc_ctx(if_ctx_t ctx) { return (&ctx->ifc_softc_ctx); } if_shared_ctx_t iflib_get_sctx(if_ctx_t ctx) { return (ctx->ifc_sctx); } #define CACHE_PTR_INCREMENT (CACHE_LINE_SIZE/sizeof(void*)) #define LINK_ACTIVE(ctx) ((ctx)->ifc_link_state == LINK_STATE_UP) #define CTX_IS_VF(ctx) ((ctx)->ifc_sctx->isc_flags & IFLIB_IS_VF) #define RX_SW_DESC_MAP_CREATED (1 << 0) #define TX_SW_DESC_MAP_CREATED (1 << 1) #define RX_SW_DESC_INUSE (1 << 3) #define TX_SW_DESC_MAPPED (1 << 4) typedef struct iflib_sw_rx_desc_array { bus_dmamap_t *ifsd_map; /* bus_dma maps for packet */ struct mbuf **ifsd_m; /* pkthdr mbufs */ caddr_t *ifsd_cl; /* direct cluster pointer for rx */ uint8_t *ifsd_flags; } iflib_rxsd_array_t; typedef struct iflib_sw_tx_desc_array { bus_dmamap_t *ifsd_map; /* bus_dma maps for packet */ struct mbuf **ifsd_m; /* pkthdr mbufs */ uint8_t *ifsd_flags; } iflib_txsd_array_t; /* magic number that should be high enough for any hardware */ #define IFLIB_MAX_TX_SEGS 128 #define IFLIB_MAX_RX_SEGS 32 #define IFLIB_RX_COPY_THRESH 63 #define IFLIB_MAX_RX_REFRESH 32 #define IFLIB_QUEUE_IDLE 0 #define IFLIB_QUEUE_HUNG 1 #define IFLIB_QUEUE_WORKING 2 /* this should really scale with ring size - 32 is a fairly arbitrary value for this */ #define TX_BATCH_SIZE 16 #define IFLIB_RESTART_BUDGET 8 #define IFC_LEGACY 0x01 #define IFC_QFLUSH 0x02 #define IFC_MULTISEG 0x04 #define IFC_DMAR 0x08 #define IFC_SC_ALLOCATED 0x10 #define IFC_INIT_DONE 0x20 #define CSUM_OFFLOAD (CSUM_IP_TSO|CSUM_IP6_TSO|CSUM_IP| \ CSUM_IP_UDP|CSUM_IP_TCP|CSUM_IP_SCTP| \ CSUM_IP6_UDP|CSUM_IP6_TCP|CSUM_IP6_SCTP) struct iflib_txq { uint16_t ift_in_use; uint16_t ift_cidx; uint16_t ift_cidx_processed; uint16_t ift_pidx; uint8_t ift_gen; uint8_t ift_db_pending; uint8_t ift_db_pending_queued; uint8_t ift_npending; uint8_t ift_br_offset; /* implicit pad */ uint64_t ift_processed; uint64_t ift_cleaned; #if MEMORY_LOGGING uint64_t ift_enqueued; uint64_t ift_dequeued; #endif uint64_t ift_no_tx_dma_setup; uint64_t ift_no_desc_avail; uint64_t ift_mbuf_defrag_failed; uint64_t ift_mbuf_defrag; uint64_t ift_map_failed; uint64_t ift_txd_encap_efbig; uint64_t ift_pullups; struct mtx ift_mtx; struct mtx ift_db_mtx; /* constant values */ if_ctx_t ift_ctx; struct ifmp_ring **ift_br; struct grouptask ift_task; uint16_t ift_size; uint16_t ift_id; struct callout ift_timer; struct callout ift_db_check; iflib_txsd_array_t ift_sds; uint8_t ift_nbr; uint8_t ift_qstatus; uint8_t ift_active; uint8_t ift_closed; int ift_watchdog_time; struct iflib_filter_info ift_filter_info; bus_dma_tag_t ift_desc_tag; bus_dma_tag_t ift_tso_desc_tag; iflib_dma_info_t ift_ifdi; #define MTX_NAME_LEN 16 char ift_mtx_name[MTX_NAME_LEN]; char ift_db_mtx_name[MTX_NAME_LEN]; bus_dma_segment_t ift_segs[IFLIB_MAX_TX_SEGS] __aligned(CACHE_LINE_SIZE); #ifdef IFLIB_DIAGNOSTICS uint64_t ift_cpu_exec_count[256]; #endif } __aligned(CACHE_LINE_SIZE); struct iflib_fl { uint16_t ifl_cidx; uint16_t ifl_pidx; uint16_t ifl_credits; uint8_t ifl_gen; #if MEMORY_LOGGING uint64_t ifl_m_enqueued; uint64_t ifl_m_dequeued; uint64_t ifl_cl_enqueued; uint64_t ifl_cl_dequeued; #endif /* implicit pad */ /* constant */ uint16_t ifl_size; uint16_t ifl_buf_size; uint16_t ifl_cltype; uma_zone_t 
ifl_zone; iflib_rxsd_array_t ifl_sds; iflib_rxq_t ifl_rxq; uint8_t ifl_id; bus_dma_tag_t ifl_desc_tag; iflib_dma_info_t ifl_ifdi; uint64_t ifl_bus_addrs[IFLIB_MAX_RX_REFRESH] __aligned(CACHE_LINE_SIZE); caddr_t ifl_vm_addrs[IFLIB_MAX_RX_REFRESH]; } __aligned(CACHE_LINE_SIZE); static inline int get_inuse(int size, int cidx, int pidx, int gen) { int used; if (pidx > cidx) used = pidx - cidx; else if (pidx < cidx) used = size - cidx + pidx; else if (gen == 0 && pidx == cidx) used = 0; else if (gen == 1 && pidx == cidx) used = size; else panic("bad state"); return (used); } #define TXQ_AVAIL(txq) (txq->ift_size - get_inuse(txq->ift_size, txq->ift_cidx, txq->ift_pidx, txq->ift_gen)) #define IDXDIFF(head, tail, wrap) \ ((head) >= (tail) ? (head) - (tail) : (wrap) - (tail) + (head)) struct iflib_rxq { /* If there is a separate completion queue - * these are the cq cidx and pidx. Otherwise * these are unused. */ uint16_t ifr_size; uint16_t ifr_cq_cidx; uint16_t ifr_cq_pidx; uint8_t ifr_cq_gen; uint8_t ifr_fl_offset; if_ctx_t ifr_ctx; iflib_fl_t ifr_fl; uint64_t ifr_rx_irq; uint16_t ifr_id; uint8_t ifr_lro_enabled; uint8_t ifr_nfl; struct lro_ctrl ifr_lc; struct grouptask ifr_task; struct iflib_filter_info ifr_filter_info; iflib_dma_info_t ifr_ifdi; /* dynamically allocate if any drivers need a value substantially larger than this */ struct if_rxd_frag ifr_frags[IFLIB_MAX_RX_SEGS] __aligned(CACHE_LINE_SIZE); #ifdef IFLIB_DIAGNOSTICS uint64_t ifr_cpu_exec_count[256]; #endif } __aligned(CACHE_LINE_SIZE); /* * Only allow a single packet to take up most 1/nth of the tx ring */ #define MAX_SINGLE_PACKET_FRACTION 12 #define IF_BAD_DMA (bus_addr_t)-1 static int enable_msix = 1; #define CTX_ACTIVE(ctx) ((if_getdrvflags((ctx)->ifc_ifp) & IFF_DRV_RUNNING)) #define CTX_LOCK_INIT(_sc, _name) mtx_init(&(_sc)->ifc_mtx, _name, "iflib ctx lock", MTX_DEF) #define CTX_LOCK(ctx) mtx_lock(&(ctx)->ifc_mtx) #define CTX_UNLOCK(ctx) mtx_unlock(&(ctx)->ifc_mtx) #define CTX_LOCK_DESTROY(ctx) mtx_destroy(&(ctx)->ifc_mtx) #define TXDB_LOCK_INIT(txq) mtx_init(&(txq)->ift_db_mtx, (txq)->ift_db_mtx_name, NULL, MTX_DEF) #define TXDB_TRYLOCK(txq) mtx_trylock(&(txq)->ift_db_mtx) #define TXDB_LOCK(txq) mtx_lock(&(txq)->ift_db_mtx) #define TXDB_UNLOCK(txq) mtx_unlock(&(txq)->ift_db_mtx) #define TXDB_LOCK_DESTROY(txq) mtx_destroy(&(txq)->ift_db_mtx) #define CALLOUT_LOCK(txq) mtx_lock(&txq->ift_mtx) #define CALLOUT_UNLOCK(txq) mtx_unlock(&txq->ift_mtx) /* Our boot-time initialization hook */ static int iflib_module_event_handler(module_t, int, void *); static moduledata_t iflib_moduledata = { "iflib", iflib_module_event_handler, NULL }; DECLARE_MODULE(iflib, iflib_moduledata, SI_SUB_INIT_IF, SI_ORDER_ANY); MODULE_VERSION(iflib, 1); MODULE_DEPEND(iflib, pci, 1, 1, 1); MODULE_DEPEND(iflib, ether, 1, 1, 1); TASKQGROUP_DEFINE(if_io_tqg, mp_ncpus, 1); TASKQGROUP_DEFINE(if_config_tqg, 1, 1); #ifndef IFLIB_DEBUG_COUNTERS #ifdef INVARIANTS #define IFLIB_DEBUG_COUNTERS 1 #else #define IFLIB_DEBUG_COUNTERS 0 #endif /* !INVARIANTS */ #endif static SYSCTL_NODE(_net, OID_AUTO, iflib, CTLFLAG_RD, 0, "iflib driver parameters"); /* * XXX need to ensure that this can't accidentally cause the head to be moved backwards */ static int iflib_min_tx_latency = 0; SYSCTL_INT(_net_iflib, OID_AUTO, min_tx_latency, CTLFLAG_RW, &iflib_min_tx_latency, 0, "minimize transmit latency at the possible expense of throughput"); #if IFLIB_DEBUG_COUNTERS static int iflib_tx_seen; static int iflib_tx_sent; static int iflib_tx_encap; static int iflib_rx_allocs; static int 
iflib_fl_refills; static int iflib_fl_refills_large; static int iflib_tx_frees; SYSCTL_INT(_net_iflib, OID_AUTO, tx_seen, CTLFLAG_RD, &iflib_tx_seen, 0, "# tx mbufs seen"); SYSCTL_INT(_net_iflib, OID_AUTO, tx_sent, CTLFLAG_RD, &iflib_tx_sent, 0, "# tx mbufs sent"); SYSCTL_INT(_net_iflib, OID_AUTO, tx_encap, CTLFLAG_RD, &iflib_tx_encap, 0, "# tx mbufs encapped"); SYSCTL_INT(_net_iflib, OID_AUTO, tx_frees, CTLFLAG_RD, &iflib_tx_frees, 0, "# tx frees"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_allocs, CTLFLAG_RD, &iflib_rx_allocs, 0, "# rx allocations"); SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills, CTLFLAG_RD, &iflib_fl_refills, 0, "# refills"); SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills_large, CTLFLAG_RD, &iflib_fl_refills_large, 0, "# large refills"); static int iflib_txq_drain_flushing; static int iflib_txq_drain_oactive; static int iflib_txq_drain_notready; static int iflib_txq_drain_encapfail; SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_flushing, CTLFLAG_RD, &iflib_txq_drain_flushing, 0, "# drain flushes"); SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_oactive, CTLFLAG_RD, &iflib_txq_drain_oactive, 0, "# drain oactives"); SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_notready, CTLFLAG_RD, &iflib_txq_drain_notready, 0, "# drain notready"); SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_encapfail, CTLFLAG_RD, &iflib_txq_drain_encapfail, 0, "# drain encap fails"); static int iflib_encap_load_mbuf_fail; static int iflib_encap_txq_avail_fail; static int iflib_encap_txd_encap_fail; SYSCTL_INT(_net_iflib, OID_AUTO, encap_load_mbuf_fail, CTLFLAG_RD, &iflib_encap_load_mbuf_fail, 0, "# busdma load failures"); SYSCTL_INT(_net_iflib, OID_AUTO, encap_txq_avail_fail, CTLFLAG_RD, &iflib_encap_txq_avail_fail, 0, "# txq avail failures"); SYSCTL_INT(_net_iflib, OID_AUTO, encap_txd_encap_fail, CTLFLAG_RD, &iflib_encap_txd_encap_fail, 0, "# driver encap failures"); static int iflib_task_fn_rxs; static int iflib_rx_intr_enables; static int iflib_fast_intrs; static int iflib_intr_link; static int iflib_intr_msix; static int iflib_rx_unavail; static int iflib_rx_ctx_inactive; static int iflib_rx_zero_len; static int iflib_rx_if_input; static int iflib_rx_mbuf_null; static int iflib_rxd_flush; static int iflib_verbose_debug; SYSCTL_INT(_net_iflib, OID_AUTO, intr_link, CTLFLAG_RD, &iflib_intr_link, 0, "# intr link calls"); SYSCTL_INT(_net_iflib, OID_AUTO, intr_msix, CTLFLAG_RD, &iflib_intr_msix, 0, "# intr msix calls"); SYSCTL_INT(_net_iflib, OID_AUTO, task_fn_rx, CTLFLAG_RD, &iflib_task_fn_rxs, 0, "# task_fn_rx calls"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_intr_enables, CTLFLAG_RD, &iflib_rx_intr_enables, 0, "# rx intr enables"); SYSCTL_INT(_net_iflib, OID_AUTO, fast_intrs, CTLFLAG_RD, &iflib_fast_intrs, 0, "# fast_intr calls"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_unavail, CTLFLAG_RD, &iflib_rx_unavail, 0, "# times rxeof called with no available data"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_ctx_inactive, CTLFLAG_RD, &iflib_rx_ctx_inactive, 0, "# times rxeof called with inactive context"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_zero_len, CTLFLAG_RD, &iflib_rx_zero_len, 0, "# times rxeof saw zero len mbuf"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_if_input, CTLFLAG_RD, &iflib_rx_if_input, 0, "# times rxeof called if_input"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_mbuf_null, CTLFLAG_RD, &iflib_rx_mbuf_null, 0, "# times rxeof got null mbuf"); SYSCTL_INT(_net_iflib, OID_AUTO, rxd_flush, CTLFLAG_RD, &iflib_rxd_flush, 0, "# times rxd_flush called"); SYSCTL_INT(_net_iflib, OID_AUTO, verbose_debug, CTLFLAG_RW, &iflib_verbose_debug, 0, "enable 
verbose debugging"); #define DBG_COUNTER_INC(name) atomic_add_int(&(iflib_ ## name), 1) static void iflib_debug_reset(void) { iflib_tx_seen = iflib_tx_sent = iflib_tx_encap = iflib_rx_allocs = iflib_fl_refills = iflib_fl_refills_large = iflib_tx_frees = iflib_txq_drain_flushing = iflib_txq_drain_oactive = iflib_txq_drain_notready = iflib_txq_drain_encapfail = iflib_encap_load_mbuf_fail = iflib_encap_txq_avail_fail = iflib_encap_txd_encap_fail = iflib_task_fn_rxs = iflib_rx_intr_enables = iflib_fast_intrs = iflib_intr_link = iflib_intr_msix = iflib_rx_unavail = iflib_rx_ctx_inactive = iflib_rx_zero_len = iflib_rx_if_input = iflib_rx_mbuf_null = iflib_rxd_flush = 0; } #else #define DBG_COUNTER_INC(name) static void iflib_debug_reset(void) {} #endif #define IFLIB_DEBUG 0 static void iflib_tx_structures_free(if_ctx_t ctx); static void iflib_rx_structures_free(if_ctx_t ctx); static int iflib_queues_alloc(if_ctx_t ctx); static int iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq); static int iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, int cidx, int budget); static int iflib_qset_structures_setup(if_ctx_t ctx); static int iflib_msix_init(if_ctx_t ctx); static int iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filterarg, int *rid, char *str); static void iflib_txq_check_drain(iflib_txq_t txq, int budget); static uint32_t iflib_txq_can_drain(struct ifmp_ring *); static int iflib_register(if_ctx_t); static void iflib_init_locked(if_ctx_t ctx); static void iflib_add_device_sysctl_pre(if_ctx_t ctx); static void iflib_add_device_sysctl_post(if_ctx_t ctx); static void iflib_ifmp_purge(iflib_txq_t txq); static void _iflib_pre_assert(if_softc_ctx_t scctx); #ifdef DEV_NETMAP #include #include #include MODULE_DEPEND(iflib, netmap, 1, 1, 1); /* * device-specific sysctl variables: * * iflib_crcstrip: 0: keep CRC in rx frames (default), 1: strip it. * During regular operations the CRC is stripped, but on some * hardware reception of frames not multiple of 64 is slower, * so using crcstrip=0 helps in benchmarks. * * iflib_rx_miss, iflib_rx_miss_bufs: * count packets that might be missed due to lost interrupts. */ SYSCTL_DECL(_dev_netmap); /* * The xl driver by default strips CRCs and we do not override it. */ int iflib_crcstrip = 1; SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_crcstrip, CTLFLAG_RW, &iflib_crcstrip, 1, "strip CRC on rx frames"); int iflib_rx_miss, iflib_rx_miss_bufs; SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss, CTLFLAG_RW, &iflib_rx_miss, 0, "potentially missed rx intr"); SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss_bufs, CTLFLAG_RW, &iflib_rx_miss_bufs, 0, "potentially missed rx intr bufs"); /* * Register/unregister. We are already under netmap lock. * Only called on the first register or the last unregister. */ static int iflib_netmap_register(struct netmap_adapter *na, int onoff) { struct ifnet *ifp = na->ifp; if_ctx_t ctx = ifp->if_softc; CTX_LOCK(ctx); IFDI_INTR_DISABLE(ctx); /* Tell the stack that the interface is no longer active */ ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); if (!CTX_IS_VF(ctx)) IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip); /* enable or disable flags and callbacks in na and ifp */ if (onoff) { nm_set_native_flags(na); } else { nm_clear_native_flags(na); } IFDI_INIT(ctx); IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip); // XXX why twice ? CTX_UNLOCK(ctx); return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1); } /* * Reconcile kernel and user view of the transmit ring. * * All information is in the kring. 
* Userspace wants to send packets up to the one before kring->rhead, * kernel knows kring->nr_hwcur is the first unsent packet. * * Here we push packets out (as many as possible), and possibly * reclaim buffers from previously completed transmission. * * The caller (netmap) guarantees that there is only one instance * running at any time. Any interference with other driver * methods should be handled by the individual drivers. */ static int iflib_netmap_txsync(struct netmap_kring *kring, int flags) { struct netmap_adapter *na = kring->na; struct ifnet *ifp = na->ifp; struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ u_int n; u_int const lim = kring->nkr_num_slots - 1; u_int const head = kring->rhead; struct if_pkt_info pi; /* * interrupts on every tx packet are expensive so request * them every half ring, or where NS_REPORT is set */ u_int report_frequency = kring->nkr_num_slots >> 1; /* device-specific */ if_ctx_t ctx = ifp->if_softc; iflib_txq_t txq = &ctx->ifc_txqs[kring->ring_id]; pi.ipi_segs = txq->ift_segs; pi.ipi_qsidx = kring->ring_id; pi.ipi_ndescs = 0; bus_dmamap_sync(txq->ift_desc_tag, txq->ift_ifdi->idi_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* * First part: process new packets to send. * nm_i is the current index in the netmap ring, * nic_i is the corresponding index in the NIC ring. * * If we have packets to send (nm_i != head) * iterate over the netmap ring, fetch length and update * the corresponding slot in the NIC ring. Some drivers also * need to update the buffer's physical address in the NIC slot * even NS_BUF_CHANGED is not set (PNMB computes the addresses). * * The netmap_reload_map() calls is especially expensive, * even when (as in this case) the tag is 0, so do only * when the buffer has actually changed. * * If possible do not set the report/intr bit on all slots, * but only a few times per ring or when NS_REPORT is set. * * Finally, on 10G and faster drivers, it might be useful * to prefetch the next slot and txr entry. */ nm_i = kring->nr_hwcur; if (nm_i != head) { /* we have new packets to send */ nic_i = netmap_idx_k2n(kring, nm_i); __builtin_prefetch(&ring->slot[nm_i]); __builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i]); __builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i]); for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; u_int len = slot->len; uint64_t paddr; void *addr = PNMB(na, slot, &paddr); int flags = (slot->flags & NS_REPORT || nic_i == 0 || nic_i == report_frequency) ? IPI_TX_INTR : 0; /* device-specific */ pi.ipi_pidx = nic_i; pi.ipi_flags = flags; /* Fill the slot in the NIC ring. 
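 * In outline (a sketch only; the statements that follow are authoritative),
 * each pass of this loop does roughly:
 *
 *     pi.ipi_pidx  = nic_i;                       select the descriptor
 *     pi.ipi_flags = want_intr ? IPI_TX_INTR : 0;
 *     ctx->isc_txd_encap(ctx->ifc_softc, &pi);    driver writes the descriptor
 *     nm_i  = nm_next(nm_i, lim);                 advance netmap index
 *     nic_i = nm_next(nic_i, lim);                advance NIC index
 *
 * where "want_intr" is just a name used in this sketch for the
 * NS_REPORT / report_frequency test made above.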
*/ ctx->isc_txd_encap(ctx->ifc_softc, &pi); /* prefetch for next round */ __builtin_prefetch(&ring->slot[nm_i + 1]); __builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i + 1]); __builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i + 1]); NM_CHECK_ADDR_LEN(na, addr, len); if (slot->flags & NS_BUF_CHANGED) { /* buffer has changed, reload map */ netmap_reload_map(na, txq->ift_desc_tag, txq->ift_sds.ifsd_map[nic_i], addr); } slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); /* make sure changes to the buffer are synced */ bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_sds.ifsd_map[nic_i], BUS_DMASYNC_PREWRITE); nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } kring->nr_hwcur = head; /* synchronize the NIC ring */ bus_dmamap_sync(txq->ift_desc_tag, txq->ift_ifdi->idi_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* (re)start the tx unit up to slot nic_i (excluded) */ ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, nic_i); } /* * Second part: reclaim buffers for completed transmissions. */ if (iflib_tx_credits_update(ctx, txq)) { /* some tx completed, increment avail */ nic_i = txq->ift_cidx_processed; kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); } return (0); } /* * Reconcile kernel and user view of the receive ring. * Same as for the txsync, this routine must be efficient. * The caller guarantees a single invocations, but races against * the rest of the driver should be handled here. * * On call, kring->rhead is the first packet that userspace wants * to keep, and kring->rcur is the wakeup point. * The kernel has previously reported packets up to kring->rtail. * * If (flags & NAF_FORCE_READ) also check for incoming packets irrespective * of whether or not we received an interrupt. */ static int iflib_netmap_rxsync(struct netmap_kring *kring, int flags) { struct netmap_adapter *na = kring->na; struct ifnet *ifp = na->ifp; struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ u_int i, n; u_int const lim = kring->nkr_num_slots - 1; u_int const head = kring->rhead; int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; struct if_rxd_info ri; /* device-specific */ if_ctx_t ctx = ifp->if_softc; iflib_rxq_t rxq = &ctx->ifc_rxqs[kring->ring_id]; iflib_fl_t fl = rxq->ifr_fl; if (head > lim) return netmap_ring_reinit(kring); bzero(&ri, sizeof(ri)); ri.iri_qsidx = kring->ring_id; ri.iri_ifp = ctx->ifc_ifp; /* XXX check sync modes */ for (i = 0, fl = rxq->ifr_fl; i < rxq->ifr_nfl; i++, fl++) bus_dmamap_sync(rxq->ifr_fl[i].ifl_desc_tag, fl->ifl_ifdi->idi_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* * First part: import newly received packets. * * nm_i is the index of the next free slot in the netmap ring, * nic_i is the index of the next received packet in the NIC ring, * and they may differ in case if_init() has been called while * in netmap mode. For the receive ring we have * * nic_i = rxr->next_check; * nm_i = kring->nr_hwtail (previous) * and * nm_i == (nic_i + kring->nkr_hwofs) % ring_size * * rxr->next_check is set to 0 on a ring reinit */ if (netmap_no_pendintr || force_update) { int crclen = iflib_crcstrip ? 
0 : 4; int error, avail; uint16_t slot_flags = kring->nkr_slot_flags; for (fl = rxq->ifr_fl, i = 0; i < rxq->ifr_nfl; i++, fl++) { nic_i = fl->ifl_cidx; nm_i = netmap_idx_n2k(kring, nic_i); avail = ctx->isc_rxd_available(ctx->ifc_softc, kring->ring_id, nic_i, INT_MAX); for (n = 0; avail > 0; n++, avail--) { error = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri); if (error) ring->slot[nm_i].len = 0; else ring->slot[nm_i].len = ri.iri_len - crclen; ring->slot[nm_i].flags = slot_flags; bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_sds.ifsd_map[nic_i], BUS_DMASYNC_POSTREAD); nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } if (n) { /* update the state variables */ if (netmap_no_pendintr && !force_update) { /* diagnostics */ iflib_rx_miss ++; iflib_rx_miss_bufs += n; } fl->ifl_cidx = nic_i; kring->nr_hwtail = nm_i; } kring->nr_kflags &= ~NKR_PENDINTR; } } /* * Second part: skip past packets that userspace has released. * (kring->nr_hwcur to head excluded), * and make the buffers available for reception. * As usual nm_i is the index in the netmap ring, * nic_i is the index in the NIC ring, and * nm_i == (nic_i + kring->nkr_hwofs) % ring_size */ /* XXX not sure how this will work with multiple free lists */ nm_i = kring->nr_hwcur; if (nm_i != head) { nic_i = netmap_idx_k2n(kring, nm_i); for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; caddr_t vaddr; void *addr = PNMB(na, slot, &paddr); if (addr == NETMAP_BUF_BASE(na)) /* bad buf */ goto ring_reset; vaddr = addr; if (slot->flags & NS_BUF_CHANGED) { /* buffer has changed, reload map */ netmap_reload_map(na, fl->ifl_ifdi->idi_tag, fl->ifl_sds.ifsd_map[nic_i], addr); slot->flags &= ~NS_BUF_CHANGED; } /* * XXX we should be batching this operation - TODO */ ctx->isc_rxd_refill(ctx->ifc_softc, rxq->ifr_id, fl->ifl_id, nic_i, &paddr, &vaddr, 1, fl->ifl_buf_size); bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_sds.ifsd_map[nic_i], BUS_DMASYNC_PREREAD); nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } kring->nr_hwcur = head; bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* * IMPORTANT: we must leave one free slot in the ring, * so move nic_i back by one unit */ nic_i = nm_prev(nic_i, lim); ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, fl->ifl_id, nic_i); } return 0; ring_reset: return netmap_ring_reinit(kring); } static int iflib_netmap_attach(if_ctx_t ctx) { struct netmap_adapter na; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; bzero(&na, sizeof(na)); na.ifp = ctx->ifc_ifp; na.na_flags = NAF_BDG_MAYSLEEP; MPASS(ctx->ifc_softc_ctx.isc_ntxqsets); MPASS(ctx->ifc_softc_ctx.isc_nrxqsets); na.num_tx_desc = scctx->isc_ntxd[0]; na.num_rx_desc = scctx->isc_nrxd[0]; na.nm_txsync = iflib_netmap_txsync; na.nm_rxsync = iflib_netmap_rxsync; na.nm_register = iflib_netmap_register; na.num_tx_rings = ctx->ifc_softc_ctx.isc_ntxqsets; na.num_rx_rings = ctx->ifc_softc_ctx.isc_nrxqsets; return (netmap_attach(&na)); } static void iflib_netmap_txq_init(if_ctx_t ctx, iflib_txq_t txq) { struct netmap_adapter *na = NA(ctx->ifc_ifp); struct netmap_slot *slot; slot = netmap_reset(na, NR_TX, txq->ift_id, 0); - if (slot == 0) + if (slot == NULL) return; for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxd[0]; i++) { /* * In netmap mode, set the map for the packet buffer. * NOTE: Some drivers (not this one) also need to set * the physical buffer address in the NIC ring. 
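 * (A rough worked example of the index translation used below, with
 *  invented numbers: nkr_num_slots = 1024 and nkr_hwofs = 3 give NIC
 *  index i = 1021 the netmap slot si = (1021 + 3) % 1024 = 0, i.e. the
 *  nm_i == (nic_i + kring->nkr_hwofs) % ring_size relation quoted in the
 *  rxsync comment above.)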
* netmap_idx_n2k() maps a nic index, i, into the corresponding * netmap slot index, si */ int si = netmap_idx_n2k(&na->tx_rings[txq->ift_id], i); netmap_load_map(na, txq->ift_desc_tag, txq->ift_sds.ifsd_map[i], NMB(na, slot + si)); } } static void iflib_netmap_rxq_init(if_ctx_t ctx, iflib_rxq_t rxq) { struct netmap_adapter *na = NA(ctx->ifc_ifp); struct netmap_slot *slot; bus_dmamap_t *map; int nrxd; slot = netmap_reset(na, NR_RX, rxq->ifr_id, 0); - if (slot == 0) + if (slot == NULL) return; map = rxq->ifr_fl[0].ifl_sds.ifsd_map; nrxd = ctx->ifc_softc_ctx.isc_nrxd[0]; for (int i = 0; i < nrxd; i++, map++) { int sj = netmap_idx_n2k(&na->rx_rings[rxq->ifr_id], i); uint64_t paddr; void *addr; caddr_t vaddr; vaddr = addr = PNMB(na, slot + sj, &paddr); netmap_load_map(na, rxq->ifr_fl[0].ifl_ifdi->idi_tag, *map, addr); /* Update descriptor and the cached value */ ctx->isc_rxd_refill(ctx->ifc_softc, rxq->ifr_id, 0 /* fl_id */, i, &paddr, &vaddr, 1, rxq->ifr_fl[0].ifl_buf_size); } /* preserve queue */ if (ctx->ifc_ifp->if_capenable & IFCAP_NETMAP) { struct netmap_kring *kring = &na->rx_rings[rxq->ifr_id]; int t = na->num_rx_desc - 1 - nm_kr_rxspace(kring); ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, 0 /* fl_id */, t); } else ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, 0 /* fl_id */, nrxd-1); } #define iflib_netmap_detach(ifp) netmap_detach(ifp) #else #define iflib_netmap_txq_init(ctx, txq) #define iflib_netmap_rxq_init(ctx, rxq) #define iflib_netmap_detach(ifp) #define iflib_netmap_attach(ctx) (0) #define netmap_rx_irq(ifp, qid, budget) (0) #endif #if defined(__i386__) || defined(__amd64__) static __inline void prefetch(void *x) { __asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x)); } #else #define prefetch(x) #endif static void _iflib_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int err) { if (err) return; *(bus_addr_t *) arg = segs[0].ds_addr; } int iflib_dma_alloc(if_ctx_t ctx, int size, iflib_dma_info_t dma, int mapflags) { int err; if_shared_ctx_t sctx = ctx->ifc_sctx; device_t dev = ctx->ifc_dev; KASSERT(sctx->isc_q_align != 0, ("alignment value not initialized")); err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */ sctx->isc_q_align, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ size, /* maxsize */ 1, /* nsegments */ size, /* maxsegsize */ BUS_DMA_ALLOCNOW, /* flags */ NULL, /* lockfunc */ NULL, /* lockarg */ &dma->idi_tag); if (err) { device_printf(dev, "%s: bus_dma_tag_create failed: %d\n", __func__, err); goto fail_0; } err = bus_dmamem_alloc(dma->idi_tag, (void**) &dma->idi_vaddr, BUS_DMA_NOWAIT | BUS_DMA_COHERENT | BUS_DMA_ZERO, &dma->idi_map); if (err) { device_printf(dev, "%s: bus_dmamem_alloc(%ju) failed: %d\n", __func__, (uintmax_t)size, err); goto fail_1; } dma->idi_paddr = IF_BAD_DMA; err = bus_dmamap_load(dma->idi_tag, dma->idi_map, dma->idi_vaddr, size, _iflib_dmamap_cb, &dma->idi_paddr, mapflags | BUS_DMA_NOWAIT); if (err || dma->idi_paddr == IF_BAD_DMA) { device_printf(dev, "%s: bus_dmamap_load failed: %d\n", __func__, err); goto fail_2; } dma->idi_size = size; return (0); fail_2: bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map); fail_1: bus_dma_tag_destroy(dma->idi_tag); fail_0: dma->idi_tag = NULL; return (err); } int iflib_dma_alloc_multi(if_ctx_t ctx, int *sizes, iflib_dma_info_t *dmalist, int mapflags, int count) { int i, err; iflib_dma_info_t *dmaiter; dmaiter = dmalist; for (i = 0; i < count; i++, dmaiter++) { if ((err = 
iflib_dma_alloc(ctx, sizes[i], *dmaiter, mapflags)) != 0) break; } if (err) iflib_dma_free_multi(dmalist, i); return (err); } void iflib_dma_free(iflib_dma_info_t dma) { if (dma->idi_tag == NULL) return; if (dma->idi_paddr != IF_BAD_DMA) { bus_dmamap_sync(dma->idi_tag, dma->idi_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(dma->idi_tag, dma->idi_map); dma->idi_paddr = IF_BAD_DMA; } if (dma->idi_vaddr != NULL) { bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map); dma->idi_vaddr = NULL; } bus_dma_tag_destroy(dma->idi_tag); dma->idi_tag = NULL; } void iflib_dma_free_multi(iflib_dma_info_t *dmalist, int count) { int i; iflib_dma_info_t *dmaiter = dmalist; for (i = 0; i < count; i++, dmaiter++) iflib_dma_free(*dmaiter); } #ifdef EARLY_AP_STARTUP static const int iflib_started = 1; #else /* * We used to abuse the smp_started flag to decide if the queues have been * fully initialized (by late taskqgroup_adjust() calls in a SYSINIT()). * That gave bad races, since the SYSINIT() runs strictly after smp_started * is set. Run a SYSINIT() strictly after that to just set a usable * completion flag. */ static int iflib_started; static void iflib_record_started(void *arg) { iflib_started = 1; } SYSINIT(iflib_record_started, SI_SUB_SMP + 1, SI_ORDER_FIRST, iflib_record_started, NULL); #endif static int iflib_fast_intr(void *arg) { iflib_filter_info_t info = arg; struct grouptask *gtask = info->ifi_task; if (!iflib_started) return (FILTER_HANDLED); DBG_COUNTER_INC(fast_intrs); if (info->ifi_filter != NULL && info->ifi_filter(info->ifi_filter_arg) == FILTER_HANDLED) return (FILTER_HANDLED); GROUPTASK_ENQUEUE(gtask); return (FILTER_HANDLED); } static int _iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid, driver_filter_t filter, driver_intr_t handler, void *arg, char *name) { int rc; struct resource *res; void *tag; device_t dev = ctx->ifc_dev; MPASS(rid < 512); irq->ii_rid = rid; res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &irq->ii_rid, RF_SHAREABLE | RF_ACTIVE); if (res == NULL) { device_printf(dev, "failed to allocate IRQ for rid %d, name %s.\n", rid, name); return (ENOMEM); } irq->ii_res = res; KASSERT(filter == NULL || handler == NULL, ("filter and handler can't both be non-NULL")); rc = bus_setup_intr(dev, res, INTR_MPSAFE | INTR_TYPE_NET, filter, handler, arg, &tag); if (rc != 0) { device_printf(dev, "failed to setup interrupt for rid %d, name %s: %d\n", rid, name ? name : "unknown", rc); return (rc); } else if (name) bus_describe_intr(dev, res, tag, "%s", name); irq->ii_tag = tag; return (0); } /********************************************************************* * * Allocate memory for tx_buffer structures. The tx_buffer stores all * the information needed to transmit a packet on the wire. This is * called only once at attach, setup is done every reset. * **********************************************************************/ static int iflib_txsd_alloc(iflib_txq_t txq) { if_ctx_t ctx = txq->ift_ctx; if_shared_ctx_t sctx = ctx->ifc_sctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; device_t dev = ctx->ifc_dev; int err, nsegments, ntsosegments; nsegments = scctx->isc_tx_nsegments; ntsosegments = scctx->isc_tx_tso_segments_max; MPASS(scctx->isc_ntxd[0] > 0); MPASS(scctx->isc_ntxd[txq->ift_br_offset] > 0); MPASS(nsegments > 0); MPASS(ntsosegments > 0); /* * Setup DMA descriptor areas. 
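 * Two tags are created below: one for ordinary frames (up to
 * sctx->isc_tx_maxsize bytes in at most scctx->isc_tx_nsegments segments)
 * and one for TSO bursts (isc_tx_tso_size_max / isc_tx_tso_segments_max).
 * As a rough, invented example of the kind of values a driver might
 * supply (not taken from any real driver):
 *
 *     isc_tx_maxsize          = 65535    largest non-TSO frame, bytes
 *     isc_tx_nsegments        = 32       S/G entries per packet
 *     isc_tx_tso_size_max     = 262144   largest TSO payload accepted
 *     isc_tx_tso_segments_max = 64       S/G entries per TSO burst
 *
 * which would allow a single TSO packet to span 64 discontiguous buffers.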
*/ if ((err = bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ sctx->isc_tx_maxsize, /* maxsize */ nsegments, /* nsegments */ sctx->isc_tx_maxsegsize, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txq->ift_desc_tag))) { device_printf(dev,"Unable to allocate TX DMA tag: %d\n", err); device_printf(dev,"maxsize: %zd nsegments: %d maxsegsize: %zd\n", sctx->isc_tx_maxsize, nsegments, sctx->isc_tx_maxsegsize); goto fail; } if ((err = bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ scctx->isc_tx_tso_size_max, /* maxsize */ ntsosegments, /* nsegments */ scctx->isc_tx_tso_segsize_max, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txq->ift_tso_desc_tag))) { device_printf(dev,"Unable to allocate TX TSO DMA tag: %d\n", err); goto fail; } if (!(txq->ift_sds.ifsd_flags = (uint8_t *) malloc(sizeof(uint8_t) * scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate tx_buffer memory\n"); err = ENOMEM; goto fail; } if (!(txq->ift_sds.ifsd_m = (struct mbuf **) malloc(sizeof(struct mbuf *) * scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate tx_buffer memory\n"); err = ENOMEM; goto fail; } /* Create the descriptor buffer dma maps */ #if defined(ACPI_DMAR) || (!(defined(__i386__) && !defined(__amd64__))) if ((ctx->ifc_flags & IFC_DMAR) == 0) return (0); if (!(txq->ift_sds.ifsd_map = (bus_dmamap_t *) malloc(sizeof(bus_dmamap_t) * scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate tx_buffer map memory\n"); err = ENOMEM; goto fail; } for (int i = 0; i < scctx->isc_ntxd[txq->ift_br_offset]; i++) { err = bus_dmamap_create(txq->ift_desc_tag, 0, &txq->ift_sds.ifsd_map[i]); if (err != 0) { device_printf(dev, "Unable to create TX DMA map\n"); goto fail; } } #endif return (0); fail: /* We free all, it handles case where we are in the middle */ iflib_tx_structures_free(ctx); return (err); } static void iflib_txsd_destroy(if_ctx_t ctx, iflib_txq_t txq, int i) { bus_dmamap_t map; map = NULL; if (txq->ift_sds.ifsd_map != NULL) map = txq->ift_sds.ifsd_map[i]; if (map != NULL) { bus_dmamap_unload(txq->ift_desc_tag, map); bus_dmamap_destroy(txq->ift_desc_tag, map); txq->ift_sds.ifsd_map[i] = NULL; } } static void iflib_txq_destroy(iflib_txq_t txq) { if_ctx_t ctx = txq->ift_ctx; for (int i = 0; i < txq->ift_size; i++) iflib_txsd_destroy(ctx, txq, i); if (txq->ift_sds.ifsd_map != NULL) { free(txq->ift_sds.ifsd_map, M_IFLIB); txq->ift_sds.ifsd_map = NULL; } if (txq->ift_sds.ifsd_m != NULL) { free(txq->ift_sds.ifsd_m, M_IFLIB); txq->ift_sds.ifsd_m = NULL; } if (txq->ift_sds.ifsd_flags != NULL) { free(txq->ift_sds.ifsd_flags, M_IFLIB); txq->ift_sds.ifsd_flags = NULL; } if (txq->ift_desc_tag != NULL) { bus_dma_tag_destroy(txq->ift_desc_tag); txq->ift_desc_tag = NULL; } if (txq->ift_tso_desc_tag != NULL) { bus_dma_tag_destroy(txq->ift_tso_desc_tag); txq->ift_tso_desc_tag = NULL; } } static void iflib_txsd_free(if_ctx_t ctx, iflib_txq_t txq, int i) { struct mbuf **mp; mp = &txq->ift_sds.ifsd_m[i]; if (*mp == NULL) return; if (txq->ift_sds.ifsd_map != NULL) { bus_dmamap_sync(txq->ift_desc_tag, txq->ift_sds.ifsd_map[i], BUS_DMASYNC_POSTWRITE); 
bus_dmamap_unload(txq->ift_desc_tag, txq->ift_sds.ifsd_map[i]); } m_free(*mp); DBG_COUNTER_INC(tx_frees); *mp = NULL; } static int iflib_txq_setup(iflib_txq_t txq) { if_ctx_t ctx = txq->ift_ctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; iflib_dma_info_t di; int i; /* Set number of descriptors available */ txq->ift_qstatus = IFLIB_QUEUE_IDLE; /* Reset indices */ txq->ift_cidx_processed = txq->ift_pidx = txq->ift_cidx = txq->ift_npending = 0; txq->ift_size = scctx->isc_ntxd[txq->ift_br_offset]; for (i = 0, di = txq->ift_ifdi; i < ctx->ifc_nhwtxqs; i++, di++) bzero((void *)di->idi_vaddr, di->idi_size); IFDI_TXQ_SETUP(ctx, txq->ift_id); for (i = 0, di = txq->ift_ifdi; i < ctx->ifc_nhwtxqs; i++, di++) bus_dmamap_sync(di->idi_tag, di->idi_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); return (0); } /********************************************************************* * * Allocate memory for rx_buffer structures. Since we use one * rx_buffer per received packet, the maximum number of rx_buffer's * that we'll need is equal to the number of receive descriptors * that we've allocated. * **********************************************************************/ static int iflib_rxsd_alloc(iflib_rxq_t rxq) { if_ctx_t ctx = rxq->ifr_ctx; if_shared_ctx_t sctx = ctx->ifc_sctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; device_t dev = ctx->ifc_dev; iflib_fl_t fl; int err; MPASS(scctx->isc_nrxd[0] > 0); MPASS(scctx->isc_nrxd[rxq->ifr_fl_offset] > 0); fl = rxq->ifr_fl; for (int i = 0; i < rxq->ifr_nfl; i++, fl++) { fl->ifl_size = scctx->isc_nrxd[rxq->ifr_fl_offset]; /* this isn't necessarily the same */ err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ sctx->isc_rx_maxsize, /* maxsize */ sctx->isc_rx_nsegments, /* nsegments */ sctx->isc_rx_maxsegsize, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockarg */ &fl->ifl_desc_tag); if (err) { device_printf(dev, "%s: bus_dma_tag_create failed %d\n", __func__, err); goto fail; } if (!(fl->ifl_sds.ifsd_flags = (uint8_t *) malloc(sizeof(uint8_t) * scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate tx_buffer memory\n"); err = ENOMEM; goto fail; } if (!(fl->ifl_sds.ifsd_m = (struct mbuf **) malloc(sizeof(struct mbuf *) * scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate tx_buffer memory\n"); err = ENOMEM; goto fail; } if (!(fl->ifl_sds.ifsd_cl = (caddr_t *) malloc(sizeof(caddr_t) * scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate tx_buffer memory\n"); err = ENOMEM; goto fail; } /* Create the descriptor buffer dma maps */ #if defined(ACPI_DMAR) || (!(defined(__i386__) && !defined(__amd64__))) if ((ctx->ifc_flags & IFC_DMAR) == 0) continue; if (!(fl->ifl_sds.ifsd_map = (bus_dmamap_t *) malloc(sizeof(bus_dmamap_t) * scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate tx_buffer map memory\n"); err = ENOMEM; goto fail; } for (int i = 0; i < scctx->isc_nrxd[rxq->ifr_fl_offset]; i++) { err = bus_dmamap_create(fl->ifl_desc_tag, 0, &fl->ifl_sds.ifsd_map[i]); if (err != 0) { device_printf(dev, "Unable to create TX DMA map\n"); goto fail; } } #endif } return (0); fail: iflib_rx_structures_free(ctx); return (err); } /* * Internal service routines */ struct rxq_refill_cb_arg { int error; 
bus_dma_segment_t seg; int nseg; }; static void _rxq_refill_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error) { struct rxq_refill_cb_arg *cb_arg = arg; cb_arg->error = error; cb_arg->seg = segs[0]; cb_arg->nseg = nseg; } #ifdef ACPI_DMAR #define IS_DMAR(ctx) (ctx->ifc_flags & IFC_DMAR) #else #define IS_DMAR(ctx) (0) #endif /** * _iflib_fl_refill - refill a free-buffer list * @ctx: the iflib context * @fl: the free list to refill * @count: the number of new buffers to allocate * * (Re)populate a free-buffer list with up to @count new packet buffers. * The caller must ensure that @count does not exceed the free list's capacity. */ static void _iflib_fl_refill(if_ctx_t ctx, iflib_fl_t fl, int count) { struct mbuf *m; int idx, pidx = fl->ifl_pidx; caddr_t cl, *sd_cl; struct mbuf **sd_m; uint8_t *sd_flags; bus_dmamap_t *sd_map; int n, i = 0; uint64_t bus_addr; int err; sd_m = fl->ifl_sds.ifsd_m; sd_map = fl->ifl_sds.ifsd_map; sd_cl = fl->ifl_sds.ifsd_cl; sd_flags = fl->ifl_sds.ifsd_flags; idx = pidx; n = count; MPASS(n > 0); MPASS(fl->ifl_credits + n <= fl->ifl_size); if (pidx < fl->ifl_cidx) MPASS(pidx + n <= fl->ifl_cidx); if (pidx == fl->ifl_cidx && (fl->ifl_credits < fl->ifl_size)) MPASS(fl->ifl_gen == 0); if (pidx > fl->ifl_cidx) MPASS(n <= fl->ifl_size - pidx + fl->ifl_cidx); DBG_COUNTER_INC(fl_refills); if (n > 8) DBG_COUNTER_INC(fl_refills_large); while (n--) { /* * We allocate an uninitialized mbuf + cluster; the mbuf is * initialized after rx. * * If the cluster is still set then we know a minimum-sized packet was received */ if ((cl = sd_cl[idx]) == NULL) { if ((cl = sd_cl[idx] = m_cljget(NULL, M_NOWAIT, fl->ifl_buf_size)) == NULL) break; #if MEMORY_LOGGING fl->ifl_cl_enqueued++; #endif } if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) { break; } #if MEMORY_LOGGING fl->ifl_m_enqueued++; #endif DBG_COUNTER_INC(rx_allocs); #ifdef notyet if ((sd_flags[pidx] & RX_SW_DESC_MAP_CREATED) == 0) { int err; if ((err = bus_dmamap_create(fl->ifl_ifdi->idi_tag, 0, &sd_map[idx]))) { log(LOG_WARNING, "bus_dmamap_create failed %d\n", err); uma_zfree(fl->ifl_zone, cl); n = 0; goto done; } sd_flags[idx] |= RX_SW_DESC_MAP_CREATED; } #endif #if defined(__i386__) || defined(__amd64__) if (!IS_DMAR(ctx)) { bus_addr = pmap_kextract((vm_offset_t)cl); } else #endif { struct rxq_refill_cb_arg cb_arg; iflib_rxq_t q; cb_arg.error = 0; q = fl->ifl_rxq; err = bus_dmamap_load(fl->ifl_desc_tag, sd_map[idx], cl, fl->ifl_buf_size, _rxq_refill_cb, &cb_arg, 0); if (err != 0 || cb_arg.error) { /* * !zone_pack ?
*/ if (fl->ifl_zone == zone_pack) uma_zfree(fl->ifl_zone, cl); m_free(m); n = 0; goto done; } bus_addr = cb_arg.seg.ds_addr; } sd_flags[idx] |= RX_SW_DESC_INUSE; MPASS(sd_m[idx] == NULL); sd_cl[idx] = cl; sd_m[idx] = m; fl->ifl_bus_addrs[i] = bus_addr; fl->ifl_vm_addrs[i] = cl; fl->ifl_credits++; i++; MPASS(fl->ifl_credits <= fl->ifl_size); if (++idx == fl->ifl_size) { fl->ifl_gen = 1; idx = 0; } if (n == 0 || i == IFLIB_MAX_RX_REFRESH) { ctx->isc_rxd_refill(ctx->ifc_softc, fl->ifl_rxq->ifr_id, fl->ifl_id, pidx, fl->ifl_bus_addrs, fl->ifl_vm_addrs, i, fl->ifl_buf_size); i = 0; pidx = idx; } fl->ifl_pidx = idx; } done: DBG_COUNTER_INC(rxd_flush); if (fl->ifl_pidx == 0) pidx = fl->ifl_size - 1; else pidx = fl->ifl_pidx - 1; ctx->isc_rxd_flush(ctx->ifc_softc, fl->ifl_rxq->ifr_id, fl->ifl_id, pidx); } static __inline void __iflib_fl_refill_lt(if_ctx_t ctx, iflib_fl_t fl, int max) { /* we avoid allowing pidx to catch up with cidx as it confuses ixl */ int32_t reclaimable = fl->ifl_size - fl->ifl_credits - 1; #ifdef INVARIANTS int32_t delta = fl->ifl_size - get_inuse(fl->ifl_size, fl->ifl_cidx, fl->ifl_pidx, fl->ifl_gen) - 1; #endif MPASS(fl->ifl_credits <= fl->ifl_size); MPASS(reclaimable == delta); if (reclaimable > 0) _iflib_fl_refill(ctx, fl, min(max, reclaimable)); } static void iflib_fl_bufs_free(iflib_fl_t fl) { iflib_dma_info_t idi = fl->ifl_ifdi; uint32_t i; for (i = 0; i < fl->ifl_size; i++) { struct mbuf **sd_m = &fl->ifl_sds.ifsd_m[i]; uint8_t *sd_flags = &fl->ifl_sds.ifsd_flags[i]; caddr_t *sd_cl = &fl->ifl_sds.ifsd_cl[i]; if (*sd_flags & RX_SW_DESC_INUSE) { if (fl->ifl_sds.ifsd_map != NULL) { bus_dmamap_t sd_map = fl->ifl_sds.ifsd_map[i]; bus_dmamap_unload(fl->ifl_desc_tag, sd_map); bus_dmamap_destroy(fl->ifl_desc_tag, sd_map); } if (*sd_m != NULL) { m_init(*sd_m, M_NOWAIT, MT_DATA, 0); uma_zfree(zone_mbuf, *sd_m); } if (*sd_cl != NULL) uma_zfree(fl->ifl_zone, *sd_cl); *sd_flags = 0; } else { MPASS(*sd_cl == NULL); MPASS(*sd_m == NULL); } #if MEMORY_LOGGING fl->ifl_m_dequeued++; fl->ifl_cl_dequeued++; #endif *sd_cl = NULL; *sd_m = NULL; } /* * Reset free list values */ fl->ifl_credits = fl->ifl_cidx = fl->ifl_pidx = fl->ifl_gen = 0;; bzero(idi->idi_vaddr, idi->idi_size); } /********************************************************************* * * Initialize a receive ring and its buffers. 
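 *  The cluster size chosen for a free list follows the largest frame the
 *  interface must receive; in outline (mirroring the tests in the body of
 *  iflib_fl_setup() below):
 *
 *      isc_max_frame_size <= 2048  ->  MCLBYTES      (2 KB clusters)
 *      isc_max_frame_size <= 4096  ->  MJUMPAGESIZE  (page-sized clusters)
 *      isc_max_frame_size <= 9216  ->  MJUM9BYTES    (9 KB clusters)
 *      otherwise                   ->  MJUM16BYTES   (16 KB clusters)
 *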
* **********************************************************************/ static int iflib_fl_setup(iflib_fl_t fl) { iflib_rxq_t rxq = fl->ifl_rxq; if_ctx_t ctx = rxq->ifr_ctx; if_softc_ctx_t sctx = &ctx->ifc_softc_ctx; /* ** Free current RX buffer structs and their mbufs */ iflib_fl_bufs_free(fl); /* Now replenish the mbufs */ MPASS(fl->ifl_credits == 0); /* * XXX don't set the max_frame_size to larger * than the hardware can handle */ if (sctx->isc_max_frame_size <= 2048) fl->ifl_buf_size = MCLBYTES; else if (sctx->isc_max_frame_size <= 4096) fl->ifl_buf_size = MJUMPAGESIZE; else if (sctx->isc_max_frame_size <= 9216) fl->ifl_buf_size = MJUM9BYTES; else fl->ifl_buf_size = MJUM16BYTES; if (fl->ifl_buf_size > ctx->ifc_max_fl_buf_size) ctx->ifc_max_fl_buf_size = fl->ifl_buf_size; fl->ifl_cltype = m_gettype(fl->ifl_buf_size); fl->ifl_zone = m_getzone(fl->ifl_buf_size); /* avoid pre-allocating zillions of clusters to an idle card * potentially speeding up attach */ _iflib_fl_refill(ctx, fl, min(128, fl->ifl_size)); MPASS(min(128, fl->ifl_size) == fl->ifl_credits); if (min(128, fl->ifl_size) != fl->ifl_credits) return (ENOBUFS); /* * handle failure */ MPASS(rxq != NULL); MPASS(fl->ifl_ifdi != NULL); bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); return (0); } /********************************************************************* * * Free receive ring data structures * **********************************************************************/ static void iflib_rx_sds_free(iflib_rxq_t rxq) { iflib_fl_t fl; int i; if (rxq->ifr_fl != NULL) { for (i = 0; i < rxq->ifr_nfl; i++) { fl = &rxq->ifr_fl[i]; if (fl->ifl_desc_tag != NULL) { bus_dma_tag_destroy(fl->ifl_desc_tag); fl->ifl_desc_tag = NULL; } free(fl->ifl_sds.ifsd_m, M_IFLIB); free(fl->ifl_sds.ifsd_cl, M_IFLIB); /* XXX destroy maps first */ free(fl->ifl_sds.ifsd_map, M_IFLIB); fl->ifl_sds.ifsd_m = NULL; fl->ifl_sds.ifsd_cl = NULL; fl->ifl_sds.ifsd_map = NULL; } free(rxq->ifr_fl, M_IFLIB); rxq->ifr_fl = NULL; rxq->ifr_cq_gen = rxq->ifr_cq_cidx = rxq->ifr_cq_pidx = 0; } } /* * MI independent logic * */ static void iflib_timer(void *arg) { iflib_txq_t txq = arg; if_ctx_t ctx = txq->ift_ctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)) return; /* ** Check on the state of the TX queue(s), this ** can be done without the lock because its RO ** and the HUNG state will be static if set. 
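** In outline (a sketch of the flow implemented below, not a separate
** code path):
**
**      IFDI_TIMER(ctx, txq->ift_id);
**      if (queue marked IFLIB_QUEUE_HUNG and no pause frames seen)
**              tear down and reinitialize via iflib_init_locked();
**      else if (few descriptors left or the mp_ring is stalled)
**              GROUPTASK_ENQUEUE(&txq->ift_task);
**      rearm the hz/2 callout while IFF_DRV_RUNNING is still set;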
*/ IFDI_TIMER(ctx, txq->ift_id); if ((txq->ift_qstatus == IFLIB_QUEUE_HUNG) && (ctx->ifc_pause_frames == 0)) goto hung; if (TXQ_AVAIL(txq) <= 2*scctx->isc_tx_nsegments || ifmp_ring_is_stalled(txq->ift_br[0])) GROUPTASK_ENQUEUE(&txq->ift_task); ctx->ifc_pause_frames = 0; if (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING) callout_reset_on(&txq->ift_timer, hz/2, iflib_timer, txq, txq->ift_timer.c_cpu); return; hung: CTX_LOCK(ctx); if_setdrvflagbits(ctx->ifc_ifp, 0, IFF_DRV_RUNNING); device_printf(ctx->ifc_dev, "TX(%d) desc avail = %d, pidx = %d\n", txq->ift_id, TXQ_AVAIL(txq), txq->ift_pidx); IFDI_WATCHDOG_RESET(ctx); ctx->ifc_watchdog_events++; ctx->ifc_pause_frames = 0; iflib_init_locked(ctx); CTX_UNLOCK(ctx); } static void iflib_init_locked(if_ctx_t ctx) { if_softc_ctx_t sctx = &ctx->ifc_softc_ctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; if_t ifp = ctx->ifc_ifp; iflib_fl_t fl; iflib_txq_t txq; iflib_rxq_t rxq; int i, j, tx_ip_csum_flags, tx_ip6_csum_flags; if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); IFDI_INTR_DISABLE(ctx); tx_ip_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP); tx_ip6_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_IP6_SCTP); /* Set hardware offload abilities */ if_clearhwassist(ifp); if (if_getcapenable(ifp) & IFCAP_TXCSUM) if_sethwassistbits(ifp, tx_ip_csum_flags, 0); if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6) if_sethwassistbits(ifp, tx_ip6_csum_flags, 0); if (if_getcapenable(ifp) & IFCAP_TSO4) if_sethwassistbits(ifp, CSUM_IP_TSO, 0); if (if_getcapenable(ifp) & IFCAP_TSO6) if_sethwassistbits(ifp, CSUM_IP6_TSO, 0); for (i = 0, txq = ctx->ifc_txqs; i < sctx->isc_ntxqsets; i++, txq++) { CALLOUT_LOCK(txq); callout_stop(&txq->ift_timer); callout_stop(&txq->ift_db_check); CALLOUT_UNLOCK(txq); iflib_netmap_txq_init(ctx, txq); } for (i = 0, rxq = ctx->ifc_rxqs; i < sctx->isc_nrxqsets; i++, rxq++) { iflib_netmap_rxq_init(ctx, rxq); } #ifdef INVARIANTS i = if_getdrvflags(ifp); #endif IFDI_INIT(ctx); MPASS(if_getdrvflags(ifp) == i); for (i = 0, rxq = ctx->ifc_rxqs; i < sctx->isc_nrxqsets; i++, rxq++) { for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) { if (iflib_fl_setup(fl)) { device_printf(ctx->ifc_dev, "freelist setup failed - check cluster settings\n"); goto done; } } } done: if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE); IFDI_INTR_ENABLE(ctx); txq = ctx->ifc_txqs; for (i = 0; i < sctx->isc_ntxqsets; i++, txq++) callout_reset_on(&txq->ift_timer, hz/2, iflib_timer, txq, txq->ift_timer.c_cpu); } static int iflib_media_change(if_t ifp) { if_ctx_t ctx = if_getsoftc(ifp); int err; CTX_LOCK(ctx); if ((err = IFDI_MEDIA_CHANGE(ctx)) == 0) iflib_init_locked(ctx); CTX_UNLOCK(ctx); return (err); } static void iflib_media_status(if_t ifp, struct ifmediareq *ifmr) { if_ctx_t ctx = if_getsoftc(ifp); CTX_LOCK(ctx); IFDI_UPDATE_ADMIN_STATUS(ctx); IFDI_MEDIA_STATUS(ctx, ifmr); CTX_UNLOCK(ctx); } static void iflib_stop(if_ctx_t ctx) { iflib_txq_t txq = ctx->ifc_txqs; iflib_rxq_t rxq = ctx->ifc_rxqs; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; iflib_dma_info_t di; iflib_fl_t fl; int i, j; /* Tell the stack that the interface is no longer active */ if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); IFDI_INTR_DISABLE(ctx); DELAY(100000); IFDI_STOP(ctx); DELAY(100000); iflib_debug_reset(); /* Wait for current tx queue users to exit to disarm watchdog timer. 
*/ for (i = 0; i < scctx->isc_ntxqsets; i++, txq++) { /* make sure all transmitters have completed before proceeding XXX */ /* clean any enqueued buffers */ iflib_ifmp_purge(txq); /* Free any existing tx buffers. */ for (j = 0; j < txq->ift_size; j++) { iflib_txsd_free(ctx, txq, j); } txq->ift_processed = txq->ift_cleaned = txq->ift_cidx_processed = 0; txq->ift_in_use = txq->ift_gen = txq->ift_cidx = txq->ift_pidx = txq->ift_no_desc_avail = 0; txq->ift_closed = txq->ift_mbuf_defrag = txq->ift_mbuf_defrag_failed = 0; txq->ift_no_tx_dma_setup = txq->ift_txd_encap_efbig = txq->ift_map_failed = 0; txq->ift_pullups = 0; ifmp_ring_reset_stats(txq->ift_br[0]); for (j = 0, di = txq->ift_ifdi; j < ctx->ifc_nhwtxqs; j++, di++) bzero((void *)di->idi_vaddr, di->idi_size); } for (i = 0; i < scctx->isc_nrxqsets; i++, rxq++) { /* make sure all transmitters have completed before proceeding XXX */ for (j = 0, di = txq->ift_ifdi; j < ctx->ifc_nhwrxqs; j++, di++) bzero((void *)di->idi_vaddr, di->idi_size); /* also resets the free lists pidx/cidx */ for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) iflib_fl_bufs_free(fl); } } static inline void prefetch_pkts(iflib_fl_t fl, int cidx) { int nextptr; int nrxd = fl->ifl_size; nextptr = (cidx + CACHE_PTR_INCREMENT) & (nrxd-1); prefetch(&fl->ifl_sds.ifsd_m[nextptr]); prefetch(&fl->ifl_sds.ifsd_cl[nextptr]); prefetch(fl->ifl_sds.ifsd_m[(cidx + 1) & (nrxd-1)]); prefetch(fl->ifl_sds.ifsd_m[(cidx + 2) & (nrxd-1)]); prefetch(fl->ifl_sds.ifsd_m[(cidx + 3) & (nrxd-1)]); prefetch(fl->ifl_sds.ifsd_m[(cidx + 4) & (nrxd-1)]); prefetch(fl->ifl_sds.ifsd_cl[(cidx + 1) & (nrxd-1)]); prefetch(fl->ifl_sds.ifsd_cl[(cidx + 2) & (nrxd-1)]); prefetch(fl->ifl_sds.ifsd_cl[(cidx + 3) & (nrxd-1)]); prefetch(fl->ifl_sds.ifsd_cl[(cidx + 4) & (nrxd-1)]); } static void rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, int *cltype, int unload, iflib_fl_t *pfl, int *pcidx) { int flid, cidx; bus_dmamap_t map; iflib_fl_t fl; iflib_dma_info_t di; int next; flid = irf->irf_flid; cidx = irf->irf_idx; fl = &rxq->ifr_fl[flid]; fl->ifl_credits--; #if MEMORY_LOGGING fl->ifl_m_dequeued++; if (cltype) fl->ifl_cl_dequeued++; #endif prefetch_pkts(fl, cidx); if (fl->ifl_sds.ifsd_map != NULL) { next = (cidx + CACHE_PTR_INCREMENT) & (fl->ifl_size-1); prefetch(&fl->ifl_sds.ifsd_map[next]); map = fl->ifl_sds.ifsd_map[cidx]; di = fl->ifl_ifdi; next = (cidx + CACHE_LINE_SIZE) & (fl->ifl_size-1); prefetch(&fl->ifl_sds.ifsd_flags[next]); bus_dmamap_sync(di->idi_tag, di->idi_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* not valid assert if bxe really does SGE from non-contiguous elements */ MPASS(fl->ifl_cidx == cidx); if (unload) bus_dmamap_unload(fl->ifl_desc_tag, map); } if (__predict_false(++fl->ifl_cidx == fl->ifl_size)) { fl->ifl_cidx = 0; fl->ifl_gen = 0; } /* YES ick */ if (cltype) *cltype = fl->ifl_cltype; *pfl = fl; *pcidx = cidx; } static struct mbuf * assemble_segments(iflib_rxq_t rxq, if_rxd_info_t ri) { int i, padlen , flags, cltype; struct mbuf *m, *mh, *mt, *sd_m; iflib_fl_t fl; int cidx; caddr_t cl, sd_cl; i = 0; mh = NULL; do { rxd_frag_to_sd(rxq, &ri->iri_frags[i], &cltype, TRUE, &fl, &cidx); sd_m = fl->ifl_sds.ifsd_m[cidx]; sd_cl = fl->ifl_sds.ifsd_cl[cidx]; MPASS(sd_cl != NULL); MPASS(sd_m != NULL); /* Don't include zero-length frags */ if (ri->iri_frags[i].irf_len == 0) { /* XXX we can save the cluster here, but not the mbuf */ m_init(sd_m, M_NOWAIT, MT_DATA, 0); m_free(sd_m); fl->ifl_sds.ifsd_m[cidx] = NULL; continue; } m = sd_m; if (mh == NULL) { flags = M_PKTHDR|M_EXT; mh = 
mt = m; padlen = ri->iri_pad; } else { flags = M_EXT; mt->m_next = m; mt = m; /* assuming padding is only on the first fragment */ padlen = 0; } fl->ifl_sds.ifsd_m[cidx] = NULL; cl = fl->ifl_sds.ifsd_cl[cidx]; fl->ifl_sds.ifsd_cl[cidx] = NULL; /* Can these two be made one ? */ m_init(m, M_NOWAIT, MT_DATA, flags); m_cljset(m, cl, cltype); /* * These must follow m_init and m_cljset */ m->m_data += padlen; ri->iri_len -= padlen; m->m_len = ri->iri_frags[i].irf_len; } while (++i < ri->iri_nfrags); return (mh); } /* * Process one software descriptor */ static struct mbuf * iflib_rxd_pkt_get(iflib_rxq_t rxq, if_rxd_info_t ri) { struct mbuf *m; iflib_fl_t fl; caddr_t sd_cl; int cidx; /* should I merge this back in now that the two paths are basically duplicated? */ if (ri->iri_nfrags == 1 && ri->iri_frags[0].irf_len <= IFLIB_RX_COPY_THRESH) { rxd_frag_to_sd(rxq, &ri->iri_frags[0], NULL, FALSE, &fl, &cidx); m = fl->ifl_sds.ifsd_m[cidx]; fl->ifl_sds.ifsd_m[cidx] = NULL; sd_cl = fl->ifl_sds.ifsd_cl[cidx]; m_init(m, M_NOWAIT, MT_DATA, M_PKTHDR); memcpy(m->m_data, sd_cl, ri->iri_len); m->m_len = ri->iri_frags[0].irf_len; } else { m = assemble_segments(rxq, ri); } m->m_pkthdr.len = ri->iri_len; m->m_pkthdr.rcvif = ri->iri_ifp; m->m_flags |= ri->iri_flags; m->m_pkthdr.ether_vtag = ri->iri_vtag; m->m_pkthdr.flowid = ri->iri_flowid; M_HASHTYPE_SET(m, ri->iri_rsstype); m->m_pkthdr.csum_flags = ri->iri_csum_flags; m->m_pkthdr.csum_data = ri->iri_csum_data; return (m); } static bool iflib_rxeof(iflib_rxq_t rxq, int budget) { if_ctx_t ctx = rxq->ifr_ctx; if_shared_ctx_t sctx = ctx->ifc_sctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; int avail, i; uint16_t *cidxp; struct if_rxd_info ri; int err, budget_left, rx_bytes, rx_pkts; iflib_fl_t fl; struct ifnet *ifp; int lro_enabled; /* * XXX early demux data packets so that if_input processing only handles * acks in interrupt context */ struct mbuf *m, *mh, *mt; if (netmap_rx_irq(ctx->ifc_ifp, rxq->ifr_id, &budget)) { return (FALSE); } mh = mt = NULL; MPASS(budget > 0); rx_pkts = rx_bytes = 0; if (sctx->isc_flags & IFLIB_HAS_RXCQ) cidxp = &rxq->ifr_cq_cidx; else cidxp = &rxq->ifr_fl[0].ifl_cidx; if ((avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget)) == 0) { for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++) __iflib_fl_refill_lt(ctx, fl, budget + 8); DBG_COUNTER_INC(rx_unavail); return (false); } for (budget_left = budget; (budget_left > 0) && (avail > 0); budget_left--, avail--) { if (__predict_false(!CTX_ACTIVE(ctx))) { DBG_COUNTER_INC(rx_ctx_inactive); break; } /* * Reset client set fields to their default values */ bzero(&ri, sizeof(ri)); ri.iri_qsidx = rxq->ifr_id; ri.iri_cidx = *cidxp; ri.iri_ifp = ctx->ifc_ifp; ri.iri_frags = rxq->ifr_frags; err = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri); /* in lieu of handling correctly - make sure it isn't being unhandled */ MPASS(err == 0); if (sctx->isc_flags & IFLIB_HAS_RXCQ) { *cidxp = ri.iri_cidx; /* Update our consumer index */ while (rxq->ifr_cq_cidx >= scctx->isc_nrxd[0]) { rxq->ifr_cq_cidx -= scctx->isc_nrxd[0]; rxq->ifr_cq_gen = 0; } /* was this only a completion queue message? 
*/ if (__predict_false(ri.iri_nfrags == 0)) continue; } MPASS(ri.iri_nfrags != 0); MPASS(ri.iri_len != 0); /* will advance the cidx on the corresponding free lists */ m = iflib_rxd_pkt_get(rxq, &ri); if (avail == 0 && budget_left) avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget_left); if (__predict_false(m == NULL)) { DBG_COUNTER_INC(rx_mbuf_null); continue; } /* imm_pkt: -- cxgb */ if (mh == NULL) mh = mt = m; else { mt->m_nextpkt = m; mt = m; } } /* make sure that we can refill faster than drain */ for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++) __iflib_fl_refill_lt(ctx, fl, budget + 8); ifp = ctx->ifc_ifp; lro_enabled = (if_getcapenable(ifp) & IFCAP_LRO); while (mh != NULL) { m = mh; mh = mh->m_nextpkt; m->m_nextpkt = NULL; rx_bytes += m->m_pkthdr.len; rx_pkts++; #if defined(INET6) || defined(INET) if (lro_enabled && tcp_lro_rx(&rxq->ifr_lc, m, 0) == 0) continue; #endif DBG_COUNTER_INC(rx_if_input); ifp->if_input(ifp, m); } if_inc_counter(ifp, IFCOUNTER_IBYTES, rx_bytes); if_inc_counter(ifp, IFCOUNTER_IPACKETS, rx_pkts); /* * Flush any outstanding LRO work */ #if defined(INET6) || defined(INET) tcp_lro_flush_all(&rxq->ifr_lc); #endif if (avail) return true; return (iflib_rxd_avail(ctx, rxq, *cidxp, 1)); } #define M_CSUM_FLAGS(m) ((m)->m_pkthdr.csum_flags) #define M_HAS_VLANTAG(m) (m->m_flags & M_VLANTAG) #define TXQ_MAX_DB_DEFERRED(size) (size >> 5) #define TXQ_MAX_DB_CONSUMED(size) (size >> 4) static __inline void iflib_txd_db_check(if_ctx_t ctx, iflib_txq_t txq, int ring) { uint32_t dbval; if (ring || txq->ift_db_pending >= TXQ_MAX_DB_DEFERRED(txq->ift_size)) { /* the lock will only ever be contended in the !min_latency case */ if (!TXDB_TRYLOCK(txq)) return; dbval = txq->ift_npending ? txq->ift_npending : txq->ift_pidx; ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, dbval); txq->ift_db_pending = txq->ift_npending = 0; TXDB_UNLOCK(txq); } } static void iflib_txd_deferred_db_check(void * arg) { iflib_txq_t txq = arg; /* simple non-zero boolean so use bitwise OR */ if ((txq->ift_db_pending | txq->ift_npending) && txq->ift_db_pending >= txq->ift_db_pending_queued) iflib_txd_db_check(txq->ift_ctx, txq, TRUE); txq->ift_db_pending_queued = 0; if (ifmp_ring_is_stalled(txq->ift_br[0])) iflib_txq_check_drain(txq, 4); } #ifdef PKT_DEBUG static void print_pkt(if_pkt_info_t pi) { printf("pi len: %d qsidx: %d nsegs: %d ndescs: %d flags: %x pidx: %d\n", pi->ipi_len, pi->ipi_qsidx, pi->ipi_nsegs, pi->ipi_ndescs, pi->ipi_flags, pi->ipi_pidx); printf("pi new_pidx: %d csum_flags: %lx tso_segsz: %d mflags: %x vtag: %d\n", pi->ipi_new_pidx, pi->ipi_csum_flags, pi->ipi_tso_segsz, pi->ipi_mflags, pi->ipi_vtag); printf("pi etype: %d ehdrlen: %d ip_hlen: %d ipproto: %d\n", pi->ipi_etype, pi->ipi_ehdrlen, pi->ipi_ip_hlen, pi->ipi_ipproto); } #endif #define IS_TSO4(pi) ((pi)->ipi_csum_flags & CSUM_IP_TSO) #define IS_TSO6(pi) ((pi)->ipi_csum_flags & CSUM_IP6_TSO) static int iflib_parse_header(iflib_txq_t txq, if_pkt_info_t pi, struct mbuf **mp) { if_shared_ctx_t sctx = txq->ift_ctx->ifc_sctx; struct ether_vlan_header *eh; struct mbuf *m, *n; n = m = *mp; if ((sctx->isc_flags & IFLIB_NEED_SCRATCH) && M_WRITABLE(m) == 0) { if ((m = m_dup(m, M_NOWAIT)) == NULL) { return (ENOMEM); } else { m_freem(*mp); n = *mp = m; } } /* * Determine where frame payload starts. * Jump over vlan headers if already present, * helpful for QinQ too. 
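 * Concretely (shown only as an illustration of the evl_encap_proto test
 * below):
 *
 *     untagged frame:       ipi_ehdrlen = ETHER_HDR_LEN                  (14)
 *     802.1Q tagged frame:  ipi_ehdrlen = ETHER_HDR_LEN +
 *                                         ETHER_VLAN_ENCAP_LEN           (18)
 *
 * and ipi_etype comes from evl_proto (tagged) or evl_encap_proto
 * (untagged), after ntohs().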
*/ if (__predict_false(m->m_len < sizeof(*eh))) { txq->ift_pullups++; if (__predict_false((m = m_pullup(m, sizeof(*eh))) == NULL)) return (ENOMEM); } eh = mtod(m, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { pi->ipi_etype = ntohs(eh->evl_proto); pi->ipi_ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { pi->ipi_etype = ntohs(eh->evl_encap_proto); pi->ipi_ehdrlen = ETHER_HDR_LEN; } switch (pi->ipi_etype) { #ifdef INET case ETHERTYPE_IP: { struct ip *ip = NULL; struct tcphdr *th = NULL; int minthlen; minthlen = min(m->m_pkthdr.len, pi->ipi_ehdrlen + sizeof(*ip) + sizeof(*th)); if (__predict_false(m->m_len < minthlen)) { /* * if this code bloat is causing too much of a hit * move it to a separate function and mark it noinline */ if (m->m_len == pi->ipi_ehdrlen) { n = m->m_next; MPASS(n); if (n->m_len >= sizeof(*ip)) { ip = (struct ip *)n->m_data; if (n->m_len >= (ip->ip_hl << 2) + sizeof(*th)) th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); } else { txq->ift_pullups++; if (__predict_false((m = m_pullup(m, minthlen)) == NULL)) return (ENOMEM); ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen); } } else { txq->ift_pullups++; if (__predict_false((m = m_pullup(m, minthlen)) == NULL)) return (ENOMEM); ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen); if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th)) th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); } } else { ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen); if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th)) th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); } pi->ipi_ip_hlen = ip->ip_hl << 2; pi->ipi_ipproto = ip->ip_p; pi->ipi_flags |= IPI_TX_IPV4; if (pi->ipi_csum_flags & CSUM_IP) ip->ip_sum = 0; if (pi->ipi_ipproto == IPPROTO_TCP) { if (__predict_false(th == NULL)) { txq->ift_pullups++; if (__predict_false((m = m_pullup(m, (ip->ip_hl << 2) + sizeof(*th))) == NULL)) return (ENOMEM); th = (struct tcphdr *)((caddr_t)ip + pi->ipi_ip_hlen); } pi->ipi_tcp_hflags = th->th_flags; pi->ipi_tcp_hlen = th->th_off << 2; pi->ipi_tcp_seq = th->th_seq; } if (IS_TSO4(pi)) { if (__predict_false(ip->ip_p != IPPROTO_TCP)) return (ENXIO); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP)); pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz; if (sctx->isc_flags & IFLIB_TSO_INIT_IP) { ip->ip_sum = 0; ip->ip_len = htons(pi->ipi_ip_hlen + pi->ipi_tcp_hlen + pi->ipi_tso_segsz); } } break; } #endif #ifdef INET6 case ETHERTYPE_IPV6: { struct ip6_hdr *ip6 = (struct ip6_hdr *)(m->m_data + pi->ipi_ehdrlen); struct tcphdr *th; pi->ipi_ip_hlen = sizeof(struct ip6_hdr); if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) { if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) == NULL)) return (ENOMEM); } th = (struct tcphdr *)((caddr_t)ip6 + pi->ipi_ip_hlen); /* XXX-BZ this will go badly in case of ext hdrs. */ pi->ipi_ipproto = ip6->ip6_nxt; pi->ipi_flags |= IPI_TX_IPV6; if (pi->ipi_ipproto == IPPROTO_TCP) { if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) { if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) == NULL)) return (ENOMEM); } pi->ipi_tcp_hflags = th->th_flags; pi->ipi_tcp_hlen = th->th_off << 2; } if (IS_TSO6(pi)) { if (__predict_false(ip6->ip6_nxt != IPPROTO_TCP)) return (ENXIO); /* * The corresponding flag is set by the stack in the IPv4 * TSO case, but not in IPv6 (at least in FreeBSD 10.2). 
* So, set it here because the rest of the flow requires it. */ pi->ipi_csum_flags |= CSUM_TCP_IPV6; th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz; } break; } #endif default: pi->ipi_csum_flags &= ~CSUM_OFFLOAD; pi->ipi_ip_hlen = 0; break; } *mp = m; return (0); } static __noinline struct mbuf * collapse_pkthdr(struct mbuf *m0) { struct mbuf *m, *m_next, *tmp; m = m0; m_next = m->m_next; while (m_next != NULL && m_next->m_len == 0) { m = m_next; m->m_next = NULL; m_free(m); m_next = m_next->m_next; } m = m0; m->m_next = m_next; if ((m_next->m_flags & M_EXT) == 0) { m = m_defrag(m, M_NOWAIT); } else { tmp = m_next->m_next; memcpy(m_next, m, MPKTHSIZE); m = m_next; m->m_next = tmp; } return (m); } /* * If dodgy hardware rejects the scatter gather chain we've handed it * we'll need to remove the mbuf chain from ifsg_m[] before we can add the * m_defrag'd mbufs */ static __noinline struct mbuf * iflib_remove_mbuf(iflib_txq_t txq) { int ntxd, i, pidx; struct mbuf *m, *mh, **ifsd_m; pidx = txq->ift_pidx; ifsd_m = txq->ift_sds.ifsd_m; ntxd = txq->ift_size; mh = m = ifsd_m[pidx]; ifsd_m[pidx] = NULL; #if MEMORY_LOGGING txq->ift_dequeued++; #endif i = 1; while (m) { ifsd_m[(pidx + i) & (ntxd -1)] = NULL; #if MEMORY_LOGGING txq->ift_dequeued++; #endif m = m->m_next; i++; } return (mh); } static int iflib_busdma_load_mbuf_sg(iflib_txq_t txq, bus_dma_tag_t tag, bus_dmamap_t map, struct mbuf **m0, bus_dma_segment_t *segs, int *nsegs, int max_segs, int flags) { if_ctx_t ctx; if_shared_ctx_t sctx; if_softc_ctx_t scctx; int i, next, pidx, mask, err, maxsegsz, ntxd, count; struct mbuf *m, *tmp, **ifsd_m, **mp; m = *m0; /* * Please don't ever do this */ if (__predict_false(m->m_len == 0)) *m0 = m = collapse_pkthdr(m); ctx = txq->ift_ctx; sctx = ctx->ifc_sctx; scctx = &ctx->ifc_softc_ctx; ifsd_m = txq->ift_sds.ifsd_m; ntxd = txq->ift_size; pidx = txq->ift_pidx; if (map != NULL) { uint8_t *ifsd_flags = txq->ift_sds.ifsd_flags; err = bus_dmamap_load_mbuf_sg(tag, map, *m0, segs, nsegs, BUS_DMA_NOWAIT); if (err) return (err); ifsd_flags[pidx] |= TX_SW_DESC_MAPPED; i = 0; next = pidx; mask = (txq->ift_size-1); m = *m0; do { mp = &ifsd_m[next]; *mp = m; m = m->m_next; if (__predict_false((*mp)->m_len == 0)) { m_free(*mp); *mp = NULL; } else next = (pidx + i) & (ntxd-1); } while (m != NULL); } else { int buflen, sgsize, max_sgsize; vm_offset_t vaddr; vm_paddr_t curaddr; count = i = 0; maxsegsz = sctx->isc_tx_maxsize; m = *m0; do { if (__predict_false(m->m_len <= 0)) { tmp = m; m = m->m_next; tmp->m_next = NULL; m_free(tmp); continue; } buflen = m->m_len; vaddr = (vm_offset_t)m->m_data; /* * see if we can't be smarter about physically * contiguous mappings */ next = (pidx + count) & (ntxd-1); MPASS(ifsd_m[next] == NULL); #if MEMORY_LOGGING txq->ift_enqueued++; #endif ifsd_m[next] = m; while (buflen > 0) { max_sgsize = MIN(buflen, maxsegsz); curaddr = pmap_kextract(vaddr); sgsize = PAGE_SIZE - (curaddr & PAGE_MASK); sgsize = MIN(sgsize, max_sgsize); segs[i].ds_addr = curaddr; segs[i].ds_len = sgsize; vaddr += sgsize; buflen -= sgsize; i++; if (i >= max_segs) goto err; } count++; tmp = m; m = m->m_next; } while (m != NULL); *nsegs = i; } return (0); err: *m0 = iflib_remove_mbuf(txq); return (EFBIG); } static int iflib_encap(iflib_txq_t txq, struct mbuf **m_headp) { if_ctx_t ctx; if_shared_ctx_t sctx; if_softc_ctx_t scctx; bus_dma_segment_t *segs; struct mbuf *m_head; bus_dmamap_t map; struct if_pkt_info pi; int remap = 0; int err, nsegs, ndesc, max_segs, pidx, cidx, 
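/*
 * A compact sketch (illustration only) of how the no-bus_dmamap branch of
 * iflib_busdma_load_mbuf_sg() above carves one mbuf data buffer into
 * physical segments, splitting at page boundaries:
 *
 *     while (buflen > 0) {
 *             curaddr = pmap_kextract(vaddr);
 *             sgsize  = min(buflen, PAGE_SIZE - (curaddr & PAGE_MASK));
 *             segs[i].ds_addr = curaddr;
 *             segs[i].ds_len  = sgsize;
 *             vaddr += sgsize; buflen -= sgsize; i++;
 *     }
 *
 * The real loop also caps sgsize at the tag's maximum segment size
 * (sctx->isc_tx_maxsize here) and bails out with EFBIG once max_segs
 * entries have been used.
 */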
next, ntxd; bus_dma_tag_t desc_tag; segs = txq->ift_segs; ctx = txq->ift_ctx; sctx = ctx->ifc_sctx; scctx = &ctx->ifc_softc_ctx; segs = txq->ift_segs; ntxd = txq->ift_size; m_head = *m_headp; map = NULL; /* * If we're doing TSO the next descriptor to clean may be quite far ahead */ cidx = txq->ift_cidx; pidx = txq->ift_pidx; next = (cidx + CACHE_PTR_INCREMENT) & (ntxd-1); /* prefetch the next cache line of mbuf pointers and flags */ prefetch(&txq->ift_sds.ifsd_m[next]); if (txq->ift_sds.ifsd_map != NULL) { prefetch(&txq->ift_sds.ifsd_map[next]); map = txq->ift_sds.ifsd_map[pidx]; next = (cidx + CACHE_LINE_SIZE) & (ntxd-1); prefetch(&txq->ift_sds.ifsd_flags[next]); } if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { desc_tag = txq->ift_tso_desc_tag; max_segs = scctx->isc_tx_tso_segments_max; } else { desc_tag = txq->ift_desc_tag; max_segs = scctx->isc_tx_nsegments; } m_head = *m_headp; bzero(&pi, sizeof(pi)); pi.ipi_len = m_head->m_pkthdr.len; pi.ipi_mflags = (m_head->m_flags & (M_VLANTAG|M_BCAST|M_MCAST)); pi.ipi_csum_flags = m_head->m_pkthdr.csum_flags; pi.ipi_vtag = (m_head->m_flags & M_VLANTAG) ? m_head->m_pkthdr.ether_vtag : 0; pi.ipi_pidx = pidx; pi.ipi_qsidx = txq->ift_id; /* deliberate bitwise OR to make one condition */ if (__predict_true((pi.ipi_csum_flags | pi.ipi_vtag))) { if (__predict_false((err = iflib_parse_header(txq, &pi, m_headp)) != 0)) return (err); m_head = *m_headp; } retry: err = iflib_busdma_load_mbuf_sg(txq, desc_tag, map, m_headp, segs, &nsegs, max_segs, BUS_DMA_NOWAIT); defrag: if (__predict_false(err)) { switch (err) { case EFBIG: /* try collapse once and defrag once */ if (remap == 0) m_head = m_collapse(*m_headp, M_NOWAIT, max_segs); if (remap == 1) m_head = m_defrag(*m_headp, M_NOWAIT); remap++; if (__predict_false(m_head == NULL)) goto defrag_failed; txq->ift_mbuf_defrag++; *m_headp = m_head; goto retry; break; case ENOMEM: txq->ift_no_tx_dma_setup++; break; default: txq->ift_no_tx_dma_setup++; m_freem(*m_headp); DBG_COUNTER_INC(tx_frees); *m_headp = NULL; break; } txq->ift_map_failed++; DBG_COUNTER_INC(encap_load_mbuf_fail); return (err); } /* * XXX assumes a 1 to 1 relationship between segments and * descriptors - this does not hold true on all drivers, e.g. 
* cxgb */ if (__predict_false(nsegs + 2 > TXQ_AVAIL(txq))) { txq->ift_no_desc_avail++; if (map != NULL) bus_dmamap_unload(desc_tag, map); DBG_COUNTER_INC(encap_txq_avail_fail); if ((txq->ift_task.gt_task.ta_flags & TASK_ENQUEUED) == 0) GROUPTASK_ENQUEUE(&txq->ift_task); return (ENOBUFS); } pi.ipi_segs = segs; pi.ipi_nsegs = nsegs; MPASS(pidx >= 0 && pidx < txq->ift_size); #ifdef PKT_DEBUG print_pkt(&pi); #endif if ((err = ctx->isc_txd_encap(ctx->ifc_softc, &pi)) == 0) { bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); DBG_COUNTER_INC(tx_encap); MPASS(pi.ipi_new_pidx >= 0 && pi.ipi_new_pidx < txq->ift_size); ndesc = pi.ipi_new_pidx - pi.ipi_pidx; if (pi.ipi_new_pidx < pi.ipi_pidx) { ndesc += txq->ift_size; txq->ift_gen = 1; } /* * drivers can need as many as * two sentinels */ MPASS(ndesc <= pi.ipi_nsegs + 2); MPASS(pi.ipi_new_pidx != pidx); MPASS(ndesc > 0); txq->ift_in_use += ndesc; /* * We update the last software descriptor again here because there may * be a sentinel and/or there may be more mbufs than segments */ txq->ift_pidx = pi.ipi_new_pidx; txq->ift_npending += pi.ipi_ndescs; } else if (__predict_false(err == EFBIG && remap < 2)) { *m_headp = m_head = iflib_remove_mbuf(txq); remap = 1; txq->ift_txd_encap_efbig++; goto defrag; } else DBG_COUNTER_INC(encap_txd_encap_fail); return (err); defrag_failed: txq->ift_mbuf_defrag_failed++; txq->ift_map_failed++; m_freem(*m_headp); DBG_COUNTER_INC(tx_frees); *m_headp = NULL; return (ENOMEM); } /* forward compatibility for cxgb */ #define FIRST_QSET(ctx) 0 #define NTXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_ntxqsets) #define NRXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_nrxqsets) #define QIDX(ctx, m) ((((m)->m_pkthdr.flowid & ctx->ifc_softc_ctx.isc_rss_table_mask) % NTXQSETS(ctx)) + FIRST_QSET(ctx)) #define DESC_RECLAIMABLE(q) ((int)((q)->ift_processed - (q)->ift_cleaned - (q)->ift_ctx->ifc_softc_ctx.isc_tx_nsegments)) #define RECLAIM_THRESH(ctx) ((ctx)->ifc_sctx->isc_tx_reclaim_thresh) #define MAX_TX_DESC(ctx) ((ctx)->ifc_softc_ctx.isc_tx_tso_segments_max) /* if there are more than TXQ_MIN_OCCUPANCY packets pending we consider deferring * doorbell writes * * ORing with 2 assures that min occupancy is never less than 2 without any conditional logic */ #define TXQ_MIN_OCCUPANCY(size) ((size >> 6)| 0x2) static inline int iflib_txq_min_occupancy(iflib_txq_t txq) { if_ctx_t ctx; ctx = txq->ift_ctx; return (get_inuse(txq->ift_size, txq->ift_cidx, txq->ift_pidx, txq->ift_gen) < TXQ_MIN_OCCUPANCY(txq->ift_size) + MAX_TX_DESC(ctx)); } static void iflib_tx_desc_free(iflib_txq_t txq, int n) { int hasmap; uint32_t qsize, cidx, mask, gen; struct mbuf *m, **ifsd_m; uint8_t *ifsd_flags; bus_dmamap_t *ifsd_map; cidx = txq->ift_cidx; gen = txq->ift_gen; qsize = txq->ift_size; mask = qsize-1; hasmap = txq->ift_sds.ifsd_map != NULL; ifsd_flags = txq->ift_sds.ifsd_flags; ifsd_m = txq->ift_sds.ifsd_m; ifsd_map = txq->ift_sds.ifsd_map; while (n--) { prefetch(ifsd_m[(cidx + 3) & mask]); prefetch(ifsd_m[(cidx + 4) & mask]); if (ifsd_m[cidx] != NULL) { prefetch(&ifsd_m[(cidx + CACHE_PTR_INCREMENT) & mask]); prefetch(&ifsd_flags[(cidx + CACHE_PTR_INCREMENT) & mask]); if (hasmap && (ifsd_flags[cidx] & TX_SW_DESC_MAPPED)) { /* * does it matter if it's not the TSO tag? 
If so we'll * have to add the type to flags */ bus_dmamap_unload(txq->ift_desc_tag, ifsd_map[cidx]); ifsd_flags[cidx] &= ~TX_SW_DESC_MAPPED; } if ((m = ifsd_m[cidx]) != NULL) { /* XXX we don't support any drivers that batch packets yet */ MPASS(m->m_nextpkt == NULL); m_free(m); ifsd_m[cidx] = NULL; #if MEMORY_LOGGING txq->ift_dequeued++; #endif DBG_COUNTER_INC(tx_frees); } } if (__predict_false(++cidx == qsize)) { cidx = 0; gen = 0; } } txq->ift_cidx = cidx; txq->ift_gen = gen; } static __inline int iflib_completed_tx_reclaim(iflib_txq_t txq, int thresh) { int reclaim; if_ctx_t ctx = txq->ift_ctx; KASSERT(thresh >= 0, ("invalid threshold to reclaim")); MPASS(thresh /*+ MAX_TX_DESC(txq->ift_ctx) */ < txq->ift_size); /* * Need a rate-limiting check so that this isn't called every time */ iflib_tx_credits_update(ctx, txq); reclaim = DESC_RECLAIMABLE(txq); if (reclaim <= thresh /* + MAX_TX_DESC(txq->ift_ctx) */) { #ifdef INVARIANTS if (iflib_verbose_debug) { printf("%s processed=%ju cleaned=%ju tx_nsegments=%d reclaim=%d thresh=%d\n", __FUNCTION__, txq->ift_processed, txq->ift_cleaned, txq->ift_ctx->ifc_softc_ctx.isc_tx_nsegments, reclaim, thresh); } #endif return (0); } iflib_tx_desc_free(txq, reclaim); txq->ift_cleaned += reclaim; txq->ift_in_use -= reclaim; if (txq->ift_active == FALSE) txq->ift_active = TRUE; return (reclaim); } static struct mbuf ** _ring_peek_one(struct ifmp_ring *r, int cidx, int offset) { return (__DEVOLATILE(struct mbuf **, &r->items[(cidx + offset) & (r->size-1)])); } static void iflib_txq_check_drain(iflib_txq_t txq, int budget) { ifmp_ring_check_drainage(txq->ift_br[0], budget); } static uint32_t iflib_txq_can_drain(struct ifmp_ring *r) { iflib_txq_t txq = r->cookie; if_ctx_t ctx = txq->ift_ctx; return ((TXQ_AVAIL(txq) > MAX_TX_DESC(ctx) + 2) || ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, txq->ift_cidx_processed, false)); } static uint32_t iflib_txq_drain(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx) { iflib_txq_t txq = r->cookie; if_ctx_t ctx = txq->ift_ctx; if_t ifp = ctx->ifc_ifp; struct mbuf **mp, *m; int i, count, consumed, pkt_sent, bytes_sent, mcast_sent, avail, err, in_use_prev, desc_used; if (__predict_false(!(if_getdrvflags(ifp) & IFF_DRV_RUNNING) || !LINK_ACTIVE(ctx))) { DBG_COUNTER_INC(txq_drain_notready); return (0); } avail = IDXDIFF(pidx, cidx, r->size); if (__predict_false(ctx->ifc_flags & IFC_QFLUSH)) { DBG_COUNTER_INC(txq_drain_flushing); for (i = 0; i < avail; i++) { m_free(r->items[(cidx + i) & (r->size-1)]); r->items[(cidx + i) & (r->size-1)] = NULL; } return (avail); } iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx)); if (__predict_false(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE)) { txq->ift_qstatus = IFLIB_QUEUE_IDLE; CALLOUT_LOCK(txq); callout_stop(&txq->ift_timer); callout_stop(&txq->ift_db_check); CALLOUT_UNLOCK(txq); DBG_COUNTER_INC(txq_drain_oactive); return (0); } consumed = mcast_sent = bytes_sent = pkt_sent = 0; count = MIN(avail, TX_BATCH_SIZE); #ifdef INVARIANTS if (iflib_verbose_debug) printf("%s avail=%d ifc_flags=%x txq_avail=%d ", __FUNCTION__, avail, ctx->ifc_flags, TXQ_AVAIL(txq)); #endif for (desc_used = i = 0; i < count && TXQ_AVAIL(txq) > MAX_TX_DESC(ctx) + 2; i++) { mp = _ring_peek_one(r, cidx, i); MPASS(mp != NULL && *mp != NULL); in_use_prev = txq->ift_in_use; if ((err = iflib_encap(txq, mp)) == ENOBUFS) { DBG_COUNTER_INC(txq_drain_encapfail); /* no room - bail out */ break; } consumed++; if (err) { DBG_COUNTER_INC(txq_drain_encapfail); /* we can't send this packet - skip it */ continue; } 
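/*
 * Illustrative sketch (added for exposition; not part of this change):
 * the IDXDIFF() use in this drain loop and the ndesc computation in
 * iflib_encap() above both rely on modular arithmetic over a ring
 * whose size is a power of two (enforced at attach time).  The helper
 * below restates that idiom; ring_idx_delta() is a hypothetical name
 * and does not exist in iflib.
 */
#if 0	/* example only, fenced off from compilation */
static inline int
ring_idx_delta(int old_idx, int new_idx, int ring_size)
{

	/* Distance the index advanced, accounting for wraparound. */
	return ((new_idx - old_idx) & (ring_size - 1));
}

/*
 * Usage: on a 1024-entry ring, advancing the producer from 1020 to 4
 * consumes ring_idx_delta(1020, 4, 1024) == 8 descriptors, the same
 * correction iflib_encap() applies with
 * "if (pi.ipi_new_pidx < pi.ipi_pidx) ndesc += txq->ift_size".
 */
#endif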
pkt_sent++; m = *mp; DBG_COUNTER_INC(tx_sent); bytes_sent += m->m_pkthdr.len; if (m->m_flags & M_MCAST) mcast_sent++; txq->ift_db_pending += (txq->ift_in_use - in_use_prev); desc_used += (txq->ift_in_use - in_use_prev); iflib_txd_db_check(ctx, txq, FALSE); ETHER_BPF_MTAP(ifp, m); if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))) break; if (desc_used >= TXQ_MAX_DB_CONSUMED(txq->ift_size)) break; } if ((iflib_min_tx_latency || iflib_txq_min_occupancy(txq)) && txq->ift_db_pending) iflib_txd_db_check(ctx, txq, TRUE); else if ((txq->ift_db_pending || TXQ_AVAIL(txq) <= MAX_TX_DESC(ctx) + 2) && (callout_pending(&txq->ift_db_check) == 0)) { txq->ift_db_pending_queued = txq->ift_db_pending; callout_reset_on(&txq->ift_db_check, 1, iflib_txd_deferred_db_check, txq, txq->ift_db_check.c_cpu); } if_inc_counter(ifp, IFCOUNTER_OBYTES, bytes_sent); if_inc_counter(ifp, IFCOUNTER_OPACKETS, pkt_sent); if (mcast_sent) if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast_sent); #ifdef INVARIANTS if (iflib_verbose_debug) printf("consumed=%d\n", consumed); #endif return (consumed); } static uint32_t iflib_txq_drain_always(struct ifmp_ring *r) { return (1); } static uint32_t iflib_txq_drain_free(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx) { int i, avail; struct mbuf **mp; iflib_txq_t txq; txq = r->cookie; txq->ift_qstatus = IFLIB_QUEUE_IDLE; CALLOUT_LOCK(txq); callout_stop(&txq->ift_timer); callout_stop(&txq->ift_db_check); CALLOUT_UNLOCK(txq); avail = IDXDIFF(pidx, cidx, r->size); for (i = 0; i < avail; i++) { mp = _ring_peek_one(r, cidx, i); m_freem(*mp); } MPASS(ifmp_ring_is_stalled(r) == 0); return (avail); } static void iflib_ifmp_purge(iflib_txq_t txq) { struct ifmp_ring *r; r = txq->ift_br[0]; r->drain = iflib_txq_drain_free; r->can_drain = iflib_txq_drain_always; ifmp_ring_check_drainage(r, r->size); r->drain = iflib_txq_drain; r->can_drain = iflib_txq_can_drain; } static void _task_fn_tx(void *context) { iflib_txq_t txq = context; if_ctx_t ctx = txq->ift_ctx; #ifdef IFLIB_DIAGNOSTICS txq->ift_cpu_exec_count[curcpu]++; #endif if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)) return; ifmp_ring_check_drainage(txq->ift_br[0], TX_BATCH_SIZE); } static void _task_fn_rx(void *context) { iflib_rxq_t rxq = context; if_ctx_t ctx = rxq->ifr_ctx; bool more; int rc; #ifdef IFLIB_DIAGNOSTICS rxq->ifr_cpu_exec_count[curcpu]++; #endif DBG_COUNTER_INC(task_fn_rxs); if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))) return; if ((more = iflib_rxeof(rxq, 16 /* XXX */)) == false) { if (ctx->ifc_flags & IFC_LEGACY) IFDI_INTR_ENABLE(ctx); else { DBG_COUNTER_INC(rx_intr_enables); rc = IFDI_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id); KASSERT(rc != ENOTSUP, ("MSI-X support requires queue_intr_enable, but not implemented in driver")); } } if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))) return; if (more) GROUPTASK_ENQUEUE(&rxq->ifr_task); } static void _task_fn_admin(void *context) { if_ctx_t ctx = context; if_softc_ctx_t sctx = &ctx->ifc_softc_ctx; iflib_txq_t txq; int i; if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)) return; CTX_LOCK(ctx); for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) { CALLOUT_LOCK(txq); callout_stop(&txq->ift_timer); CALLOUT_UNLOCK(txq); } IFDI_UPDATE_ADMIN_STATUS(ctx); for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) callout_reset_on(&txq->ift_timer, hz/2, iflib_timer, txq, txq->ift_timer.c_cpu); IFDI_LINK_INTR_ENABLE(ctx); CTX_UNLOCK(ctx); if (LINK_ACTIVE(ctx) == 0) return; for (txq = 
ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET); } static void _task_fn_iov(void *context) { if_ctx_t ctx = context; if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)) return; CTX_LOCK(ctx); IFDI_VFLR_HANDLE(ctx); CTX_UNLOCK(ctx); } static int iflib_sysctl_int_delay(SYSCTL_HANDLER_ARGS) { int err; if_int_delay_info_t info; if_ctx_t ctx; info = (if_int_delay_info_t)arg1; ctx = info->iidi_ctx; info->iidi_req = req; info->iidi_oidp = oidp; CTX_LOCK(ctx); err = IFDI_SYSCTL_INT_DELAY(ctx, info); CTX_UNLOCK(ctx); return (err); } /********************************************************************* * * IFNET FUNCTIONS * **********************************************************************/ static void iflib_if_init_locked(if_ctx_t ctx) { iflib_stop(ctx); iflib_init_locked(ctx); } static void iflib_if_init(void *arg) { if_ctx_t ctx = arg; CTX_LOCK(ctx); iflib_if_init_locked(ctx); CTX_UNLOCK(ctx); } static int iflib_if_transmit(if_t ifp, struct mbuf *m) { if_ctx_t ctx = if_getsoftc(ifp); iflib_txq_t txq; int err, qidx; if (__predict_false((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || !LINK_ACTIVE(ctx))) { DBG_COUNTER_INC(tx_frees); m_freem(m); return (ENOBUFS); } MPASS(m->m_nextpkt == NULL); qidx = 0; if ((NTXQSETS(ctx) > 1) && M_HASHTYPE_GET(m)) qidx = QIDX(ctx, m); /* * XXX calculate buf_ring based on flowid (divvy up bits?) */ txq = &ctx->ifc_txqs[qidx]; #ifdef DRIVER_BACKPRESSURE if (txq->ift_closed) { while (m != NULL) { next = m->m_nextpkt; m->m_nextpkt = NULL; m_freem(m); m = next; } return (ENOBUFS); } #endif #ifdef notyet qidx = count = 0; mp = marr; next = m; do { count++; next = next->m_nextpkt; } while (next != NULL); if (count > nitems(marr)) if ((mp = malloc(count*sizeof(struct mbuf *), M_IFLIB, M_NOWAIT)) == NULL) { /* XXX check nextpkt */ m_freem(m); /* XXX simplify for now */ DBG_COUNTER_INC(tx_frees); return (ENOBUFS); } for (next = m, i = 0; next != NULL; i++) { mp[i] = next; next = next->m_nextpkt; mp[i]->m_nextpkt = NULL; } #endif DBG_COUNTER_INC(tx_seen); err = ifmp_ring_enqueue(txq->ift_br[0], (void **)&m, 1, TX_BATCH_SIZE); if (err) { GROUPTASK_ENQUEUE(&txq->ift_task); /* support forthcoming later */ #ifdef DRIVER_BACKPRESSURE txq->ift_closed = TRUE; #endif ifmp_ring_check_drainage(txq->ift_br[0], TX_BATCH_SIZE); m_freem(m); } else if (TXQ_AVAIL(txq) < (txq->ift_size >> 1)) { GROUPTASK_ENQUEUE(&txq->ift_task); } return (err); } static void iflib_if_qflush(if_t ifp) { if_ctx_t ctx = if_getsoftc(ifp); iflib_txq_t txq = ctx->ifc_txqs; int i; CTX_LOCK(ctx); ctx->ifc_flags |= IFC_QFLUSH; CTX_UNLOCK(ctx); for (i = 0; i < NTXQSETS(ctx); i++, txq++) while (!(ifmp_ring_is_idle(txq->ift_br[0]) || ifmp_ring_is_stalled(txq->ift_br[0]))) iflib_txq_check_drain(txq, 0); CTX_LOCK(ctx); ctx->ifc_flags &= ~IFC_QFLUSH; CTX_UNLOCK(ctx); if_qflush(ifp); } #define IFCAP_FLAGS (IFCAP_TXCSUM_IPV6 | IFCAP_RXCSUM_IPV6 | IFCAP_HWCSUM | IFCAP_LRO | \ IFCAP_TSO4 | IFCAP_TSO6 | IFCAP_VLAN_HWTAGGING | \ IFCAP_VLAN_MTU | IFCAP_VLAN_HWFILTER | IFCAP_VLAN_HWTSO) static int iflib_if_ioctl(if_t ifp, u_long command, caddr_t data) { if_ctx_t ctx = if_getsoftc(ifp); struct ifreq *ifr = (struct ifreq *)data; #if defined(INET) || defined(INET6) struct ifaddr *ifa = (struct ifaddr *)data; #endif bool avoid_reset = FALSE; int err = 0, reinit = 0, bits; switch (command) { case SIOCSIFADDR: #ifdef INET if (ifa->ifa_addr->sa_family == AF_INET) avoid_reset = TRUE; #endif #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) avoid_reset = TRUE; #endif /* ** 
Calling init results in link renegotiation, ** so we avoid doing it when possible. */ if (avoid_reset) { if_setflagbits(ifp, IFF_UP,0); if (!(if_getdrvflags(ifp)& IFF_DRV_RUNNING)) reinit = 1; #ifdef INET if (!(if_getflags(ifp) & IFF_NOARP)) arp_ifinit(ifp, ifa); #endif } else err = ether_ioctl(ifp, command, data); break; case SIOCSIFMTU: CTX_LOCK(ctx); if (ifr->ifr_mtu == if_getmtu(ifp)) { CTX_UNLOCK(ctx); break; } bits = if_getdrvflags(ifp); /* stop the driver and free any clusters before proceeding */ iflib_stop(ctx); if ((err = IFDI_MTU_SET(ctx, ifr->ifr_mtu)) == 0) { if (ifr->ifr_mtu > ctx->ifc_max_fl_buf_size) ctx->ifc_flags |= IFC_MULTISEG; else ctx->ifc_flags &= ~IFC_MULTISEG; err = if_setmtu(ifp, ifr->ifr_mtu); } iflib_init_locked(ctx); if_setdrvflags(ifp, bits); CTX_UNLOCK(ctx); break; case SIOCSIFFLAGS: CTX_LOCK(ctx); if (if_getflags(ifp) & IFF_UP) { if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { if ((if_getflags(ifp) ^ ctx->ifc_if_flags) & (IFF_PROMISC | IFF_ALLMULTI)) { err = IFDI_PROMISC_SET(ctx, if_getflags(ifp)); } } else reinit = 1; } else if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { iflib_stop(ctx); } ctx->ifc_if_flags = if_getflags(ifp); CTX_UNLOCK(ctx); break; case SIOCADDMULTI: case SIOCDELMULTI: if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { CTX_LOCK(ctx); IFDI_INTR_DISABLE(ctx); IFDI_MULTI_SET(ctx); IFDI_INTR_ENABLE(ctx); CTX_UNLOCK(ctx); } break; case SIOCSIFMEDIA: CTX_LOCK(ctx); IFDI_MEDIA_SET(ctx); CTX_UNLOCK(ctx); /* falls thru */ case SIOCGIFMEDIA: err = ifmedia_ioctl(ifp, ifr, &ctx->ifc_media, command); break; case SIOCGI2C: { struct ifi2creq i2c; err = copyin(ifr->ifr_data, &i2c, sizeof(i2c)); if (err != 0) break; if (i2c.dev_addr != 0xA0 && i2c.dev_addr != 0xA2) { err = EINVAL; break; } if (i2c.len > sizeof(i2c.data)) { err = EINVAL; break; } if ((err = IFDI_I2C_REQ(ctx, &i2c)) == 0) err = copyout(&i2c, ifr->ifr_data, sizeof(i2c)); break; } case SIOCSIFCAP: { int mask, setmask; mask = ifr->ifr_reqcap ^ if_getcapenable(ifp); setmask = 0; #ifdef TCP_OFFLOAD setmask |= mask & (IFCAP_TOE4|IFCAP_TOE6); #endif setmask |= (mask & IFCAP_FLAGS); if (setmask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) setmask |= (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6); if ((mask & IFCAP_WOL) && (if_getcapabilities(ifp) & IFCAP_WOL) != 0) setmask |= (mask & (IFCAP_WOL_MCAST|IFCAP_WOL_MAGIC)); if_vlancap(ifp); /* * want to ensure that traffic has stopped before we change any of the flags */ if (setmask) { CTX_LOCK(ctx); bits = if_getdrvflags(ifp); if (bits & IFF_DRV_RUNNING) iflib_stop(ctx); if_togglecapenable(ifp, setmask); if (bits & IFF_DRV_RUNNING) iflib_init_locked(ctx); if_setdrvflags(ifp, bits); CTX_UNLOCK(ctx); } break; } case SIOCGPRIVATE_0: case SIOCSDRVSPEC: case SIOCGDRVSPEC: CTX_LOCK(ctx); err = IFDI_PRIV_IOCTL(ctx, command, data); CTX_UNLOCK(ctx); break; default: err = ether_ioctl(ifp, command, data); break; } if (reinit) iflib_if_init(ctx); return (err); } static uint64_t iflib_if_get_counter(if_t ifp, ift_counter cnt) { if_ctx_t ctx = if_getsoftc(ifp); return (IFDI_GET_COUNTER(ctx, cnt)); } /********************************************************************* * * OTHER FUNCTIONS EXPORTED TO THE STACK * **********************************************************************/ static void iflib_vlan_register(void *arg, if_t ifp, uint16_t vtag) { if_ctx_t ctx = if_getsoftc(ifp); if ((void *)ctx != arg) return; if ((vtag == 0) || (vtag > 4095)) return; CTX_LOCK(ctx); IFDI_VLAN_REGISTER(ctx, vtag); /* Re-init to load the changes */ if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER) 
iflib_init_locked(ctx); CTX_UNLOCK(ctx); } static void iflib_vlan_unregister(void *arg, if_t ifp, uint16_t vtag) { if_ctx_t ctx = if_getsoftc(ifp); if ((void *)ctx != arg) return; if ((vtag == 0) || (vtag > 4095)) return; CTX_LOCK(ctx); IFDI_VLAN_UNREGISTER(ctx, vtag); /* Re-init to load the changes */ if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER) iflib_init_locked(ctx); CTX_UNLOCK(ctx); } static void iflib_led_func(void *arg, int onoff) { if_ctx_t ctx = arg; CTX_LOCK(ctx); IFDI_LED_FUNC(ctx, onoff); CTX_UNLOCK(ctx); } /********************************************************************* * * BUS FUNCTION DEFINITIONS * **********************************************************************/ int iflib_device_probe(device_t dev) { pci_vendor_info_t *ent; uint16_t pci_vendor_id, pci_device_id; uint16_t pci_subvendor_id, pci_subdevice_id; uint16_t pci_rev_id; if_shared_ctx_t sctx; if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC) return (ENOTSUP); pci_vendor_id = pci_get_vendor(dev); pci_device_id = pci_get_device(dev); pci_subvendor_id = pci_get_subvendor(dev); pci_subdevice_id = pci_get_subdevice(dev); pci_rev_id = pci_get_revid(dev); if (sctx->isc_parse_devinfo != NULL) sctx->isc_parse_devinfo(&pci_device_id, &pci_subvendor_id, &pci_subdevice_id, &pci_rev_id); ent = sctx->isc_vendor_info; while (ent->pvi_vendor_id != 0) { if (pci_vendor_id != ent->pvi_vendor_id) { ent++; continue; } if ((pci_device_id == ent->pvi_device_id) && ((pci_subvendor_id == ent->pvi_subvendor_id) || (ent->pvi_subvendor_id == 0)) && ((pci_subdevice_id == ent->pvi_subdevice_id) || (ent->pvi_subdevice_id == 0)) && ((pci_rev_id == ent->pvi_rev_id) || (ent->pvi_rev_id == 0))) { device_set_desc_copy(dev, ent->pvi_name); /* this needs to be changed to zero if the bus probing code * ever stops re-probing on best match because the sctx * may have its values over written by register calls * in subsequent probes */ return (BUS_PROBE_DEFAULT); } ent++; } return (ENXIO); } int iflib_device_register(device_t dev, void *sc, if_shared_ctx_t sctx, if_ctx_t *ctxp) { int err, rid, msix, msix_bar; if_ctx_t ctx; if_t ifp; if_softc_ctx_t scctx; int i; uint16_t main_txq; uint16_t main_rxq; ctx = malloc(sizeof(* ctx), M_IFLIB, M_WAITOK|M_ZERO); if (sc == NULL) { sc = malloc(sctx->isc_driver->size, M_IFLIB, M_WAITOK|M_ZERO); device_set_softc(dev, ctx); ctx->ifc_flags |= IFC_SC_ALLOCATED; } ctx->ifc_sctx = sctx; ctx->ifc_dev = dev; ctx->ifc_softc = sc; if ((err = iflib_register(ctx)) != 0) { device_printf(dev, "iflib_register failed %d\n", err); return (err); } iflib_add_device_sysctl_pre(ctx); scctx = &ctx->ifc_softc_ctx; ifp = ctx->ifc_ifp; /* * XXX sanity check that ntxd & nrxd are a power of 2 */ if (ctx->ifc_sysctl_ntxqs != 0) scctx->isc_ntxqsets = ctx->ifc_sysctl_ntxqs; if (ctx->ifc_sysctl_nrxqs != 0) scctx->isc_nrxqsets = ctx->ifc_sysctl_nrxqs; for (i = 0; i < sctx->isc_ntxqs; i++) { if (ctx->ifc_sysctl_ntxds[i] != 0) scctx->isc_ntxd[i] = ctx->ifc_sysctl_ntxds[i]; else scctx->isc_ntxd[i] = sctx->isc_ntxd_default[i]; } for (i = 0; i < sctx->isc_nrxqs; i++) { if (ctx->ifc_sysctl_nrxds[i] != 0) scctx->isc_nrxd[i] = ctx->ifc_sysctl_nrxds[i]; else scctx->isc_nrxd[i] = sctx->isc_nrxd_default[i]; } for (i = 0; i < sctx->isc_nrxqs; i++) { if (scctx->isc_nrxd[i] < sctx->isc_nrxd_min[i]) { device_printf(dev, "nrxd%d: %d less than nrxd_min %d - resetting to min\n", i, scctx->isc_nrxd[i], sctx->isc_nrxd_min[i]); scctx->isc_nrxd[i] = sctx->isc_nrxd_min[i]; } if (scctx->isc_nrxd[i] > sctx->isc_nrxd_max[i]) { 
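/*
 * Illustrative sketch (added for exposition; not part of this change):
 * these loops clamp the sysctl-requested descriptor counts to the
 * driver's advertised [min, max] range, and the checks further down
 * insist on a power of two (the nearby XXX asks about rounding down
 * instead of failing).  A hypothetical clamp_ndesc() helper folding
 * those rules together could look like this; it is not an iflib
 * function and assumes min_d is itself a power of two.
 */
#if 0	/* example only, fenced off from compilation */
static uint32_t
clamp_ndesc(uint32_t req, uint32_t min_d, uint32_t max_d)
{
	uint32_t n;

	/* Honor the driver-advertised bounds first (min_d is nonzero). */
	n = MAX(req, min_d);
	n = MIN(n, max_d);
	/*
	 * Clear low-order bits until a single bit remains, i.e. the
	 * largest power of two that is <= n.
	 */
	while (n & (n - 1))
		n &= n - 1;
	return (n);
}
#endif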
device_printf(dev, "nrxd%d: %d greater than nrxd_max %d - resetting to max\n", i, scctx->isc_nrxd[i], sctx->isc_nrxd_max[i]); scctx->isc_nrxd[i] = sctx->isc_nrxd_max[i]; } } for (i = 0; i < sctx->isc_ntxqs; i++) { if (scctx->isc_ntxd[i] < sctx->isc_ntxd_min[i]) { device_printf(dev, "ntxd%d: %d less than ntxd_min %d - resetting to min\n", i, scctx->isc_ntxd[i], sctx->isc_ntxd_min[i]); scctx->isc_ntxd[i] = sctx->isc_ntxd_min[i]; } if (scctx->isc_ntxd[i] > sctx->isc_ntxd_max[i]) { device_printf(dev, "ntxd%d: %d greater than ntxd_max %d - resetting to max\n", i, scctx->isc_ntxd[i], sctx->isc_ntxd_max[i]); scctx->isc_ntxd[i] = sctx->isc_ntxd_max[i]; } } if ((err = IFDI_ATTACH_PRE(ctx)) != 0) { device_printf(dev, "IFDI_ATTACH_PRE failed %d\n", err); return (err); } _iflib_pre_assert(scctx); ctx->ifc_txrx = *scctx->isc_txrx; #ifdef INVARIANTS MPASS(scctx->isc_capenable); if (scctx->isc_capenable & IFCAP_TXCSUM) MPASS(scctx->isc_tx_csum_flags); #endif if_setcapabilities(ifp, scctx->isc_capenable); if_setcapenable(ifp, scctx->isc_capenable); if (scctx->isc_ntxqsets == 0 || (scctx->isc_ntxqsets_max && scctx->isc_ntxqsets_max < scctx->isc_ntxqsets)) scctx->isc_ntxqsets = scctx->isc_ntxqsets_max; if (scctx->isc_nrxqsets == 0 || (scctx->isc_nrxqsets_max && scctx->isc_nrxqsets_max < scctx->isc_nrxqsets)) scctx->isc_nrxqsets = scctx->isc_nrxqsets_max; #ifdef ACPI_DMAR if (dmar_get_dma_tag(device_get_parent(dev), dev) != NULL) ctx->ifc_flags |= IFC_DMAR; #endif msix_bar = scctx->isc_msix_bar; if(sctx->isc_flags & IFLIB_HAS_TXCQ) main_txq = 1; else main_txq = 0; if(sctx->isc_flags & IFLIB_HAS_RXCQ) main_rxq = 1; else main_rxq = 0; /* XXX change for per-queue sizes */ device_printf(dev, "using %d tx descriptors and %d rx descriptors\n", scctx->isc_ntxd[main_txq], scctx->isc_nrxd[main_rxq]); for (i = 0; i < sctx->isc_nrxqs; i++) { if (!powerof2(scctx->isc_nrxd[i])) { /* round down instead? */ device_printf(dev, "# rx descriptors must be a power of 2\n"); err = EINVAL; goto fail; } } for (i = 0; i < sctx->isc_ntxqs; i++) { if (!powerof2(scctx->isc_ntxd[i])) { device_printf(dev, "# tx descriptors must be a power of 2"); err = EINVAL; goto fail; } } if (scctx->isc_tx_nsegments > scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION) scctx->isc_tx_nsegments = max(1, scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION); if (scctx->isc_tx_tso_segments_max > scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION) scctx->isc_tx_tso_segments_max = max(1, scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION); /* * Protect the stack against modern hardware */ if (scctx->isc_tx_tso_size_max > FREEBSD_TSO_SIZE_MAX) scctx->isc_tx_tso_size_max = FREEBSD_TSO_SIZE_MAX; /* TSO parameters - dig these out of the data sheet - simply correspond to tag setup */ ifp->if_hw_tsomaxsegcount = scctx->isc_tx_tso_segments_max; ifp->if_hw_tsomax = scctx->isc_tx_tso_size_max; ifp->if_hw_tsomaxsegsize = scctx->isc_tx_tso_segsize_max; if (scctx->isc_rss_table_size == 0) scctx->isc_rss_table_size = 64; scctx->isc_rss_table_mask = scctx->isc_rss_table_size-1; GROUPTASK_INIT(&ctx->ifc_admin_task, 0, _task_fn_admin, ctx); /* XXX format name */ taskqgroup_attach(qgroup_if_config_tqg, &ctx->ifc_admin_task, ctx, -1, "admin"); /* ** Now setup MSI or MSI/X, should ** return us the number of supported ** vectors. 
(Will be 1 for MSI) */ if (sctx->isc_flags & IFLIB_SKIP_MSIX) { msix = scctx->isc_vectors; } else if (scctx->isc_msix_bar != 0) /* * The simple fact that isc_msix_bar is not 0 does not mean we * we have a good value there that is known to work. */ msix = iflib_msix_init(ctx); else { scctx->isc_vectors = 1; scctx->isc_ntxqsets = 1; scctx->isc_nrxqsets = 1; scctx->isc_intr = IFLIB_INTR_LEGACY; msix = 0; } /* Get memory for the station queues */ if ((err = iflib_queues_alloc(ctx))) { device_printf(dev, "Unable to allocate queue memory\n"); goto fail; } if ((err = iflib_qset_structures_setup(ctx))) { device_printf(dev, "qset structure setup failed %d\n", err); goto fail_queues; } /* * Group taskqueues aren't properly set up until SMP is started, * so we disable interrupts until we can handle them post * SI_SUB_SMP. * * XXX: disabling interrupts doesn't actually work, at least for * the non-MSI case. When they occur before SI_SUB_SMP completes, * we do null handling and depend on this not causing too large an * interrupt storm. */ IFDI_INTR_DISABLE(ctx); if (msix > 1 && (err = IFDI_MSIX_INTR_ASSIGN(ctx, msix)) != 0) { device_printf(dev, "IFDI_MSIX_INTR_ASSIGN failed %d\n", err); goto fail_intr_free; } if (msix <= 1) { rid = 0; if (scctx->isc_intr == IFLIB_INTR_MSI) { MPASS(msix == 1); rid = 1; } if ((err = iflib_legacy_setup(ctx, ctx->isc_legacy_intr, ctx->ifc_softc, &rid, "irq0")) != 0) { device_printf(dev, "iflib_legacy_setup failed %d\n", err); goto fail_intr_free; } } ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac); if ((err = IFDI_ATTACH_POST(ctx)) != 0) { device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err); goto fail_detach; } if ((err = iflib_netmap_attach(ctx))) { device_printf(ctx->ifc_dev, "netmap attach failed: %d\n", err); goto fail_detach; } *ctxp = ctx; if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter); iflib_add_device_sysctl_post(ctx); ctx->ifc_flags |= IFC_INIT_DONE; return (0); fail_detach: ether_ifdetach(ctx->ifc_ifp); fail_intr_free: if (scctx->isc_intr == IFLIB_INTR_MSIX || scctx->isc_intr == IFLIB_INTR_MSI) pci_release_msi(ctx->ifc_dev); fail_queues: /* XXX free queues */ fail: IFDI_DETACH(ctx); return (err); } int iflib_device_attach(device_t dev) { if_ctx_t ctx; if_shared_ctx_t sctx; if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC) return (ENOTSUP); pci_enable_busmaster(dev); return (iflib_device_register(dev, NULL, sctx, &ctx)); } int iflib_device_deregister(if_ctx_t ctx) { if_t ifp = ctx->ifc_ifp; iflib_txq_t txq; iflib_rxq_t rxq; device_t dev = ctx->ifc_dev; int i; struct taskqgroup *tqg; /* Make sure VLANS are not using driver */ if (if_vlantrunkinuse(ifp)) { device_printf(dev,"Vlan in use, detach first\n"); return (EBUSY); } CTX_LOCK(ctx); ctx->ifc_in_detach = 1; iflib_stop(ctx); CTX_UNLOCK(ctx); /* Unregister VLAN events */ if (ctx->ifc_vlan_attach_event != NULL) EVENTHANDLER_DEREGISTER(vlan_config, ctx->ifc_vlan_attach_event); if (ctx->ifc_vlan_detach_event != NULL) EVENTHANDLER_DEREGISTER(vlan_unconfig, ctx->ifc_vlan_detach_event); iflib_netmap_detach(ifp); ether_ifdetach(ifp); /* ether_ifdetach calls if_qflush - lock must be destroy afterwards*/ CTX_LOCK_DESTROY(ctx); if (ctx->ifc_led_dev != NULL) led_destroy(ctx->ifc_led_dev); /* XXX drain any dependent tasks */ tqg = qgroup_if_io_tqg; for (txq = ctx->ifc_txqs, i = 0; i < NTXQSETS(ctx); i++, txq++) { callout_drain(&txq->ift_timer); callout_drain(&txq->ift_db_check); if (txq->ift_task.gt_uniq != NULL) taskqgroup_detach(tqg, &txq->ift_task); } for (i = 0, rxq = ctx->ifc_rxqs; i < 
NRXQSETS(ctx); i++, rxq++) { if (rxq->ifr_task.gt_uniq != NULL) taskqgroup_detach(tqg, &rxq->ifr_task); } tqg = qgroup_if_config_tqg; if (ctx->ifc_admin_task.gt_uniq != NULL) taskqgroup_detach(tqg, &ctx->ifc_admin_task); if (ctx->ifc_vflr_task.gt_uniq != NULL) taskqgroup_detach(tqg, &ctx->ifc_vflr_task); IFDI_DETACH(ctx); device_set_softc(ctx->ifc_dev, NULL); if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_LEGACY) { pci_release_msi(dev); } if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_MSIX) { iflib_irq_free(ctx, &ctx->ifc_legacy_irq); } if (ctx->ifc_msix_mem != NULL) { bus_release_resource(ctx->ifc_dev, SYS_RES_MEMORY, ctx->ifc_softc_ctx.isc_msix_bar, ctx->ifc_msix_mem); ctx->ifc_msix_mem = NULL; } bus_generic_detach(dev); if_free(ifp); iflib_tx_structures_free(ctx); iflib_rx_structures_free(ctx); if (ctx->ifc_flags & IFC_SC_ALLOCATED) free(ctx->ifc_softc, M_IFLIB); free(ctx, M_IFLIB); return (0); } int iflib_device_detach(device_t dev) { if_ctx_t ctx = device_get_softc(dev); return (iflib_device_deregister(ctx)); } int iflib_device_suspend(device_t dev) { if_ctx_t ctx = device_get_softc(dev); CTX_LOCK(ctx); IFDI_SUSPEND(ctx); CTX_UNLOCK(ctx); return bus_generic_suspend(dev); } int iflib_device_shutdown(device_t dev) { if_ctx_t ctx = device_get_softc(dev); CTX_LOCK(ctx); IFDI_SHUTDOWN(ctx); CTX_UNLOCK(ctx); return bus_generic_suspend(dev); } int iflib_device_resume(device_t dev) { if_ctx_t ctx = device_get_softc(dev); iflib_txq_t txq = ctx->ifc_txqs; CTX_LOCK(ctx); IFDI_RESUME(ctx); iflib_init_locked(ctx); CTX_UNLOCK(ctx); for (int i = 0; i < NTXQSETS(ctx); i++, txq++) iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET); return (bus_generic_resume(dev)); } int iflib_device_iov_init(device_t dev, uint16_t num_vfs, const nvlist_t *params) { int error; if_ctx_t ctx = device_get_softc(dev); CTX_LOCK(ctx); error = IFDI_IOV_INIT(ctx, num_vfs, params); CTX_UNLOCK(ctx); return (error); } void iflib_device_iov_uninit(device_t dev) { if_ctx_t ctx = device_get_softc(dev); CTX_LOCK(ctx); IFDI_IOV_UNINIT(ctx); CTX_UNLOCK(ctx); } int iflib_device_iov_add_vf(device_t dev, uint16_t vfnum, const nvlist_t *params) { int error; if_ctx_t ctx = device_get_softc(dev); CTX_LOCK(ctx); error = IFDI_IOV_VF_ADD(ctx, vfnum, params); CTX_UNLOCK(ctx); return (error); } /********************************************************************* * * MODULE FUNCTION DEFINITIONS * **********************************************************************/ /* * - Start a fast taskqueue thread for each core * - Start a taskqueue for control operations */ static int iflib_module_init(void) { return (0); } static int iflib_module_event_handler(module_t mod, int what, void *arg) { int err; switch (what) { case MOD_LOAD: if ((err = iflib_module_init()) != 0) return (err); break; case MOD_UNLOAD: return (EBUSY); default: return (EOPNOTSUPP); } return (0); } /********************************************************************* * * PUBLIC FUNCTION DEFINITIONS * ordered as in iflib.h * **********************************************************************/ static void _iflib_assert(if_shared_ctx_t sctx) { MPASS(sctx->isc_tx_maxsize); MPASS(sctx->isc_tx_maxsegsize); MPASS(sctx->isc_rx_maxsize); MPASS(sctx->isc_rx_nsegments); MPASS(sctx->isc_rx_maxsegsize); MPASS(sctx->isc_nrxd_min[0]); MPASS(sctx->isc_nrxd_max[0]); MPASS(sctx->isc_nrxd_default[0]); MPASS(sctx->isc_ntxd_min[0]); MPASS(sctx->isc_ntxd_max[0]); MPASS(sctx->isc_ntxd_default[0]); } static void _iflib_pre_assert(if_softc_ctx_t scctx) { MPASS(scctx->isc_txrx->ift_txd_encap); 
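/*
 * Illustrative sketch (added for exposition; not part of this change):
 * the MPASSes in this function document the tx/rx method table a
 * driver must publish through scctx->isc_txrx before attach completes;
 * iflib copies it into ifc_txrx in iflib_device_register() above.  A
 * driver-side initializer would look roughly like the following, where
 * the foo_* callback names are hypothetical.
 */
#if 0	/* example only, fenced off from compilation */
static struct if_txrx foo_txrx = {
	.ift_txd_encap = foo_isc_txd_encap,
	.ift_txd_flush = foo_isc_txd_flush,
	.ift_txd_credits_update = foo_isc_txd_credits_update,
	.ift_rxd_available = foo_isc_rxd_available,
	.ift_rxd_pkt_get = foo_isc_rxd_pkt_get,
	.ift_rxd_refill = foo_isc_rxd_refill,
	.ift_rxd_flush = foo_isc_rxd_flush,
};
/* ...and in the driver's attach-pre path: scctx->isc_txrx = &foo_txrx; */
#endif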
MPASS(scctx->isc_txrx->ift_txd_flush); MPASS(scctx->isc_txrx->ift_txd_credits_update); MPASS(scctx->isc_txrx->ift_rxd_available); MPASS(scctx->isc_txrx->ift_rxd_pkt_get); MPASS(scctx->isc_txrx->ift_rxd_refill); MPASS(scctx->isc_txrx->ift_rxd_flush); } static int iflib_register(if_ctx_t ctx) { if_shared_ctx_t sctx = ctx->ifc_sctx; driver_t *driver = sctx->isc_driver; device_t dev = ctx->ifc_dev; if_t ifp; _iflib_assert(sctx); CTX_LOCK_INIT(ctx, device_get_nameunit(ctx->ifc_dev)); ifp = ctx->ifc_ifp = if_gethandle(IFT_ETHER); if (ifp == NULL) { device_printf(dev, "can not allocate ifnet structure\n"); return (ENOMEM); } /* * Initialize our context's device specific methods */ kobj_init((kobj_t) ctx, (kobj_class_t) driver); kobj_class_compile((kobj_class_t) driver); driver->refs++; if_initname(ifp, device_get_name(dev), device_get_unit(dev)); if_setsoftc(ifp, ctx); if_setdev(ifp, dev); if_setinitfn(ifp, iflib_if_init); if_setioctlfn(ifp, iflib_if_ioctl); if_settransmitfn(ifp, iflib_if_transmit); if_setqflushfn(ifp, iflib_if_qflush); if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); ctx->ifc_vlan_attach_event = EVENTHANDLER_REGISTER(vlan_config, iflib_vlan_register, ctx, EVENTHANDLER_PRI_FIRST); ctx->ifc_vlan_detach_event = EVENTHANDLER_REGISTER(vlan_unconfig, iflib_vlan_unregister, ctx, EVENTHANDLER_PRI_FIRST); ifmedia_init(&ctx->ifc_media, IFM_IMASK, iflib_media_change, iflib_media_status); return (0); } static int iflib_queues_alloc(if_ctx_t ctx) { if_shared_ctx_t sctx = ctx->ifc_sctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; device_t dev = ctx->ifc_dev; int nrxqsets = scctx->isc_nrxqsets; int ntxqsets = scctx->isc_ntxqsets; iflib_txq_t txq; iflib_rxq_t rxq; iflib_fl_t fl = NULL; int i, j, cpu, err, txconf, rxconf; iflib_dma_info_t ifdip; uint32_t *rxqsizes = scctx->isc_rxqsizes; uint32_t *txqsizes = scctx->isc_txqsizes; uint8_t nrxqs = sctx->isc_nrxqs; uint8_t ntxqs = sctx->isc_ntxqs; int nfree_lists = sctx->isc_nfl ? 
sctx->isc_nfl : 1; caddr_t *vaddrs; uint64_t *paddrs; struct ifmp_ring **brscp; int nbuf_rings = 1; /* XXX determine dynamically */ KASSERT(ntxqs > 0, ("number of queues per qset must be at least 1")); KASSERT(nrxqs > 0, ("number of queues per qset must be at least 1")); brscp = NULL; txq = NULL; rxq = NULL; /* Allocate the TX ring struct memory */ if (!(txq = (iflib_txq_t) malloc(sizeof(struct iflib_txq) * ntxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate TX ring memory\n"); err = ENOMEM; goto fail; } /* Now allocate the RX */ if (!(rxq = (iflib_rxq_t) malloc(sizeof(struct iflib_rxq) * nrxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate RX ring memory\n"); err = ENOMEM; goto rx_fail; } if (!(brscp = malloc(sizeof(void *) * nbuf_rings * nrxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to buf_ring_sc * memory\n"); err = ENOMEM; goto rx_fail; } ctx->ifc_txqs = txq; ctx->ifc_rxqs = rxq; /* * XXX handle allocation failure */ for (txconf = i = 0, cpu = CPU_FIRST(); i < ntxqsets; i++, txconf++, txq++, cpu = CPU_NEXT(cpu)) { /* Set up some basics */ if ((ifdip = malloc(sizeof(struct iflib_dma_info) * ntxqs, M_IFLIB, M_WAITOK|M_ZERO)) == NULL) { device_printf(dev, "failed to allocate iflib_dma_info\n"); err = ENOMEM; goto err_tx_desc; } txq->ift_ifdi = ifdip; for (j = 0; j < ntxqs; j++, ifdip++) { if (iflib_dma_alloc(ctx, txqsizes[j], ifdip, BUS_DMA_NOWAIT)) { device_printf(dev, "Unable to allocate Descriptor memory\n"); err = ENOMEM; goto err_tx_desc; } bzero((void *)ifdip->idi_vaddr, txqsizes[j]); } txq->ift_ctx = ctx; txq->ift_id = i; if (sctx->isc_flags & IFLIB_HAS_TXCQ) { txq->ift_br_offset = 1; } else { txq->ift_br_offset = 0; } /* XXX fix this */ txq->ift_timer.c_cpu = cpu; txq->ift_db_check.c_cpu = cpu; txq->ift_nbr = nbuf_rings; if (iflib_txsd_alloc(txq)) { device_printf(dev, "Critical Failure setting up TX buffers\n"); err = ENOMEM; goto err_tx_desc; } /* Initialize the TX lock */ snprintf(txq->ift_mtx_name, MTX_NAME_LEN, "%s:tx(%d):callout", device_get_nameunit(dev), txq->ift_id); mtx_init(&txq->ift_mtx, txq->ift_mtx_name, NULL, MTX_DEF); callout_init_mtx(&txq->ift_timer, &txq->ift_mtx, 0); callout_init_mtx(&txq->ift_db_check, &txq->ift_mtx, 0); snprintf(txq->ift_db_mtx_name, MTX_NAME_LEN, "%s:tx(%d):db", device_get_nameunit(dev), txq->ift_id); TXDB_LOCK_INIT(txq); txq->ift_br = brscp + i*nbuf_rings; for (j = 0; j < nbuf_rings; j++) { err = ifmp_ring_alloc(&txq->ift_br[j], 2048, txq, iflib_txq_drain, iflib_txq_can_drain, M_IFLIB, M_WAITOK); if (err) { /* XXX free any allocated rings */ device_printf(dev, "Unable to allocate buf_ring\n"); goto err_tx_desc; } } } for (rxconf = i = 0; i < nrxqsets; i++, rxconf++, rxq++) { /* Set up some basics */ if ((ifdip = malloc(sizeof(struct iflib_dma_info) * nrxqs, M_IFLIB, M_WAITOK|M_ZERO)) == NULL) { device_printf(dev, "failed to allocate iflib_dma_info\n"); err = ENOMEM; goto err_tx_desc; } rxq->ifr_ifdi = ifdip; for (j = 0; j < nrxqs; j++, ifdip++) { if (iflib_dma_alloc(ctx, rxqsizes[j], ifdip, BUS_DMA_NOWAIT)) { device_printf(dev, "Unable to allocate Descriptor memory\n"); err = ENOMEM; goto err_tx_desc; } bzero((void *)ifdip->idi_vaddr, rxqsizes[j]); } rxq->ifr_ctx = ctx; rxq->ifr_id = i; if (sctx->isc_flags & IFLIB_HAS_RXCQ) { rxq->ifr_fl_offset = 1; } else { rxq->ifr_fl_offset = 0; } rxq->ifr_nfl = nfree_lists; if (!(fl = (iflib_fl_t) malloc(sizeof(struct iflib_fl) * nfree_lists, M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate free list 
memory\n"); err = ENOMEM; goto err_tx_desc; } rxq->ifr_fl = fl; for (j = 0; j < nfree_lists; j++) { rxq->ifr_fl[j].ifl_rxq = rxq; rxq->ifr_fl[j].ifl_id = j; rxq->ifr_fl[j].ifl_ifdi = &rxq->ifr_ifdi[j + rxq->ifr_fl_offset]; } /* Allocate receive buffers for the ring*/ if (iflib_rxsd_alloc(rxq)) { device_printf(dev, "Critical Failure setting up receive buffers\n"); err = ENOMEM; goto err_rx_desc; } } /* TXQs */ vaddrs = malloc(sizeof(caddr_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK); paddrs = malloc(sizeof(uint64_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK); for (i = 0; i < ntxqsets; i++) { iflib_dma_info_t di = ctx->ifc_txqs[i].ift_ifdi; for (j = 0; j < ntxqs; j++, di++) { vaddrs[i*ntxqs + j] = di->idi_vaddr; paddrs[i*ntxqs + j] = di->idi_paddr; } } if ((err = IFDI_TX_QUEUES_ALLOC(ctx, vaddrs, paddrs, ntxqs, ntxqsets)) != 0) { device_printf(ctx->ifc_dev, "device queue allocation failed\n"); iflib_tx_structures_free(ctx); free(vaddrs, M_IFLIB); free(paddrs, M_IFLIB); goto err_rx_desc; } free(vaddrs, M_IFLIB); free(paddrs, M_IFLIB); /* RXQs */ vaddrs = malloc(sizeof(caddr_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK); paddrs = malloc(sizeof(uint64_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK); for (i = 0; i < nrxqsets; i++) { iflib_dma_info_t di = ctx->ifc_rxqs[i].ifr_ifdi; for (j = 0; j < nrxqs; j++, di++) { vaddrs[i*nrxqs + j] = di->idi_vaddr; paddrs[i*nrxqs + j] = di->idi_paddr; } } if ((err = IFDI_RX_QUEUES_ALLOC(ctx, vaddrs, paddrs, nrxqs, nrxqsets)) != 0) { device_printf(ctx->ifc_dev, "device queue allocation failed\n"); iflib_tx_structures_free(ctx); free(vaddrs, M_IFLIB); free(paddrs, M_IFLIB); goto err_rx_desc; } free(vaddrs, M_IFLIB); free(paddrs, M_IFLIB); return (0); /* XXX handle allocation failure changes */ err_rx_desc: err_tx_desc: if (ctx->ifc_rxqs != NULL) free(ctx->ifc_rxqs, M_IFLIB); ctx->ifc_rxqs = NULL; if (ctx->ifc_txqs != NULL) free(ctx->ifc_txqs, M_IFLIB); ctx->ifc_txqs = NULL; rx_fail: if (brscp != NULL) free(brscp, M_IFLIB); if (rxq != NULL) free(rxq, M_IFLIB); if (txq != NULL) free(txq, M_IFLIB); fail: return (err); } static int iflib_tx_structures_setup(if_ctx_t ctx) { iflib_txq_t txq = ctx->ifc_txqs; int i; for (i = 0; i < NTXQSETS(ctx); i++, txq++) iflib_txq_setup(txq); return (0); } static void iflib_tx_structures_free(if_ctx_t ctx) { iflib_txq_t txq = ctx->ifc_txqs; int i, j; for (i = 0; i < NTXQSETS(ctx); i++, txq++) { iflib_txq_destroy(txq); for (j = 0; j < ctx->ifc_nhwtxqs; j++) iflib_dma_free(&txq->ift_ifdi[j]); } free(ctx->ifc_txqs, M_IFLIB); ctx->ifc_txqs = NULL; IFDI_QUEUES_FREE(ctx); } /********************************************************************* * * Initialize all receive rings. * **********************************************************************/ static int iflib_rx_structures_setup(if_ctx_t ctx) { iflib_rxq_t rxq = ctx->ifc_rxqs; int q; #if defined(INET6) || defined(INET) int i, err; #endif for (q = 0; q < ctx->ifc_softc_ctx.isc_nrxqsets; q++, rxq++) { #if defined(INET6) || defined(INET) tcp_lro_free(&rxq->ifr_lc); if ((err = tcp_lro_init_args(&rxq->ifr_lc, ctx->ifc_ifp, TCP_LRO_ENTRIES, min(1024, ctx->ifc_softc_ctx.isc_nrxd[rxq->ifr_fl_offset]))) != 0) { device_printf(ctx->ifc_dev, "LRO Initialization failed!\n"); goto fail; } rxq->ifr_lro_enabled = TRUE; #endif IFDI_RXQ_SETUP(ctx, rxq->ifr_id); } return (0); #if defined(INET6) || defined(INET) fail: /* * Free RX software descriptors allocated so far, we will only handle * the rings that completed, the failing case will have * cleaned up for itself. 'q' failed, so its the terminus. 
*/ rxq = ctx->ifc_rxqs; for (i = 0; i < q; ++i, rxq++) { iflib_rx_sds_free(rxq); rxq->ifr_cq_gen = rxq->ifr_cq_cidx = rxq->ifr_cq_pidx = 0; } return (err); #endif } /********************************************************************* * * Free all receive rings. * **********************************************************************/ static void iflib_rx_structures_free(if_ctx_t ctx) { iflib_rxq_t rxq = ctx->ifc_rxqs; for (int i = 0; i < ctx->ifc_softc_ctx.isc_nrxqsets; i++, rxq++) { iflib_rx_sds_free(rxq); } } static int iflib_qset_structures_setup(if_ctx_t ctx) { int err; if ((err = iflib_tx_structures_setup(ctx)) != 0) return (err); if ((err = iflib_rx_structures_setup(ctx)) != 0) { device_printf(ctx->ifc_dev, "iflib_rx_structures_setup failed: %d\n", err); iflib_tx_structures_free(ctx); iflib_rx_structures_free(ctx); } return (err); } int iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid, driver_filter_t filter, void *filter_arg, driver_intr_t handler, void *arg, char *name) { return (_iflib_irq_alloc(ctx, irq, rid, filter, handler, arg, name)); } static int find_nth(if_ctx_t ctx, cpuset_t *cpus, int qid) { int i, cpuid, eqid, count; CPU_COPY(&ctx->ifc_cpus, cpus); count = CPU_COUNT(&ctx->ifc_cpus); eqid = qid % count; /* clear up to the qid'th bit */ for (i = 0; i < eqid; i++) { cpuid = CPU_FFS(cpus); MPASS(cpuid != 0); CPU_CLR(cpuid-1, cpus); } cpuid = CPU_FFS(cpus); MPASS(cpuid != 0); return (cpuid-1); } int iflib_irq_alloc_generic(if_ctx_t ctx, if_irq_t irq, int rid, iflib_intr_type_t type, driver_filter_t *filter, void *filter_arg, int qid, char *name) { struct grouptask *gtask; struct taskqgroup *tqg; iflib_filter_info_t info; cpuset_t cpus; gtask_fn_t *fn; int tqrid, err, cpuid; void *q; info = &ctx->ifc_filter_info; tqrid = rid; switch (type) { /* XXX merge tx/rx for netmap? 
*/ case IFLIB_INTR_TX: q = &ctx->ifc_txqs[qid]; info = &ctx->ifc_txqs[qid].ift_filter_info; gtask = &ctx->ifc_txqs[qid].ift_task; tqg = qgroup_if_io_tqg; fn = _task_fn_tx; GROUPTASK_INIT(gtask, 0, fn, q); break; case IFLIB_INTR_RX: q = &ctx->ifc_rxqs[qid]; info = &ctx->ifc_rxqs[qid].ifr_filter_info; gtask = &ctx->ifc_rxqs[qid].ifr_task; tqg = qgroup_if_io_tqg; fn = _task_fn_rx; GROUPTASK_INIT(gtask, 0, fn, q); break; case IFLIB_INTR_ADMIN: q = ctx; tqrid = -1; info = &ctx->ifc_filter_info; gtask = &ctx->ifc_admin_task; tqg = qgroup_if_config_tqg; fn = _task_fn_admin; break; default: panic("unknown net intr type"); } info->ifi_filter = filter; info->ifi_filter_arg = filter_arg; info->ifi_task = gtask; info->ifi_ctx = ctx; err = _iflib_irq_alloc(ctx, irq, rid, iflib_fast_intr, NULL, info, name); if (err != 0) { device_printf(ctx->ifc_dev, "_iflib_irq_alloc failed %d\n", err); return (err); } if (type == IFLIB_INTR_ADMIN) return (0); if (tqrid != -1) { cpuid = find_nth(ctx, &cpus, qid); taskqgroup_attach_cpu(tqg, gtask, q, cpuid, irq->ii_rid, name); } else { taskqgroup_attach(tqg, gtask, q, tqrid, name); } return (0); } void iflib_softirq_alloc_generic(if_ctx_t ctx, int rid, iflib_intr_type_t type, void *arg, int qid, char *name) { struct grouptask *gtask; struct taskqgroup *tqg; gtask_fn_t *fn; void *q; switch (type) { case IFLIB_INTR_TX: q = &ctx->ifc_txqs[qid]; gtask = &ctx->ifc_txqs[qid].ift_task; tqg = qgroup_if_io_tqg; fn = _task_fn_tx; break; case IFLIB_INTR_RX: q = &ctx->ifc_rxqs[qid]; gtask = &ctx->ifc_rxqs[qid].ifr_task; tqg = qgroup_if_io_tqg; fn = _task_fn_rx; break; case IFLIB_INTR_IOV: q = ctx; gtask = &ctx->ifc_vflr_task; tqg = qgroup_if_config_tqg; rid = -1; fn = _task_fn_iov; break; default: panic("unknown net intr type"); } GROUPTASK_INIT(gtask, 0, fn, q); taskqgroup_attach(tqg, gtask, q, rid, name); } void iflib_irq_free(if_ctx_t ctx, if_irq_t irq) { if (irq->ii_tag) bus_teardown_intr(ctx->ifc_dev, irq->ii_res, irq->ii_tag); if (irq->ii_res) bus_release_resource(ctx->ifc_dev, SYS_RES_IRQ, irq->ii_rid, irq->ii_res); } static int iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filter_arg, int *rid, char *name) { iflib_txq_t txq = ctx->ifc_txqs; iflib_rxq_t rxq = ctx->ifc_rxqs; if_irq_t irq = &ctx->ifc_legacy_irq; iflib_filter_info_t info; struct grouptask *gtask; struct taskqgroup *tqg; gtask_fn_t *fn; int tqrid; void *q; int err; q = &ctx->ifc_rxqs[0]; info = &rxq[0].ifr_filter_info; gtask = &rxq[0].ifr_task; tqg = qgroup_if_io_tqg; tqrid = irq->ii_rid = *rid; fn = _task_fn_rx; ctx->ifc_flags |= IFC_LEGACY; info->ifi_filter = filter; info->ifi_filter_arg = filter_arg; info->ifi_task = gtask; info->ifi_ctx = ctx; /* We allocate a single interrupt resource */ if ((err = _iflib_irq_alloc(ctx, irq, tqrid, iflib_fast_intr, NULL, info, name)) != 0) return (err); GROUPTASK_INIT(gtask, 0, fn, q); taskqgroup_attach(tqg, gtask, q, tqrid, name); GROUPTASK_INIT(&txq->ift_task, 0, _task_fn_tx, txq); taskqgroup_attach(qgroup_if_io_tqg, &txq->ift_task, txq, tqrid, "tx"); return (0); } void iflib_led_create(if_ctx_t ctx) { ctx->ifc_led_dev = led_create(iflib_led_func, ctx, device_get_nameunit(ctx->ifc_dev)); } void iflib_tx_intr_deferred(if_ctx_t ctx, int txqid) { GROUPTASK_ENQUEUE(&ctx->ifc_txqs[txqid].ift_task); } void iflib_rx_intr_deferred(if_ctx_t ctx, int rxqid) { GROUPTASK_ENQUEUE(&ctx->ifc_rxqs[rxqid].ifr_task); } void iflib_admin_intr_deferred(if_ctx_t ctx) { #ifdef INVARIANTS struct grouptask *gtask; gtask = &ctx->ifc_admin_task; MPASS(gtask->gt_taskqueue != NULL); 
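/*
 * Illustrative sketch (added for exposition; not part of this change):
 * find_nth() above returns the CPU id of the qid'th set bit in the
 * interrupt cpuset, wrapping by its population count, and
 * iflib_irq_alloc_generic() uses that to spread queue vectors
 * round-robin across CPUs via taskqgroup_attach_cpu().  The standalone
 * pick_queue_cpu() below restates that walk; the name is hypothetical.
 */
#if 0	/* example only, fenced off from compilation */
static int
pick_queue_cpu(cpuset_t *intr_cpus, int qid)
{
	cpuset_t scratch;
	int bit, i;

	CPU_COPY(intr_cpus, &scratch);
	/* Wrap the queue id by the number of usable CPUs. */
	qid %= CPU_COUNT(intr_cpus);
	/* Discard the first qid set bits... */
	for (i = 0; i < qid; i++) {
		bit = CPU_FFS(&scratch);	/* 1-based; 0 means empty */
		CPU_CLR(bit - 1, &scratch);
	}
	/* ...the next remaining set bit is the CPU for this queue. */
	return (CPU_FFS(&scratch) - 1);
}
#endif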
#endif GROUPTASK_ENQUEUE(&ctx->ifc_admin_task); } void iflib_iov_intr_deferred(if_ctx_t ctx) { GROUPTASK_ENQUEUE(&ctx->ifc_vflr_task); } void iflib_io_tqg_attach(struct grouptask *gt, void *uniq, int cpu, char *name) { taskqgroup_attach_cpu(qgroup_if_io_tqg, gt, uniq, cpu, -1, name); } void iflib_config_gtask_init(if_ctx_t ctx, struct grouptask *gtask, gtask_fn_t *fn, char *name) { GROUPTASK_INIT(gtask, 0, fn, ctx); taskqgroup_attach(qgroup_if_config_tqg, gtask, gtask, -1, name); } void iflib_config_gtask_deinit(struct grouptask *gtask) { taskqgroup_detach(qgroup_if_config_tqg, gtask); } void iflib_link_state_change(if_ctx_t ctx, int link_state, uint64_t baudrate) { if_t ifp = ctx->ifc_ifp; iflib_txq_t txq = ctx->ifc_txqs; if_setbaudrate(ifp, baudrate); /* If link down, disable watchdog */ if ((ctx->ifc_link_state == LINK_STATE_UP) && (link_state == LINK_STATE_DOWN)) { for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxqsets; i++, txq++) txq->ift_qstatus = IFLIB_QUEUE_IDLE; } ctx->ifc_link_state = link_state; if_link_state_change(ifp, link_state); } static int iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq) { int credits; #ifdef INVARIANTS int credits_pre = txq->ift_cidx_processed; #endif if (ctx->isc_txd_credits_update == NULL) return (0); if ((credits = ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, txq->ift_cidx_processed, true)) == 0) return (0); txq->ift_processed += credits; txq->ift_cidx_processed += credits; MPASS(credits_pre + credits == txq->ift_cidx_processed); if (txq->ift_cidx_processed >= txq->ift_size) txq->ift_cidx_processed -= txq->ift_size; return (credits); } static int iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, int cidx, int budget) { return (ctx->isc_rxd_available(ctx->ifc_softc, rxq->ifr_id, cidx, budget)); } void iflib_add_int_delay_sysctl(if_ctx_t ctx, const char *name, const char *description, if_int_delay_info_t info, int offset, int value) { info->iidi_ctx = ctx; info->iidi_offset = offset; info->iidi_value = value; SYSCTL_ADD_PROC(device_get_sysctl_ctx(ctx->ifc_dev), SYSCTL_CHILDREN(device_get_sysctl_tree(ctx->ifc_dev)), OID_AUTO, name, CTLTYPE_INT|CTLFLAG_RW, info, 0, iflib_sysctl_int_delay, "I", description); } struct mtx * iflib_ctx_lock_get(if_ctx_t ctx) { return (&ctx->ifc_mtx); } static int iflib_msix_init(if_ctx_t ctx) { device_t dev = ctx->ifc_dev; if_shared_ctx_t sctx = ctx->ifc_sctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; int vectors, queues, rx_queues, tx_queues, queuemsgs, msgs; int iflib_num_tx_queues, iflib_num_rx_queues; int err, admincnt, bar; iflib_num_tx_queues = scctx->isc_ntxqsets; iflib_num_rx_queues = scctx->isc_nrxqsets; device_printf(dev, "msix_init qsets capped at %d\n", iflib_num_tx_queues); bar = ctx->ifc_softc_ctx.isc_msix_bar; admincnt = sctx->isc_admin_intrcnt; /* Override by tuneable */ if (enable_msix == 0) goto msi; /* ** When used in a virtualized environment ** PCI BUSMASTER capability may not be set ** so explicity set it here and rewrite ** the ENABLE in the MSIX control register ** at this point to cause the host to ** successfully initialize us. 
*/ { int msix_ctrl, rid; pci_enable_busmaster(dev); rid = 0; if (pci_find_cap(dev, PCIY_MSIX, &rid) == 0 && rid != 0) { rid += PCIR_MSIX_CTRL; msix_ctrl = pci_read_config(dev, rid, 2); msix_ctrl |= PCIM_MSIXCTRL_MSIX_ENABLE; pci_write_config(dev, rid, msix_ctrl, 2); } else { device_printf(dev, "PCIY_MSIX capability not found; " "or rid %d == 0.\n", rid); goto msi; } } /* * bar == -1 => "trust me I know what I'm doing" * https://www.youtube.com/watch?v=nnwWKkNau4I * Some drivers are for hardware that is so shoddily * documented that no one knows which bars are which * so the developer has to map all bars. This hack * allows shoddy garbage to use msix in this framework. */ if (bar != -1) { ctx->ifc_msix_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &bar, RF_ACTIVE); if (ctx->ifc_msix_mem == NULL) { /* May not be enabled */ device_printf(dev, "Unable to map MSIX table \n"); goto msi; } } /* First try MSI/X */ if ((msgs = pci_msix_count(dev)) == 0) { /* system has msix disabled */ device_printf(dev, "System has MSIX disabled \n"); bus_release_resource(dev, SYS_RES_MEMORY, bar, ctx->ifc_msix_mem); ctx->ifc_msix_mem = NULL; goto msi; } #if IFLIB_DEBUG /* use only 1 qset in debug mode */ queuemsgs = min(msgs - admincnt, 1); #else queuemsgs = msgs - admincnt; #endif if (bus_get_cpus(dev, INTR_CPUS, sizeof(ctx->ifc_cpus), &ctx->ifc_cpus) == 0) { #ifdef RSS queues = imin(queuemsgs, rss_getnumbuckets()); #else queues = queuemsgs; #endif queues = imin(CPU_COUNT(&ctx->ifc_cpus), queues); device_printf(dev, "pxm cpus: %d queue msgs: %d admincnt: %d\n", CPU_COUNT(&ctx->ifc_cpus), queuemsgs, admincnt); } else { device_printf(dev, "Unable to fetch CPU list\n"); /* Figure out a reasonable auto config value */ queues = min(queuemsgs, mp_ncpus); } #ifdef RSS /* If we're doing RSS, clamp at the number of RSS buckets */ if (queues > rss_getnumbuckets()) queues = rss_getnumbuckets(); #endif if (iflib_num_rx_queues > 0 && iflib_num_rx_queues < queuemsgs - admincnt) rx_queues = iflib_num_rx_queues; else rx_queues = queues; /* * We want this to be all logical CPUs by default */ if (iflib_num_tx_queues > 0 && iflib_num_tx_queues < queues) tx_queues = iflib_num_tx_queues; else tx_queues = mp_ncpus; if (ctx->ifc_sysctl_qs_eq_override == 0) { #ifdef INVARIANTS if (tx_queues != rx_queues) device_printf(dev, "queue equality override not set, capping rx_queues at %d and tx_queues at %d\n", min(rx_queues, tx_queues), min(rx_queues, tx_queues)); #endif tx_queues = min(rx_queues, tx_queues); rx_queues = min(rx_queues, tx_queues); } device_printf(dev, "using %d rx queues %d tx queues \n", rx_queues, tx_queues); vectors = rx_queues + admincnt; if ((err = pci_alloc_msix(dev, &vectors)) == 0) { device_printf(dev, "Using MSIX interrupts with %d vectors\n", vectors); scctx->isc_vectors = vectors; scctx->isc_nrxqsets = rx_queues; scctx->isc_ntxqsets = tx_queues; scctx->isc_intr = IFLIB_INTR_MSIX; return (vectors); } else { device_printf(dev, "failed to allocate %d msix vectors, err: %d - using MSI\n", vectors, err); } msi: vectors = pci_msi_count(dev); scctx->isc_nrxqsets = 1; scctx->isc_ntxqsets = 1; scctx->isc_vectors = vectors; if (vectors == 1 && pci_alloc_msi(dev, &vectors) == 0) { device_printf(dev,"Using an MSI interrupt\n"); scctx->isc_intr = IFLIB_INTR_MSI; } else { device_printf(dev,"Using a Legacy interrupt\n"); scctx->isc_intr = IFLIB_INTR_LEGACY; } return (vectors); } char * ring_states[] = { "IDLE", "BUSY", "STALLED", "ABDICATED" }; static int mp_ring_state_handler(SYSCTL_HANDLER_ARGS) { int rc; uint16_t *state = 
((uint16_t *)oidp->oid_arg1); struct sbuf *sb; char *ring_state = "UNKNOWN"; /* XXX needed ? */ rc = sysctl_wire_old_buffer(req, 0); MPASS(rc == 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 80, req); MPASS(sb != NULL); if (sb == NULL) return (ENOMEM); if (state[3] <= 3) ring_state = ring_states[state[3]]; sbuf_printf(sb, "pidx_head: %04hd pidx_tail: %04hd cidx: %04hd state: %s", state[0], state[1], state[2], ring_state); rc = sbuf_finish(sb); sbuf_delete(sb); return(rc); } enum iflib_ndesc_handler { IFLIB_NTXD_HANDLER, IFLIB_NRXD_HANDLER, }; static int mp_ndesc_handler(SYSCTL_HANDLER_ARGS) { if_ctx_t ctx = (void *)arg1; enum iflib_ndesc_handler type = arg2; char buf[256] = {0}; uint16_t *ndesc; char *p, *next; int nqs, rc, i; MPASS(type == IFLIB_NTXD_HANDLER || type == IFLIB_NRXD_HANDLER); nqs = 8; switch(type) { case IFLIB_NTXD_HANDLER: ndesc = ctx->ifc_sysctl_ntxds; if (ctx->ifc_sctx) nqs = ctx->ifc_sctx->isc_ntxqs; break; case IFLIB_NRXD_HANDLER: ndesc = ctx->ifc_sysctl_nrxds; if (ctx->ifc_sctx) nqs = ctx->ifc_sctx->isc_nrxqs; break; } if (nqs == 0) nqs = 8; for (i=0; i<8; i++) { if (i >= nqs) break; if (i) strcat(buf, ","); sprintf(strchr(buf, 0), "%d", ndesc[i]); } rc = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (rc || req->newptr == NULL) return rc; for (i = 0, next = buf, p = strsep(&next, " ,"); i < 8 && p; i++, p = strsep(&next, " ,")) { ndesc[i] = strtoul(p, NULL, 10); } return(rc); } #define NAME_BUFLEN 32 static void iflib_add_device_sysctl_pre(if_ctx_t ctx) { device_t dev = iflib_get_dev(ctx); struct sysctl_oid_list *child, *oid_list; struct sysctl_ctx_list *ctx_list; struct sysctl_oid *node; ctx_list = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); ctx->ifc_sysctl_node = node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, "iflib", CTLFLAG_RD, NULL, "IFLIB fields"); oid_list = SYSCTL_CHILDREN(node); SYSCTL_ADD_STRING(ctx_list, oid_list, OID_AUTO, "driver_version", CTLFLAG_RD, ctx->ifc_sctx->isc_driver_version, 0, "driver version"); SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_ntxqs", CTLFLAG_RWTUN, &ctx->ifc_sysctl_ntxqs, 0, "# of txqs to use, 0 => use default #"); SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_nrxqs", CTLFLAG_RWTUN, &ctx->ifc_sysctl_nrxqs, 0, "# of rxqs to use, 0 => use default #"); SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_qs_enable", CTLFLAG_RWTUN, &ctx->ifc_sysctl_qs_eq_override, 0, "permit #txq != #rxq"); /* XXX change for per-queue sizes */ SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_ntxds", CTLTYPE_STRING|CTLFLAG_RWTUN, ctx, IFLIB_NTXD_HANDLER, mp_ndesc_handler, "A", "list of # of tx descriptors to use, 0 = use default #"); SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_nrxds", CTLTYPE_STRING|CTLFLAG_RWTUN, ctx, IFLIB_NRXD_HANDLER, mp_ndesc_handler, "A", "list of # of rx descriptors to use, 0 = use default #"); } static void iflib_add_device_sysctl_post(if_ctx_t ctx) { if_shared_ctx_t sctx = ctx->ifc_sctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; device_t dev = iflib_get_dev(ctx); struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx_list; iflib_fl_t fl; iflib_txq_t txq; iflib_rxq_t rxq; int i, j; char namebuf[NAME_BUFLEN]; char *qfmt; struct sysctl_oid *queue_node, *fl_node, *node; struct sysctl_oid_list *queue_list, *fl_list; ctx_list = device_get_sysctl_ctx(dev); node = ctx->ifc_sysctl_node; child = SYSCTL_CHILDREN(node); if (scctx->isc_ntxqsets > 100) qfmt = "txq%03d"; else if (scctx->isc_ntxqsets > 10) qfmt = "txq%02d"; else 
qfmt = "txq%d"; for (i = 0, txq = ctx->ifc_txqs; i < scctx->isc_ntxqsets; i++, txq++) { snprintf(namebuf, NAME_BUFLEN, qfmt, i); queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Queue Name"); queue_list = SYSCTL_CHILDREN(queue_node); #if MEMORY_LOGGING SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_dequeued", CTLFLAG_RD, &txq->ift_dequeued, "total mbufs freed"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_enqueued", CTLFLAG_RD, &txq->ift_enqueued, "total mbufs enqueued"); #endif SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag", CTLFLAG_RD, &txq->ift_mbuf_defrag, "# of times m_defrag was called"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "m_pullups", CTLFLAG_RD, &txq->ift_pullups, "# of times m_pullup was called"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag_failed", CTLFLAG_RD, &txq->ift_mbuf_defrag_failed, "# of times m_defrag failed"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "no_desc_avail", CTLFLAG_RD, &txq->ift_no_desc_avail, "# of times no descriptors were available"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "tx_map_failed", CTLFLAG_RD, &txq->ift_map_failed, "# of times dma map failed"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txd_encap_efbig", CTLFLAG_RD, &txq->ift_txd_encap_efbig, "# of times txd_encap returned EFBIG"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "no_tx_dma_setup", CTLFLAG_RD, &txq->ift_no_tx_dma_setup, "# of times map failed for other than EFBIG"); SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_pidx", CTLFLAG_RD, &txq->ift_pidx, 1, "Producer Index"); SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx", CTLFLAG_RD, &txq->ift_cidx, 1, "Consumer Index"); SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx_processed", CTLFLAG_RD, &txq->ift_cidx_processed, 1, "Consumer Index seen by credit update"); SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_in_use", CTLFLAG_RD, &txq->ift_in_use, 1, "descriptors in use"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_processed", CTLFLAG_RD, &txq->ift_processed, "descriptors procesed for clean"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_cleaned", CTLFLAG_RD, &txq->ift_cleaned, "total cleaned"); SYSCTL_ADD_PROC(ctx_list, queue_list, OID_AUTO, "ring_state", CTLTYPE_STRING | CTLFLAG_RD, __DEVOLATILE(uint64_t *, &txq->ift_br[0]->state), 0, mp_ring_state_handler, "A", "soft ring state"); SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_enqueues", CTLFLAG_RD, &txq->ift_br[0]->enqueues, "# of enqueues to the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_drops", CTLFLAG_RD, &txq->ift_br[0]->drops, "# of drops in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_starts", CTLFLAG_RD, &txq->ift_br[0]->starts, "# of normal consumer starts in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_stalls", CTLFLAG_RD, &txq->ift_br[0]->stalls, "# of consumer stalls in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_restarts", CTLFLAG_RD, &txq->ift_br[0]->restarts, "# of consumer restarts in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_abdications", CTLFLAG_RD, &txq->ift_br[0]->abdications, "# of consumer abdications in the mp_ring for this queue"); } if (scctx->isc_nrxqsets > 100) qfmt = "rxq%03d"; else if (scctx->isc_nrxqsets > 10) qfmt = "rxq%02d"; else qfmt = "rxq%d"; for (i = 0, rxq = 
ctx->ifc_rxqs; i < scctx->isc_nrxqsets; i++, rxq++) { snprintf(namebuf, NAME_BUFLEN, qfmt, i); queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Queue Name"); queue_list = SYSCTL_CHILDREN(queue_node); if (sctx->isc_flags & IFLIB_HAS_RXCQ) { SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "rxq_cq_pidx", CTLFLAG_RD, &rxq->ifr_cq_pidx, 1, "Producer Index"); SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "rxq_cq_cidx", CTLFLAG_RD, &rxq->ifr_cq_cidx, 1, "Consumer Index"); } for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) { snprintf(namebuf, NAME_BUFLEN, "rxq_fl%d", j); fl_node = SYSCTL_ADD_NODE(ctx_list, queue_list, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "freelist Name"); fl_list = SYSCTL_CHILDREN(fl_node); SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "pidx", CTLFLAG_RD, &fl->ifl_pidx, 1, "Producer Index"); SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "cidx", CTLFLAG_RD, &fl->ifl_cidx, 1, "Consumer Index"); SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "credits", CTLFLAG_RD, &fl->ifl_credits, 1, "credits available"); #if MEMORY_LOGGING SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_m_enqueued", CTLFLAG_RD, &fl->ifl_m_enqueued, "mbufs allocated"); SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_m_dequeued", CTLFLAG_RD, &fl->ifl_m_dequeued, "mbufs freed"); SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_cl_enqueued", CTLFLAG_RD, &fl->ifl_cl_enqueued, "clusters allocated"); SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_cl_dequeued", CTLFLAG_RD, &fl->ifl_cl_dequeued, "clusters freed"); #endif } } } Index: head/sys/netpfil/ipfw/ip_fw_sockopt.c =================================================================== --- head/sys/netpfil/ipfw/ip_fw_sockopt.c (revision 314067) +++ head/sys/netpfil/ipfw/ip_fw_sockopt.c (revision 314068) @@ -1,4613 +1,4613 @@ /*- * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa * Copyright (c) 2014 Yandex LLC * Copyright (c) 2014 Alexander V. Chernikov * * Supported by: Valeria Paoli * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Control socket and rule management routines for ipfw. * Control is currently implemented via IP_FW3 setsockopt() code. */ #include "opt_ipfw.h" #include "opt_inet.h" #ifndef INET #error IPFIREWALL requires INET. 
#endif /* INET */ #include "opt_inet6.h" #include #include #include #include /* struct m_tag used by nested headers */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* hooks */ #include #include #include #ifdef MAC #include #endif static int ipfw_ctl(struct sockopt *sopt); static int check_ipfw_rule_body(ipfw_insn *cmd, int cmd_len, struct rule_check_info *ci); static int check_ipfw_rule1(struct ip_fw_rule *rule, int size, struct rule_check_info *ci); static int check_ipfw_rule0(struct ip_fw_rule0 *rule, int size, struct rule_check_info *ci); static int rewrite_rule_uidx(struct ip_fw_chain *chain, struct rule_check_info *ci); #define NAMEDOBJ_HASH_SIZE 32 struct namedobj_instance { struct namedobjects_head *names; struct namedobjects_head *values; uint32_t nn_size; /* names hash size */ uint32_t nv_size; /* number hash size */ u_long *idx_mask; /* used items bitmask */ uint32_t max_blocks; /* number of "long" blocks in bitmask */ uint32_t count; /* number of items */ uint16_t free_off[IPFW_MAX_SETS]; /* first possible free offset */ objhash_hash_f *hash_f; objhash_cmp_f *cmp_f; }; #define BLOCK_ITEMS (8 * sizeof(u_long)) /* Number of items for ffsl() */ static uint32_t objhash_hash_name(struct namedobj_instance *ni, const void *key, uint32_t kopt); static uint32_t objhash_hash_idx(struct namedobj_instance *ni, uint32_t val); static int objhash_cmp_name(struct named_object *no, const void *name, uint32_t set); MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's"); static int dump_config(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd); static int add_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd); static int del_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd); static int clear_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd); static int move_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd); static int manage_sets(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd); static int dump_soptcodes(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd); static int dump_srvobjects(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd); /* ctl3 handler data */ struct mtx ctl3_lock; #define CTL3_LOCK_INIT() mtx_init(&ctl3_lock, "ctl3_lock", NULL, MTX_DEF) #define CTL3_LOCK_DESTROY() mtx_destroy(&ctl3_lock) #define CTL3_LOCK() mtx_lock(&ctl3_lock) #define CTL3_UNLOCK() mtx_unlock(&ctl3_lock) static struct ipfw_sopt_handler *ctl3_handlers; static size_t ctl3_hsize; static uint64_t ctl3_refct, ctl3_gencnt; #define CTL3_SMALLBUF 4096 /* small page-size write buffer */ #define CTL3_LARGEBUF 16 * 1024 * 1024 /* handle large rulesets */ static int ipfw_flush_sopt_data(struct sockopt_data *sd); static struct ipfw_sopt_handler scodes[] = { { IP_FW_XGET, 0, HDIR_GET, dump_config }, { IP_FW_XADD, 0, HDIR_BOTH, add_rules }, { IP_FW_XDEL, 0, HDIR_BOTH, del_rules }, { IP_FW_XZERO, 0, HDIR_SET, clear_rules }, { IP_FW_XRESETLOG, 0, HDIR_SET, clear_rules }, { IP_FW_XMOVE, 0, HDIR_SET, move_rules }, { IP_FW_SET_SWAP, 0, HDIR_SET, manage_sets }, { IP_FW_SET_MOVE, 0, HDIR_SET, manage_sets }, { IP_FW_SET_ENABLE, 0, HDIR_SET, manage_sets }, { IP_FW_DUMP_SOPTCODES, 0, HDIR_GET, dump_soptcodes }, { IP_FW_DUMP_SRVOBJECTS,0, HDIR_GET, dump_srvobjects }, }; static int set_legacy_obj_kidx(struct 
ip_fw_chain *ch, struct ip_fw_rule0 *rule); static struct opcode_obj_rewrite *find_op_rw(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype); static int mark_object_kidx(struct ip_fw_chain *ch, struct ip_fw *rule, uint32_t *bmask); static int ref_rule_objects(struct ip_fw_chain *ch, struct ip_fw *rule, struct rule_check_info *ci, struct obj_idx *oib, struct tid_info *ti); static int ref_opcode_object(struct ip_fw_chain *ch, ipfw_insn *cmd, struct tid_info *ti, struct obj_idx *pidx, int *unresolved); static void unref_rule_objects(struct ip_fw_chain *chain, struct ip_fw *rule); static void unref_oib_objects(struct ip_fw_chain *ch, ipfw_insn *cmd, struct obj_idx *oib, struct obj_idx *end); static int export_objhash_ntlv(struct namedobj_instance *ni, uint16_t kidx, struct sockopt_data *sd); /* * Opcode object rewriter variables */ struct opcode_obj_rewrite *ctl3_rewriters; static size_t ctl3_rsize; /* * static variables followed by global ones */ static VNET_DEFINE(uma_zone_t, ipfw_cntr_zone); #define V_ipfw_cntr_zone VNET(ipfw_cntr_zone) void ipfw_init_counters() { V_ipfw_cntr_zone = uma_zcreate("IPFW counters", IPFW_RULE_CNTR_SIZE, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU); } void ipfw_destroy_counters() { uma_zdestroy(V_ipfw_cntr_zone); } struct ip_fw * ipfw_alloc_rule(struct ip_fw_chain *chain, size_t rulesize) { struct ip_fw *rule; rule = malloc(rulesize, M_IPFW, M_WAITOK | M_ZERO); rule->cntr = uma_zalloc(V_ipfw_cntr_zone, M_WAITOK | M_ZERO); return (rule); } static void free_rule(struct ip_fw *rule) { uma_zfree(V_ipfw_cntr_zone, rule->cntr); free(rule, M_IPFW); } /* * Find the smallest rule >= key, id. * We could use bsearch but it is so simple that we code it directly */ int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id) { int i, lo, hi; struct ip_fw *r; for (lo = 0, hi = chain->n_rules - 1; lo < hi;) { i = (lo + hi) / 2; r = chain->map[i]; if (r->rulenum < key) lo = i + 1; /* continue from the next one */ else if (r->rulenum > key) hi = i; /* this might be good */ else if (r->id < id) lo = i + 1; /* continue from the next one */ else /* r->id >= id */ hi = i; /* this might be good */ } return hi; } /* * Builds skipto cache on rule set @map. */ static void update_skipto_cache(struct ip_fw_chain *chain, struct ip_fw **map) { int *smap, rulenum; int i, mi; IPFW_UH_WLOCK_ASSERT(chain); mi = 0; rulenum = map[mi]->rulenum; smap = chain->idxmap_back; if (smap == NULL) return; for (i = 0; i < 65536; i++) { smap[i] = mi; /* Use the same rule index until i < rulenum */ if (i != rulenum || i == 65535) continue; /* Find next rule with num > i */ rulenum = map[++mi]->rulenum; while (rulenum == i) rulenum = map[++mi]->rulenum; } } /* * Swaps prepared (backup) index with current one. */ static void swap_skipto_cache(struct ip_fw_chain *chain) { int *map; IPFW_UH_WLOCK_ASSERT(chain); IPFW_WLOCK_ASSERT(chain); map = chain->idxmap; chain->idxmap = chain->idxmap_back; chain->idxmap_back = map; } /* * Allocate and initialize skipto cache. 
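*
* The cache is conceptually a 65536-entry lookup table indexed by rule
* number: idxmap[n] holds the position in chain->map of the first rule
* whose number is >= n, so an O_SKIPTO jump can resolve in O(1) instead
* of a binary search.  A hypothetical reader (illustration only, not
* part of this file) would look like:
*
*	static struct ip_fw *
*	skipto_first(struct ip_fw_chain *chain, uint16_t rulenum)
*	{
*		return (chain->map[chain->idxmap[rulenum]]);
*	}
*
* update_skipto_cache() above fills the backup copy (idxmap_back) and
* swap_skipto_cache() publishes it under the write lock.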
*/ void ipfw_init_skipto_cache(struct ip_fw_chain *chain) { int *idxmap, *idxmap_back; idxmap = malloc(65536 * sizeof(uint32_t *), M_IPFW, M_WAITOK | M_ZERO); idxmap_back = malloc(65536 * sizeof(uint32_t *), M_IPFW, M_WAITOK | M_ZERO); /* * Note we may be called at any time after initialization, * for example, on first skipto rule, so we need to * provide valid chain->idxmap on return */ IPFW_UH_WLOCK(chain); if (chain->idxmap != NULL) { IPFW_UH_WUNLOCK(chain); free(idxmap, M_IPFW); free(idxmap_back, M_IPFW); return; } /* Set backup pointer first to permit building cache */ chain->idxmap_back = idxmap_back; update_skipto_cache(chain, chain->map); IPFW_WLOCK(chain); /* It is now safe to set chain->idxmap ptr */ chain->idxmap = idxmap; swap_skipto_cache(chain); IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); } /* * Destroys skipto cache. */ void ipfw_destroy_skipto_cache(struct ip_fw_chain *chain) { if (chain->idxmap != NULL) free(chain->idxmap, M_IPFW); if (chain->idxmap != NULL) free(chain->idxmap_back, M_IPFW); } /* * allocate a new map, returns the chain locked. extra is the number * of entries to add or delete. */ static struct ip_fw ** get_map(struct ip_fw_chain *chain, int extra, int locked) { for (;;) { struct ip_fw **map; int i, mflags; mflags = M_ZERO | ((locked != 0) ? M_NOWAIT : M_WAITOK); i = chain->n_rules + extra; map = malloc(i * sizeof(struct ip_fw *), M_IPFW, mflags); if (map == NULL) { printf("%s: cannot allocate map\n", __FUNCTION__); return NULL; } if (!locked) IPFW_UH_WLOCK(chain); if (i >= chain->n_rules + extra) /* good */ return map; /* otherwise we lost the race, free and retry */ if (!locked) IPFW_UH_WUNLOCK(chain); free(map, M_IPFW); } } /* * swap the maps. It is supposed to be called with IPFW_UH_WLOCK */ static struct ip_fw ** swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len) { struct ip_fw **old_map; IPFW_WLOCK(chain); chain->id++; chain->n_rules = new_len; old_map = chain->map; chain->map = new_map; swap_skipto_cache(chain); IPFW_WUNLOCK(chain); return old_map; } static void export_cntr1_base(struct ip_fw *krule, struct ip_fw_bcounter *cntr) { struct timeval boottime; cntr->size = sizeof(*cntr); if (krule->cntr != NULL) { cntr->pcnt = counter_u64_fetch(krule->cntr); cntr->bcnt = counter_u64_fetch(krule->cntr + 1); cntr->timestamp = krule->timestamp; } if (cntr->timestamp > 0) { getboottime(&boottime); cntr->timestamp += boottime.tv_sec; } } static void export_cntr0_base(struct ip_fw *krule, struct ip_fw_bcounter0 *cntr) { struct timeval boottime; if (krule->cntr != NULL) { cntr->pcnt = counter_u64_fetch(krule->cntr); cntr->bcnt = counter_u64_fetch(krule->cntr + 1); cntr->timestamp = krule->timestamp; } if (cntr->timestamp > 0) { getboottime(&boottime); cntr->timestamp += boottime.tv_sec; } } /* * Copies rule @urule from v1 userland format (current). * to kernel @krule. * Assume @krule is zeroed. */ static void import_rule1(struct rule_check_info *ci) { struct ip_fw_rule *urule; struct ip_fw *krule; urule = (struct ip_fw_rule *)ci->urule; krule = (struct ip_fw *)ci->krule; /* copy header */ krule->act_ofs = urule->act_ofs; krule->cmd_len = urule->cmd_len; krule->rulenum = urule->rulenum; krule->set = urule->set; krule->flags = urule->flags; /* Save rulenum offset */ ci->urule_numoff = offsetof(struct ip_fw_rule, rulenum); /* Copy opcodes */ memcpy(krule->cmd, urule->cmd, krule->cmd_len * sizeof(uint32_t)); } /* * Export rule into v1 format (Current). 
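* (When counters are requested, the ip_fw_bcounter element carries a
* wall-clock timestamp: export_cntr1_base() above keeps the kernel's
* boot-relative value and rebases it via getboottime(), roughly:
*
*	getboottime(&boottime);
*	cntr->timestamp += boottime.tv_sec;
*
* so a rule last matched 100 seconds after a boot at Unix time
* 1000000000 is reported as 1000000100.  Illustrative numbers only.)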
* Layout: * [ ipfw_obj_tlv(IPFW_TLV_RULE_ENT) * [ ip_fw_rule ] OR * [ ip_fw_bcounter ip_fw_rule] (depends on rcntrs). * ] * Assume @data is zeroed. */ static void export_rule1(struct ip_fw *krule, caddr_t data, int len, int rcntrs) { struct ip_fw_bcounter *cntr; struct ip_fw_rule *urule; ipfw_obj_tlv *tlv; /* Fill in TLV header */ tlv = (ipfw_obj_tlv *)data; tlv->type = IPFW_TLV_RULE_ENT; tlv->length = len; if (rcntrs != 0) { /* Copy counters */ cntr = (struct ip_fw_bcounter *)(tlv + 1); urule = (struct ip_fw_rule *)(cntr + 1); export_cntr1_base(krule, cntr); } else urule = (struct ip_fw_rule *)(tlv + 1); /* copy header */ urule->act_ofs = krule->act_ofs; urule->cmd_len = krule->cmd_len; urule->rulenum = krule->rulenum; urule->set = krule->set; urule->flags = krule->flags; urule->id = krule->id; /* Copy opcodes */ memcpy(urule->cmd, krule->cmd, krule->cmd_len * sizeof(uint32_t)); } /* * Copies rule @urule from FreeBSD8 userland format (v0) * to kernel @krule. * Assume @krule is zeroed. */ static void import_rule0(struct rule_check_info *ci) { struct ip_fw_rule0 *urule; struct ip_fw *krule; int cmdlen, l; ipfw_insn *cmd; ipfw_insn_limit *lcmd; ipfw_insn_if *cmdif; urule = (struct ip_fw_rule0 *)ci->urule; krule = (struct ip_fw *)ci->krule; /* copy header */ krule->act_ofs = urule->act_ofs; krule->cmd_len = urule->cmd_len; krule->rulenum = urule->rulenum; krule->set = urule->set; if ((urule->_pad & 1) != 0) krule->flags |= IPFW_RULE_NOOPT; /* Save rulenum offset */ ci->urule_numoff = offsetof(struct ip_fw_rule0, rulenum); /* Copy opcodes */ memcpy(krule->cmd, urule->cmd, krule->cmd_len * sizeof(uint32_t)); /* * Alter opcodes: * 1) convert tablearg value from 65535 to 0 * 2) Add high bit to O_SETFIB/O_SETDSCP values (to make room * for targ). * 3) convert table number in iface opcodes to u16 * 4) convert old `nat global` into new 65535 */ l = krule->cmd_len; cmd = krule->cmd; cmdlen = 0; for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); switch (cmd->opcode) { /* Opcodes supporting tablearg */ case O_TAG: case O_TAGGED: case O_PIPE: case O_QUEUE: case O_DIVERT: case O_TEE: case O_SKIPTO: case O_CALLRETURN: case O_NETGRAPH: case O_NGTEE: case O_NAT: if (cmd->arg1 == IP_FW_TABLEARG) cmd->arg1 = IP_FW_TARG; else if (cmd->arg1 == 0) cmd->arg1 = IP_FW_NAT44_GLOBAL; break; case O_SETFIB: case O_SETDSCP: if (cmd->arg1 == IP_FW_TABLEARG) cmd->arg1 = IP_FW_TARG; else cmd->arg1 |= 0x8000; break; case O_LIMIT: lcmd = (ipfw_insn_limit *)cmd; if (lcmd->conn_limit == IP_FW_TABLEARG) lcmd->conn_limit = IP_FW_TARG; break; /* Interface tables */ case O_XMIT: case O_RECV: case O_VIA: /* Interface table, possibly */ cmdif = (ipfw_insn_if *)cmd; if (cmdif->name[0] != '\1') break; cmdif->p.kidx = (uint16_t)cmdif->p.glob; break; } } } /* * Copies rule @krule from kernel to FreeBSD8 userland format (v0) */ static void export_rule0(struct ip_fw *krule, struct ip_fw_rule0 *urule, int len) { int cmdlen, l; ipfw_insn *cmd; ipfw_insn_limit *lcmd; ipfw_insn_if *cmdif; /* copy header */ memset(urule, 0, len); urule->act_ofs = krule->act_ofs; urule->cmd_len = krule->cmd_len; urule->rulenum = krule->rulenum; urule->set = krule->set; if ((krule->flags & IPFW_RULE_NOOPT) != 0) urule->_pad |= 1; /* Copy opcodes */ memcpy(urule->cmd, krule->cmd, krule->cmd_len * sizeof(uint32_t)); /* Export counters */ export_cntr0_base(krule, (struct ip_fw_bcounter0 *)&urule->pcnt); /* * Alter opcodes: * 1) convert tablearg value from 0 to 65535 * 2) Remove highest bit from O_SETFIB/O_SETDSCP values. 
* 3) convert table number in iface opcodes to int */ l = urule->cmd_len; cmd = urule->cmd; cmdlen = 0; for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); switch (cmd->opcode) { /* Opcodes supporting tablearg */ case O_TAG: case O_TAGGED: case O_PIPE: case O_QUEUE: case O_DIVERT: case O_TEE: case O_SKIPTO: case O_CALLRETURN: case O_NETGRAPH: case O_NGTEE: case O_NAT: if (cmd->arg1 == IP_FW_TARG) cmd->arg1 = IP_FW_TABLEARG; else if (cmd->arg1 == IP_FW_NAT44_GLOBAL) cmd->arg1 = 0; break; case O_SETFIB: case O_SETDSCP: if (cmd->arg1 == IP_FW_TARG) cmd->arg1 = IP_FW_TABLEARG; else cmd->arg1 &= ~0x8000; break; case O_LIMIT: lcmd = (ipfw_insn_limit *)cmd; if (lcmd->conn_limit == IP_FW_TARG) lcmd->conn_limit = IP_FW_TABLEARG; break; /* Interface tables */ case O_XMIT: case O_RECV: case O_VIA: /* Interface table, possibly */ cmdif = (ipfw_insn_if *)cmd; if (cmdif->name[0] != '\1') break; cmdif->p.glob = cmdif->p.kidx; break; } } } /* * Add new rule(s) to the list possibly creating rule number for each. * Update the rule_number in the input struct so the caller knows it as well. * Must be called without IPFW_UH held */ static int commit_rules(struct ip_fw_chain *chain, struct rule_check_info *rci, int count) { int error, i, insert_before, tcount; uint16_t rulenum, *pnum; struct rule_check_info *ci; struct ip_fw *krule; struct ip_fw **map; /* the new array of pointers */ /* Check if we need to do table/obj index remap */ tcount = 0; for (ci = rci, i = 0; i < count; ci++, i++) { if (ci->object_opcodes == 0) continue; /* * Rule has some object opcodes. * We need to find (and create non-existing) * kernel objects, and reference existing ones. */ error = rewrite_rule_uidx(chain, ci); if (error != 0) { /* * rewrite failed, state for current rule * has been reverted. Check if we need to * revert more. */ if (tcount > 0) { /* * We have some more table rules * we need to rollback. */ IPFW_UH_WLOCK(chain); while (ci != rci) { ci--; if (ci->object_opcodes == 0) continue; unref_rule_objects(chain,ci->krule); } IPFW_UH_WUNLOCK(chain); } return (error); } tcount++; } /* get_map returns with IPFW_UH_WLOCK if successful */ map = get_map(chain, count, 0 /* not locked */); if (map == NULL) { if (tcount > 0) { /* Unbind tables */ IPFW_UH_WLOCK(chain); for (ci = rci, i = 0; i < count; ci++, i++) { if (ci->object_opcodes == 0) continue; unref_rule_objects(chain, ci->krule); } IPFW_UH_WUNLOCK(chain); } return (ENOSPC); } if (V_autoinc_step < 1) V_autoinc_step = 1; else if (V_autoinc_step > 1000) V_autoinc_step = 1000; /* FIXME: Handle count > 1 */ ci = rci; krule = ci->krule; rulenum = krule->rulenum; /* find the insertion point, we will insert before */ insert_before = rulenum ? rulenum + 1 : IPFW_DEFAULT_RULE; i = ipfw_find_rule(chain, insert_before, 0); /* duplicate first part */ if (i > 0) bcopy(chain->map, map, i * sizeof(struct ip_fw *)); map[i] = krule; /* duplicate remaining part, we always have the default rule */ bcopy(chain->map + i, map + i + 1, sizeof(struct ip_fw *) *(chain->n_rules - i)); if (rulenum == 0) { /* Compute rule number and write it back */ rulenum = i > 0 ? 
map[i-1]->rulenum : 0; if (rulenum < IPFW_DEFAULT_RULE - V_autoinc_step) rulenum += V_autoinc_step; krule->rulenum = rulenum; /* Save number to userland rule */ pnum = (uint16_t *)((caddr_t)ci->urule + ci->urule_numoff); *pnum = rulenum; } krule->id = chain->id + 1; update_skipto_cache(chain, map); map = swap_map(chain, map, chain->n_rules + 1); chain->static_len += RULEUSIZE0(krule); IPFW_UH_WUNLOCK(chain); if (map) free(map, M_IPFW); return (0); } /* * Adds @rule to the list of rules to reap */ void ipfw_reap_add(struct ip_fw_chain *chain, struct ip_fw **head, struct ip_fw *rule) { IPFW_UH_WLOCK_ASSERT(chain); /* Unlink rule from everywhere */ unref_rule_objects(chain, rule); *((struct ip_fw **)rule) = *head; *head = rule; } /* * Reclaim storage associated with a list of rules. This is * typically the list created using remove_rule. * A NULL pointer on input is handled correctly. */ void ipfw_reap_rules(struct ip_fw *head) { struct ip_fw *rule; while ((rule = head) != NULL) { head = *((struct ip_fw **)head); free_rule(rule); } } /* * Rules to keep are * (default || reserved || !match_set || !match_number) * where * default ::= (rule->rulenum == IPFW_DEFAULT_RULE) * // the default rule is always protected * * reserved ::= (cmd == 0 && n == 0 && rule->set == RESVD_SET) * // RESVD_SET is protected only if cmd == 0 and n == 0 ("ipfw flush") * * match_set ::= (cmd == 0 || rule->set == set) * // set number is ignored for cmd == 0 * * match_number ::= (cmd == 1 || n == 0 || n == rule->rulenum) * // number is ignored for cmd == 1 or n == 0 * */ int ipfw_match_range(struct ip_fw *rule, ipfw_range_tlv *rt) { /* Don't match default rule for modification queries */ if (rule->rulenum == IPFW_DEFAULT_RULE && (rt->flags & IPFW_RCFLAG_DEFAULT) == 0) return (0); /* Don't match rules in reserved set for flush requests */ if ((rt->flags & IPFW_RCFLAG_ALL) != 0 && rule->set == RESVD_SET) return (0); /* If we're filtering by set, don't match other sets */ if ((rt->flags & IPFW_RCFLAG_SET) != 0 && rule->set != rt->set) return (0); if ((rt->flags & IPFW_RCFLAG_RANGE) != 0 && (rule->rulenum < rt->start_rule || rule->rulenum > rt->end_rule)) return (0); return (1); } struct manage_sets_args { uint16_t set; uint8_t new_set; }; static int swap_sets_cb(struct namedobj_instance *ni, struct named_object *no, void *arg) { struct manage_sets_args *args; args = (struct manage_sets_args *)arg; if (no->set == (uint8_t)args->set) no->set = args->new_set; else if (no->set == args->new_set) no->set = (uint8_t)args->set; return (0); } static int move_sets_cb(struct namedobj_instance *ni, struct named_object *no, void *arg) { struct manage_sets_args *args; args = (struct manage_sets_args *)arg; if (no->set == (uint8_t)args->set) no->set = args->new_set; return (0); } static int test_sets_cb(struct namedobj_instance *ni, struct named_object *no, void *arg) { struct manage_sets_args *args; args = (struct manage_sets_args *)arg; if (no->set != (uint8_t)args->set) return (0); if (ipfw_objhash_lookup_name_type(ni, args->new_set, no->etlv, no->name) != NULL) return (EEXIST); return (0); } /* * Generic function to handler moving and swapping sets. 
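*
* A hypothetical caller-side sketch (illustration only): moving every
* object of type IPFW_TLV_TBL_NAME from set 3 to set 5, first checking
* that no name in the destination set would collide:
*
*	error = ipfw_obj_manage_sets(ni, IPFW_TLV_TBL_NAME, 3, 5, TEST_ALL);
*	if (error == 0)
*		error = ipfw_obj_manage_sets(ni, IPFW_TLV_TBL_NAME, 3, 5,
*		    MOVE_ALL);
*
* The *_ONE commands take a kernel index in @set instead of a set
* number; see the comments in the switch below.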
*/ int ipfw_obj_manage_sets(struct namedobj_instance *ni, uint16_t type, uint16_t set, uint8_t new_set, enum ipfw_sets_cmd cmd) { struct manage_sets_args args; struct named_object *no; args.set = set; args.new_set = new_set; switch (cmd) { case SWAP_ALL: return (ipfw_objhash_foreach_type(ni, swap_sets_cb, &args, type)); case TEST_ALL: return (ipfw_objhash_foreach_type(ni, test_sets_cb, &args, type)); case MOVE_ALL: return (ipfw_objhash_foreach_type(ni, move_sets_cb, &args, type)); case COUNT_ONE: /* * @set used to pass kidx. * When @new_set is zero - reset object counter, * otherwise increment it. */ no = ipfw_objhash_lookup_kidx(ni, set); if (new_set != 0) no->ocnt++; else no->ocnt = 0; return (0); case TEST_ONE: /* @set used to pass kidx */ no = ipfw_objhash_lookup_kidx(ni, set); /* * First check number of references: * when it differs, this mean other rules are holding * reference to given object, so it is not possible to * change its set. Note that refcnt may account references * to some going-to-be-added rules. Since we don't know * their numbers (and even if they will be added) it is * perfectly OK to return error here. */ if (no->ocnt != no->refcnt) return (EBUSY); if (ipfw_objhash_lookup_name_type(ni, new_set, type, no->name) != NULL) return (EEXIST); return (0); case MOVE_ONE: /* @set used to pass kidx */ no = ipfw_objhash_lookup_kidx(ni, set); no->set = new_set; return (0); } return (EINVAL); } /* * Delete rules matching range @rt. * Saves number of deleted rules in @ndel. * * Returns 0 on success. */ static int delete_range(struct ip_fw_chain *chain, ipfw_range_tlv *rt, int *ndel) { struct ip_fw *reap, *rule, **map; int end, start; int i, n, ndyn, ofs; reap = NULL; IPFW_UH_WLOCK(chain); /* arbitrate writers */ /* * Stage 1: Determine range to inspect. * Range is half-inclusive, e.g [start, end). */ start = 0; end = chain->n_rules - 1; if ((rt->flags & IPFW_RCFLAG_RANGE) != 0) { start = ipfw_find_rule(chain, rt->start_rule, 0); end = ipfw_find_rule(chain, rt->end_rule, 0); if (rt->end_rule != IPFW_DEFAULT_RULE) while (chain->map[end]->rulenum == rt->end_rule) end++; } /* Allocate new map of the same size */ map = get_map(chain, 0, 1 /* locked */); if (map == NULL) { IPFW_UH_WUNLOCK(chain); return (ENOMEM); } n = 0; ndyn = 0; ofs = start; /* 1. bcopy the initial part of the map */ if (start > 0) bcopy(chain->map, map, start * sizeof(struct ip_fw *)); /* 2. copy active rules between start and end */ for (i = start; i < end; i++) { rule = chain->map[i]; if (ipfw_match_range(rule, rt) == 0) { map[ofs++] = rule; continue; } n++; if (ipfw_is_dyn_rule(rule) != 0) ndyn++; } /* 3. copy the final part of the map */ bcopy(chain->map + end, map + ofs, (chain->n_rules - end) * sizeof(struct ip_fw *)); /* 4. recalculate skipto cache */ update_skipto_cache(chain, map); /* 5. swap the maps (under UH_WLOCK + WHLOCK) */ map = swap_map(chain, map, chain->n_rules - n); /* 6. Remove all dynamic states originated by deleted rules */ if (ndyn > 0) ipfw_expire_dyn_rules(chain, rt); /* 7. 
now remove the rules deleted from the old map */ for (i = start; i < end; i++) { rule = map[i]; if (ipfw_match_range(rule, rt) == 0) continue; chain->static_len -= RULEUSIZE0(rule); ipfw_reap_add(chain, &reap, rule); } IPFW_UH_WUNLOCK(chain); ipfw_reap_rules(reap); if (map != NULL) free(map, M_IPFW); *ndel = n; return (0); } static int move_objects(struct ip_fw_chain *ch, ipfw_range_tlv *rt) { struct opcode_obj_rewrite *rw; struct ip_fw *rule; ipfw_insn *cmd; int cmdlen, i, l, c; uint16_t kidx; IPFW_UH_WLOCK_ASSERT(ch); /* Stage 1: count number of references by given rules */ for (c = 0, i = 0; i < ch->n_rules - 1; i++) { rule = ch->map[i]; if (ipfw_match_range(rule, rt) == 0) continue; if (rule->set == rt->new_set) /* nothing to do */ continue; /* Search opcodes with named objects */ for (l = rule->cmd_len, cmdlen = 0, cmd = rule->cmd; l > 0; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); rw = find_op_rw(cmd, &kidx, NULL); if (rw == NULL || rw->manage_sets == NULL) continue; /* * When manage_sets() returns non-zero value to * COUNT_ONE command, consider this as an object * doesn't support sets (e.g. disabled with sysctl). * So, skip checks for this object. */ if (rw->manage_sets(ch, kidx, 1, COUNT_ONE) != 0) continue; c++; } } if (c == 0) /* No objects found */ return (0); /* Stage 2: verify "ownership" */ for (c = 0, i = 0; (i < ch->n_rules - 1) && c == 0; i++) { rule = ch->map[i]; if (ipfw_match_range(rule, rt) == 0) continue; if (rule->set == rt->new_set) /* nothing to do */ continue; /* Search opcodes with named objects */ for (l = rule->cmd_len, cmdlen = 0, cmd = rule->cmd; l > 0 && c == 0; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); rw = find_op_rw(cmd, &kidx, NULL); if (rw == NULL || rw->manage_sets == NULL) continue; /* Test for ownership and conflicting names */ c = rw->manage_sets(ch, kidx, (uint8_t)rt->new_set, TEST_ONE); } } /* Stage 3: change set and cleanup */ for (i = 0; i < ch->n_rules - 1; i++) { rule = ch->map[i]; if (ipfw_match_range(rule, rt) == 0) continue; if (rule->set == rt->new_set) /* nothing to do */ continue; /* Search opcodes with named objects */ for (l = rule->cmd_len, cmdlen = 0, cmd = rule->cmd; l > 0; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); rw = find_op_rw(cmd, &kidx, NULL); if (rw == NULL || rw->manage_sets == NULL) continue; /* cleanup object counter */ rw->manage_sets(ch, kidx, 0 /* reset counter */, COUNT_ONE); if (c != 0) continue; /* change set */ rw->manage_sets(ch, kidx, (uint8_t)rt->new_set, MOVE_ONE); } } return (c); }/* * Changes set of given rule rannge @rt * with each other. * * Returns 0 on success. */ static int move_range(struct ip_fw_chain *chain, ipfw_range_tlv *rt) { struct ip_fw *rule; int i; IPFW_UH_WLOCK(chain); /* * Move rules with matching paramenerts to a new set. * This one is much more complex. We have to ensure * that all referenced tables (if any) are referenced * by given rule subset only. Otherwise, we can't move * them to new set and have to return error. */ if ((i = move_objects(chain, rt)) != 0) { IPFW_UH_WUNLOCK(chain); return (i); } /* XXX: We have to do swap holding WLOCK */ for (i = 0; i < chain->n_rules; i++) { rule = chain->map[i]; if (ipfw_match_range(rule, rt) == 0) continue; rule->set = rt->new_set; } IPFW_UH_WUNLOCK(chain); return (0); } /* * Clear counters for a specific rule. * Normally run under IPFW_UH_RLOCK, but these are idempotent ops * so we only care that rules do not disappear. 
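*
* Both the IP_FW_XZERO and IP_FW_XRESETLOG sockopt paths end up here
* via clear_range(); the only difference is the log_only argument
* (see clear_rules() below), conceptually:
*
*	clear_counters(rule, 0);   IP_FW_XZERO: counters + O_LOG budget
*	clear_counters(rule, 1);   IP_FW_XRESETLOG: O_LOG budget only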
*/ static void clear_counters(struct ip_fw *rule, int log_only) { ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule); if (log_only == 0) IPFW_ZERO_RULE_COUNTER(rule); if (l->o.opcode == O_LOG) l->log_left = l->max_log; } /* * Flushes rules counters and/or log values on matching range. * * Returns number of items cleared. */ static int clear_range(struct ip_fw_chain *chain, ipfw_range_tlv *rt, int log_only) { struct ip_fw *rule; int num; int i; num = 0; rt->flags |= IPFW_RCFLAG_DEFAULT; IPFW_UH_WLOCK(chain); /* arbitrate writers */ for (i = 0; i < chain->n_rules; i++) { rule = chain->map[i]; if (ipfw_match_range(rule, rt) == 0) continue; clear_counters(rule, log_only); num++; } IPFW_UH_WUNLOCK(chain); return (num); } static int check_range_tlv(ipfw_range_tlv *rt) { if (rt->head.length != sizeof(*rt)) return (1); if (rt->start_rule > rt->end_rule) return (1); if (rt->set >= IPFW_MAX_SETS || rt->new_set >= IPFW_MAX_SETS) return (1); if ((rt->flags & IPFW_RCFLAG_USER) != rt->flags) return (1); return (0); } /* * Delete rules matching specified parameters * Data layout (v0)(current): * Request: [ ipfw_obj_header ipfw_range_tlv ] * Reply: [ ipfw_obj_header ipfw_range_tlv ] * * Saves number of deleted rules in ipfw_range_tlv->new_set. * * Returns 0 on success. */ static int del_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_range_header *rh; int error, ndel; if (sd->valsize != sizeof(*rh)) return (EINVAL); rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize); if (check_range_tlv(&rh->range) != 0) return (EINVAL); ndel = 0; if ((error = delete_range(chain, &rh->range, &ndel)) != 0) return (error); /* Save number of rules deleted */ rh->range.new_set = ndel; return (0); } /* * Move rules/sets matching specified parameters * Data layout (v0)(current): * Request: [ ipfw_obj_header ipfw_range_tlv ] * * Returns 0 on success. */ static int move_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_range_header *rh; if (sd->valsize != sizeof(*rh)) return (EINVAL); rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize); if (check_range_tlv(&rh->range) != 0) return (EINVAL); return (move_range(chain, &rh->range)); } /* * Clear rule accounting data matching specified parameters * Data layout (v0)(current): * Request: [ ipfw_obj_header ipfw_range_tlv ] * Reply: [ ipfw_obj_header ipfw_range_tlv ] * * Saves number of cleared rules in ipfw_range_tlv->new_set. * * Returns 0 on success. */ static int clear_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_range_header *rh; int log_only, num; char *msg; if (sd->valsize != sizeof(*rh)) return (EINVAL); rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize); if (check_range_tlv(&rh->range) != 0) return (EINVAL); log_only = (op3->opcode == IP_FW_XRESETLOG); num = clear_range(chain, &rh->range, log_only); if (rh->range.flags & IPFW_RCFLAG_ALL) msg = log_only ? "All logging counts reset" : "Accounting cleared"; else msg = log_only ? 
"logging count reset" : "cleared"; if (V_fw_verbose) { int lev = LOG_SECURITY | LOG_NOTICE; log(lev, "ipfw: %s.\n", msg); } /* Save number of rules cleared */ rh->range.new_set = num; return (0); } static void enable_sets(struct ip_fw_chain *chain, ipfw_range_tlv *rt) { uint32_t v_set; IPFW_UH_WLOCK_ASSERT(chain); /* Change enabled/disabled sets mask */ v_set = (V_set_disable | rt->set) & ~rt->new_set; v_set &= ~(1 << RESVD_SET); /* set RESVD_SET always enabled */ IPFW_WLOCK(chain); V_set_disable = v_set; IPFW_WUNLOCK(chain); } static int swap_sets(struct ip_fw_chain *chain, ipfw_range_tlv *rt, int mv) { struct opcode_obj_rewrite *rw; struct ip_fw *rule; int i; IPFW_UH_WLOCK_ASSERT(chain); if (rt->set == rt->new_set) /* nothing to do */ return (0); if (mv != 0) { /* * Berfore moving the rules we need to check that * there aren't any conflicting named objects. */ for (rw = ctl3_rewriters; rw < ctl3_rewriters + ctl3_rsize; rw++) { if (rw->manage_sets == NULL) continue; i = rw->manage_sets(chain, (uint8_t)rt->set, (uint8_t)rt->new_set, TEST_ALL); if (i != 0) return (EEXIST); } } /* Swap or move two sets */ for (i = 0; i < chain->n_rules - 1; i++) { rule = chain->map[i]; if (rule->set == (uint8_t)rt->set) rule->set = (uint8_t)rt->new_set; else if (rule->set == (uint8_t)rt->new_set && mv == 0) rule->set = (uint8_t)rt->set; } for (rw = ctl3_rewriters; rw < ctl3_rewriters + ctl3_rsize; rw++) { if (rw->manage_sets == NULL) continue; rw->manage_sets(chain, (uint8_t)rt->set, (uint8_t)rt->new_set, mv != 0 ? MOVE_ALL: SWAP_ALL); } return (0); } /* * Swaps or moves set * Data layout (v0)(current): * Request: [ ipfw_obj_header ipfw_range_tlv ] * * Returns 0 on success. */ static int manage_sets(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_range_header *rh; int ret; if (sd->valsize != sizeof(*rh)) return (EINVAL); rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize); if (rh->range.head.length != sizeof(ipfw_range_tlv)) return (1); /* enable_sets() expects bitmasks. */ if (op3->opcode != IP_FW_SET_ENABLE && (rh->range.set >= IPFW_MAX_SETS || rh->range.new_set >= IPFW_MAX_SETS)) return (EINVAL); ret = 0; IPFW_UH_WLOCK(chain); switch (op3->opcode) { case IP_FW_SET_SWAP: case IP_FW_SET_MOVE: ret = swap_sets(chain, &rh->range, op3->opcode == IP_FW_SET_MOVE); break; case IP_FW_SET_ENABLE: enable_sets(chain, &rh->range); break; } IPFW_UH_WUNLOCK(chain); return (ret); } /** * Remove all rules with given number, or do set manipulation. * Assumes chain != NULL && *chain != NULL. * * The argument is an uint32_t. 
The low 16 bit are the rule or set number; * the next 8 bits are the new set; the top 8 bits indicate the command: * * 0 delete rules numbered "rulenum" * 1 delete rules in set "rulenum" * 2 move rules "rulenum" to set "new_set" * 3 move rules from set "rulenum" to set "new_set" * 4 swap sets "rulenum" and "new_set" * 5 delete rules "rulenum" and set "new_set" */ static int del_entry(struct ip_fw_chain *chain, uint32_t arg) { uint32_t num; /* rule number or old_set */ uint8_t cmd, new_set; int do_del, ndel; int error = 0; ipfw_range_tlv rt; num = arg & 0xffff; cmd = (arg >> 24) & 0xff; new_set = (arg >> 16) & 0xff; if (cmd > 5 || new_set > RESVD_SET) return EINVAL; if (cmd == 0 || cmd == 2 || cmd == 5) { if (num >= IPFW_DEFAULT_RULE) return EINVAL; } else { if (num > RESVD_SET) /* old_set */ return EINVAL; } /* Convert old requests into new representation */ memset(&rt, 0, sizeof(rt)); rt.start_rule = num; rt.end_rule = num; rt.set = num; rt.new_set = new_set; do_del = 0; switch (cmd) { case 0: /* delete rules numbered "rulenum" */ if (num == 0) rt.flags |= IPFW_RCFLAG_ALL; else rt.flags |= IPFW_RCFLAG_RANGE; do_del = 1; break; case 1: /* delete rules in set "rulenum" */ rt.flags |= IPFW_RCFLAG_SET; do_del = 1; break; case 5: /* delete rules "rulenum" and set "new_set" */ rt.flags |= IPFW_RCFLAG_RANGE | IPFW_RCFLAG_SET; rt.set = new_set; rt.new_set = 0; do_del = 1; break; case 2: /* move rules "rulenum" to set "new_set" */ rt.flags |= IPFW_RCFLAG_RANGE; break; case 3: /* move rules from set "rulenum" to set "new_set" */ IPFW_UH_WLOCK(chain); error = swap_sets(chain, &rt, 1); IPFW_UH_WUNLOCK(chain); return (error); case 4: /* swap sets "rulenum" and "new_set" */ IPFW_UH_WLOCK(chain); error = swap_sets(chain, &rt, 0); IPFW_UH_WUNLOCK(chain); return (error); default: return (ENOTSUP); } if (do_del != 0) { if ((error = delete_range(chain, &rt, &ndel)) != 0) return (error); if (ndel == 0 && (cmd != 1 && num != 0)) return (EINVAL); return (0); } return (move_range(chain, &rt)); } /** * Reset some or all counters on firewall rules. * The argument `arg' is an u_int32_t. The low 16 bit are the rule number, * the next 8 bits are the set number, the top 8 bits are the command: * 0 work with rules from all set's; * 1 work with rules only from specified set. * Specified rule number is zero if we want to clear all entries. * log_only is 1 if we only want to reset logs, zero otherwise. */ static int zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only) { struct ip_fw *rule; char *msg; int i; uint16_t rulenum = arg & 0xffff; uint8_t set = (arg >> 16) & 0xff; uint8_t cmd = (arg >> 24) & 0xff; if (cmd > 1) return (EINVAL); if (cmd == 1 && set > RESVD_SET) return (EINVAL); IPFW_UH_RLOCK(chain); if (rulenum == 0) { V_norule_counter = 0; for (i = 0; i < chain->n_rules; i++) { rule = chain->map[i]; /* Skip rules not in our set. */ if (cmd == 1 && rule->set != set) continue; clear_counters(rule, log_only); } msg = log_only ? "All logging counts reset" : "Accounting cleared"; } else { int cleared = 0; for (i = 0; i < chain->n_rules; i++) { rule = chain->map[i]; if (rule->rulenum == rulenum) { if (cmd == 0 || rule->set == set) clear_counters(rule, log_only); cleared = 1; } if (rule->rulenum > rulenum) break; } if (!cleared) { /* we did not find any matching rules */ IPFW_UH_RUNLOCK(chain); return (EINVAL); } msg = log_only ? 
"logging count reset" : "cleared"; } IPFW_UH_RUNLOCK(chain); if (V_fw_verbose) { int lev = LOG_SECURITY | LOG_NOTICE; if (rulenum) log(lev, "ipfw: Entry %d %s.\n", rulenum, msg); else log(lev, "ipfw: %s.\n", msg); } return (0); } /* * Check rule head in FreeBSD11 format * */ static int check_ipfw_rule1(struct ip_fw_rule *rule, int size, struct rule_check_info *ci) { int l; if (size < sizeof(*rule)) { printf("ipfw: rule too short\n"); return (EINVAL); } /* Check for valid cmd_len */ l = roundup2(RULESIZE(rule), sizeof(uint64_t)); if (l != size) { printf("ipfw: size mismatch (have %d want %d)\n", size, l); return (EINVAL); } if (rule->act_ofs >= rule->cmd_len) { printf("ipfw: bogus action offset (%u > %u)\n", rule->act_ofs, rule->cmd_len - 1); return (EINVAL); } if (rule->rulenum > IPFW_DEFAULT_RULE - 1) return (EINVAL); return (check_ipfw_rule_body(rule->cmd, rule->cmd_len, ci)); } /* * Check rule head in FreeBSD8 format * */ static int check_ipfw_rule0(struct ip_fw_rule0 *rule, int size, struct rule_check_info *ci) { int l; if (size < sizeof(*rule)) { printf("ipfw: rule too short\n"); return (EINVAL); } /* Check for valid cmd_len */ l = sizeof(*rule) + rule->cmd_len * 4 - 4; if (l != size) { printf("ipfw: size mismatch (have %d want %d)\n", size, l); return (EINVAL); } if (rule->act_ofs >= rule->cmd_len) { printf("ipfw: bogus action offset (%u > %u)\n", rule->act_ofs, rule->cmd_len - 1); return (EINVAL); } if (rule->rulenum > IPFW_DEFAULT_RULE - 1) return (EINVAL); return (check_ipfw_rule_body(rule->cmd, rule->cmd_len, ci)); } static int check_ipfw_rule_body(ipfw_insn *cmd, int cmd_len, struct rule_check_info *ci) { int cmdlen, l; int have_action; have_action = 0; /* * Now go for the individual checks. Very simple ones, basically only * instruction sizes. */ for (l = cmd_len; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); if (cmdlen > l) { printf("ipfw: opcode %d size truncated\n", cmd->opcode); return EINVAL; } switch (cmd->opcode) { case O_PROBE_STATE: case O_KEEP_STATE: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; ci->object_opcodes++; break; case O_PROTO: case O_IP_SRC_ME: case O_IP_DST_ME: case O_LAYER2: case O_IN: case O_FRAG: case O_DIVERTED: case O_IPOPT: case O_IPTOS: case O_IPPRECEDENCE: case O_IPVER: case O_SOCKARG: case O_TCPFLAGS: case O_TCPOPTS: case O_ESTAB: case O_VERREVPATH: case O_VERSRCREACH: case O_ANTISPOOF: case O_IPSEC: #ifdef INET6 case O_IP6_SRC_ME: case O_IP6_DST_ME: case O_EXT_HDR: case O_IP6: #endif case O_IP4: case O_TAG: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; break; case O_EXTERNAL_ACTION: if (cmd->arg1 == 0 || cmdlen != F_INSN_SIZE(ipfw_insn)) { printf("ipfw: invalid external " "action opcode\n"); return (EINVAL); } ci->object_opcodes++; /* Do we have O_EXTERNAL_INSTANCE opcode? 
*/ if (l != cmdlen) { l -= cmdlen; cmd += cmdlen; cmdlen = F_LEN(cmd); if (cmd->opcode != O_EXTERNAL_INSTANCE) { printf("ipfw: invalid opcode " "next to external action %u\n", cmd->opcode); return (EINVAL); } if (cmd->arg1 == 0 || cmdlen != F_INSN_SIZE(ipfw_insn)) { printf("ipfw: invalid external " "action instance opcode\n"); return (EINVAL); } ci->object_opcodes++; } goto check_action; case O_FIB: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; if (cmd->arg1 >= rt_numfibs) { printf("ipfw: invalid fib number %d\n", cmd->arg1); return EINVAL; } break; case O_SETFIB: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; if ((cmd->arg1 != IP_FW_TARG) && ((cmd->arg1 & 0x7FFF) >= rt_numfibs)) { printf("ipfw: invalid fib number %d\n", cmd->arg1 & 0x7FFF); return EINVAL; } goto check_action; case O_UID: case O_GID: case O_JAIL: case O_IP_SRC: case O_IP_DST: case O_TCPSEQ: case O_TCPACK: case O_PROB: case O_ICMPTYPE: if (cmdlen != F_INSN_SIZE(ipfw_insn_u32)) goto bad_size; break; case O_LIMIT: if (cmdlen != F_INSN_SIZE(ipfw_insn_limit)) goto bad_size; ci->object_opcodes++; break; case O_LOG: if (cmdlen != F_INSN_SIZE(ipfw_insn_log)) goto bad_size; ((ipfw_insn_log *)cmd)->log_left = ((ipfw_insn_log *)cmd)->max_log; break; case O_IP_SRC_MASK: case O_IP_DST_MASK: /* only odd command lengths */ if ((cmdlen & 1) == 0) goto bad_size; break; case O_IP_SRC_SET: case O_IP_DST_SET: if (cmd->arg1 == 0 || cmd->arg1 > 256) { printf("ipfw: invalid set size %d\n", cmd->arg1); return EINVAL; } if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + (cmd->arg1+31)/32 ) goto bad_size; break; case O_IP_SRC_LOOKUP: case O_IP_DST_LOOKUP: if (cmd->arg1 >= V_fw_tables_max) { printf("ipfw: invalid table number %d\n", cmd->arg1); return (EINVAL); } if (cmdlen != F_INSN_SIZE(ipfw_insn) && cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1 && cmdlen != F_INSN_SIZE(ipfw_insn_u32)) goto bad_size; ci->object_opcodes++; break; case O_IP_FLOW_LOOKUP: if (cmd->arg1 >= V_fw_tables_max) { printf("ipfw: invalid table number %d\n", cmd->arg1); return (EINVAL); } if (cmdlen != F_INSN_SIZE(ipfw_insn) && cmdlen != F_INSN_SIZE(ipfw_insn_u32)) goto bad_size; ci->object_opcodes++; break; case O_MACADDR2: if (cmdlen != F_INSN_SIZE(ipfw_insn_mac)) goto bad_size; break; case O_NOP: case O_IPID: case O_IPTTL: case O_IPLEN: case O_TCPDATALEN: case O_TCPWIN: case O_TAGGED: if (cmdlen < 1 || cmdlen > 31) goto bad_size; break; case O_DSCP: if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1) goto bad_size; break; case O_MAC_TYPE: case O_IP_SRCPORT: case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */ if (cmdlen < 2 || cmdlen > 31) goto bad_size; break; case O_RECV: case O_XMIT: case O_VIA: if (cmdlen != F_INSN_SIZE(ipfw_insn_if)) goto bad_size; ci->object_opcodes++; break; case O_ALTQ: if (cmdlen != F_INSN_SIZE(ipfw_insn_altq)) goto bad_size; break; case O_PIPE: case O_QUEUE: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; goto check_action; case O_FORWARD_IP: if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) goto bad_size; goto check_action; #ifdef INET6 case O_FORWARD_IP6: if (cmdlen != F_INSN_SIZE(ipfw_insn_sa6)) goto bad_size; goto check_action; #endif /* INET6 */ case O_DIVERT: case O_TEE: if (ip_divert_ptr == NULL) return EINVAL; else goto check_size; case O_NETGRAPH: case O_NGTEE: if (ng_ipfw_input_p == NULL) return EINVAL; else goto check_size; case O_NAT: if (!IPFW_NAT_LOADED) return EINVAL; if (cmdlen != F_INSN_SIZE(ipfw_insn_nat)) goto bad_size; goto check_action; case O_CHECK_STATE: ci->object_opcodes++; /* FALLTHROUGH */ case O_FORWARD_MAC: /* XXX not 
implemented yet */ case O_COUNT: case O_ACCEPT: case O_DENY: case O_REJECT: case O_SETDSCP: #ifdef INET6 case O_UNREACH6: #endif case O_SKIPTO: case O_REASS: case O_CALLRETURN: check_size: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; check_action: if (have_action) { printf("ipfw: opcode %d, multiple actions" " not allowed\n", cmd->opcode); return (EINVAL); } have_action = 1; if (l != cmdlen) { printf("ipfw: opcode %d, action must be" " last opcode\n", cmd->opcode); return (EINVAL); } break; #ifdef INET6 case O_IP6_SRC: case O_IP6_DST: if (cmdlen != F_INSN_SIZE(struct in6_addr) + F_INSN_SIZE(ipfw_insn)) goto bad_size; break; case O_FLOW6ID: if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + ((ipfw_insn_u32 *)cmd)->o.arg1) goto bad_size; break; case O_IP6_SRC_MASK: case O_IP6_DST_MASK: if ( !(cmdlen & 1) || cmdlen > 127) goto bad_size; break; case O_ICMP6TYPE: if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) ) goto bad_size; break; #endif default: switch (cmd->opcode) { #ifndef INET6 case O_IP6_SRC_ME: case O_IP6_DST_ME: case O_EXT_HDR: case O_IP6: case O_UNREACH6: case O_IP6_SRC: case O_IP6_DST: case O_FLOW6ID: case O_IP6_SRC_MASK: case O_IP6_DST_MASK: case O_ICMP6TYPE: printf("ipfw: no IPv6 support in kernel\n"); return (EPROTONOSUPPORT); #endif default: printf("ipfw: opcode %d, unknown opcode\n", cmd->opcode); return (EINVAL); } } } if (have_action == 0) { printf("ipfw: missing action\n"); return (EINVAL); } return 0; bad_size: printf("ipfw: opcode %d size %d wrong\n", cmd->opcode, cmdlen); return (EINVAL); } /* * Translation of requests for compatibility with FreeBSD 7.2/8. * a static variable tells us if we have an old client from userland, * and if necessary we translate requests and responses between the * two formats. */ static int is7 = 0; struct ip_fw7 { struct ip_fw7 *next; /* linked list of rules */ struct ip_fw7 *next_rule; /* ptr to next [skipto] rule */ /* 'next_rule' is used to pass up 'set_disable' status */ uint16_t act_ofs; /* offset of action in 32-bit units */ uint16_t cmd_len; /* # of 32-bit words in cmd */ uint16_t rulenum; /* rule number */ uint8_t set; /* rule set (0..31) */ // #define RESVD_SET 31 /* set for default and persistent rules */ uint8_t _pad; /* padding */ // uint32_t id; /* rule id, only in v.8 */ /* These fields are present in all rules. */ uint64_t pcnt; /* Packet counter */ uint64_t bcnt; /* Byte counter */ uint32_t timestamp; /* tv_sec of last match */ ipfw_insn cmd[1]; /* storage for commands */ }; static int convert_rule_to_7(struct ip_fw_rule0 *rule); static int convert_rule_to_8(struct ip_fw_rule0 *rule); #ifndef RULESIZE7 #define RULESIZE7(rule) (sizeof(struct ip_fw7) + \ ((struct ip_fw7 *)(rule))->cmd_len * 4 - 4) #endif /* * Copy the static and dynamic rules to the supplied buffer * and return the amount of space actually used. 
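*
* Rule timestamps are stored boot-relative in the kernel; both the
* FreeBSD 7.2 path and the current path below rebase them to wall-clock
* time before the copy-out, in essence:
*
*	getboottime(&boottime);
*	if (dst->timestamp)
*		dst->timestamp += boottime.tv_sec;
*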
* Must be run under IPFW_UH_RLOCK */ static size_t ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space) { char *bp = buf; char *ep = bp + space; struct ip_fw *rule; struct ip_fw_rule0 *dst; struct timeval boottime; int error, i, l, warnflag; time_t boot_seconds; warnflag = 0; getboottime(&boottime); boot_seconds = boottime.tv_sec; for (i = 0; i < chain->n_rules; i++) { rule = chain->map[i]; if (is7) { /* Convert rule to FreeBSd 7.2 format */ l = RULESIZE7(rule); if (bp + l + sizeof(uint32_t) <= ep) { bcopy(rule, bp, l + sizeof(uint32_t)); error = set_legacy_obj_kidx(chain, (struct ip_fw_rule0 *)bp); if (error != 0) return (0); error = convert_rule_to_7((struct ip_fw_rule0 *) bp); if (error) return 0; /*XXX correct? */ /* * XXX HACK. Store the disable mask in the "next" * pointer in a wild attempt to keep the ABI the same. * Why do we do this on EVERY rule? */ bcopy(&V_set_disable, &(((struct ip_fw7 *)bp)->next_rule), sizeof(V_set_disable)); if (((struct ip_fw7 *)bp)->timestamp) ((struct ip_fw7 *)bp)->timestamp += boot_seconds; bp += l; } continue; /* go to next rule */ } l = RULEUSIZE0(rule); if (bp + l > ep) { /* should not happen */ printf("overflow dumping static rules\n"); break; } dst = (struct ip_fw_rule0 *)bp; export_rule0(rule, dst, l); error = set_legacy_obj_kidx(chain, dst); /* * XXX HACK. Store the disable mask in the "next" * pointer in a wild attempt to keep the ABI the same. * Why do we do this on EVERY rule? * * XXX: "ipfw set show" (ab)uses IP_FW_GET to read disabled mask * so we need to fail _after_ saving at least one mask. */ bcopy(&V_set_disable, &dst->next_rule, sizeof(V_set_disable)); if (dst->timestamp) dst->timestamp += boot_seconds; bp += l; if (error != 0) { if (error == 2) { /* Non-fatal table rewrite error. */ warnflag = 1; continue; } printf("Stop on rule %d. Fail to convert table\n", rule->rulenum); break; } } if (warnflag != 0) printf("ipfw: process %s is using legacy interfaces," " consider rebuilding\n", ""); ipfw_get_dynamic(chain, &bp, ep); /* protected by the dynamic lock */ return (bp - (char *)buf); } struct dump_args { uint32_t b; /* start rule */ uint32_t e; /* end rule */ uint32_t rcount; /* number of rules */ uint32_t rsize; /* rules size */ uint32_t tcount; /* number of tables */ int rcounters; /* counters */ }; void ipfw_export_obj_ntlv(struct named_object *no, ipfw_obj_ntlv *ntlv) { ntlv->head.type = no->etlv; ntlv->head.length = sizeof(*ntlv); ntlv->idx = no->kidx; strlcpy(ntlv->name, no->name, sizeof(ntlv->name)); } /* * Export named object info in instance @ni, identified by @kidx * to ipfw_obj_ntlv. TLV is allocated from @sd space. * * Returns 0 on success. */ static int export_objhash_ntlv(struct namedobj_instance *ni, uint16_t kidx, struct sockopt_data *sd) { struct named_object *no; ipfw_obj_ntlv *ntlv; no = ipfw_objhash_lookup_kidx(ni, kidx); KASSERT(no != NULL, ("invalid object kernel index passed")); ntlv = (ipfw_obj_ntlv *)ipfw_get_sopt_space(sd, sizeof(*ntlv)); if (ntlv == NULL) return (ENOMEM); ipfw_export_obj_ntlv(no, ntlv); return (0); } /* * Dumps static rules with table TLVs in buffer @sd. * * Returns 0 on success. 
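*
* The @bmask bitmap passed in has one bit per referenced object:
* table kidx i sets bit i, while any other named object kidx i sets
* bit IPFW_TABLES_MAX + i (see mark_object_kidx() below).  The
* name-TLV loop just walks the set bits, testing each one roughly as:
*
*	if (bmask[i / 32] & (1 << (i % 32)))
*		error = export_objhash_ntlv(ni, i, sd);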
*/ static int dump_static_rules(struct ip_fw_chain *chain, struct dump_args *da, uint32_t *bmask, struct sockopt_data *sd) { int error; int i, l; uint32_t tcount; ipfw_obj_ctlv *ctlv; struct ip_fw *krule; struct namedobj_instance *ni; caddr_t dst; /* Dump table names first (if any) */ if (da->tcount > 0) { /* Header first */ ctlv = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*ctlv)); if (ctlv == NULL) return (ENOMEM); ctlv->head.type = IPFW_TLV_TBLNAME_LIST; ctlv->head.length = da->tcount * sizeof(ipfw_obj_ntlv) + sizeof(*ctlv); ctlv->count = da->tcount; ctlv->objsize = sizeof(ipfw_obj_ntlv); } i = 0; tcount = da->tcount; ni = ipfw_get_table_objhash(chain); while (tcount > 0) { if ((bmask[i / 32] & (1 << (i % 32))) == 0) { i++; continue; } /* Jump to shared named object bitmask */ if (i >= IPFW_TABLES_MAX) { ni = CHAIN_TO_SRV(chain); i -= IPFW_TABLES_MAX; bmask += IPFW_TABLES_MAX / 32; } if ((error = export_objhash_ntlv(ni, i, sd)) != 0) return (error); i++; tcount--; } /* Dump rules */ ctlv = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*ctlv)); if (ctlv == NULL) return (ENOMEM); ctlv->head.type = IPFW_TLV_RULE_LIST; ctlv->head.length = da->rsize + sizeof(*ctlv); ctlv->count = da->rcount; for (i = da->b; i < da->e; i++) { krule = chain->map[i]; l = RULEUSIZE1(krule) + sizeof(ipfw_obj_tlv); if (da->rcounters != 0) l += sizeof(struct ip_fw_bcounter); dst = (caddr_t)ipfw_get_sopt_space(sd, l); if (dst == NULL) return (ENOMEM); export_rule1(krule, dst, l, da->rcounters); } return (0); } /* * Marks every object index used in @rule with bit in @bmask. * Used to generate bitmask of referenced tables/objects for given ruleset * or its part. * * Returns number of newly-referenced objects. */ static int mark_object_kidx(struct ip_fw_chain *ch, struct ip_fw *rule, uint32_t *bmask) { struct opcode_obj_rewrite *rw; ipfw_insn *cmd; int bidx, cmdlen, l, count; uint16_t kidx; uint8_t subtype; l = rule->cmd_len; cmd = rule->cmd; cmdlen = 0; count = 0; for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); rw = find_op_rw(cmd, &kidx, &subtype); if (rw == NULL) continue; bidx = kidx / 32; /* * Maintain separate bitmasks for table and * non-table objects. */ if (rw->etlv != IPFW_TLV_TBL_NAME) bidx += IPFW_TABLES_MAX / 32; if ((bmask[bidx] & (1 << (kidx % 32))) == 0) count++; bmask[bidx] |= 1 << (kidx % 32); } return (count); } /* * Dumps requested objects data * Data layout (version 0)(current): * Request: [ ipfw_cfg_lheader ] + IPFW_CFG_GET_* flags * size = ipfw_cfg_lheader.size * Reply: [ ipfw_cfg_lheader * [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional) * [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) * ipfw_obj_tlv(IPFW_TLV_RULE_ENT) [ ip_fw_bcounter (optional) ip_fw_rule ] * ] (optional) * [ ipfw_obj_ctlv(IPFW_TLV_STATE_LIST) ipfw_obj_dyntlv x N ] (optional) * ] * * NOTE IPFW_TLV_STATE_LIST has the single valid field: objsize. * The rest (size, count) are set to zero and needs to be ignored. * * Returns 0 on success. */ static int dump_config(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_cfg_lheader *hdr; struct ip_fw *rule; size_t sz, rnum; uint32_t hdr_flags; int error, i; struct dump_args da; uint32_t *bmask; hdr = (ipfw_cfg_lheader *)ipfw_get_sopt_header(sd, sizeof(*hdr)); if (hdr == NULL) return (EINVAL); error = 0; bmask = NULL; /* Allocate needed state. 
Note we allocate 2xspace mask, for table&srv */ if (hdr->flags & IPFW_CFG_GET_STATIC) bmask = malloc(IPFW_TABLES_MAX / 4, M_TEMP, M_WAITOK | M_ZERO); IPFW_UH_RLOCK(chain); /* * STAGE 1: Determine size/count for objects in range. * Prepare used tables bitmask. */ sz = sizeof(ipfw_cfg_lheader); memset(&da, 0, sizeof(da)); da.b = 0; da.e = chain->n_rules; if (hdr->end_rule != 0) { /* Handle custom range */ if ((rnum = hdr->start_rule) > IPFW_DEFAULT_RULE) rnum = IPFW_DEFAULT_RULE; da.b = ipfw_find_rule(chain, rnum, 0); rnum = hdr->end_rule; rnum = (rnum < IPFW_DEFAULT_RULE) ? rnum+1 : IPFW_DEFAULT_RULE; da.e = ipfw_find_rule(chain, rnum, 0) + 1; } if (hdr->flags & IPFW_CFG_GET_STATIC) { for (i = da.b; i < da.e; i++) { rule = chain->map[i]; da.rsize += RULEUSIZE1(rule) + sizeof(ipfw_obj_tlv); da.rcount++; /* Update bitmask of used objects for given range */ da.tcount += mark_object_kidx(chain, rule, bmask); } /* Add counters if requested */ if (hdr->flags & IPFW_CFG_GET_COUNTERS) { da.rsize += sizeof(struct ip_fw_bcounter) * da.rcount; da.rcounters = 1; } if (da.tcount > 0) sz += da.tcount * sizeof(ipfw_obj_ntlv) + sizeof(ipfw_obj_ctlv); sz += da.rsize + sizeof(ipfw_obj_ctlv); } if (hdr->flags & IPFW_CFG_GET_STATES) sz += ipfw_dyn_get_count() * sizeof(ipfw_obj_dyntlv) + sizeof(ipfw_obj_ctlv); /* * Fill header anyway. * Note we have to save header fields to stable storage * buffer inside @sd can be flushed after dumping rules */ hdr->size = sz; hdr->set_mask = ~V_set_disable; hdr_flags = hdr->flags; hdr = NULL; if (sd->valsize < sz) { error = ENOMEM; goto cleanup; } /* STAGE2: Store actual data */ if (hdr_flags & IPFW_CFG_GET_STATIC) { error = dump_static_rules(chain, &da, bmask, sd); if (error != 0) goto cleanup; } if (hdr_flags & IPFW_CFG_GET_STATES) error = ipfw_dump_states(chain, sd); cleanup: IPFW_UH_RUNLOCK(chain); if (bmask != NULL) free(bmask, M_TEMP); return (error); } int ipfw_check_object_name_generic(const char *name) { int nsize; nsize = sizeof(((ipfw_obj_ntlv *)0)->name); if (strnlen(name, nsize) == nsize) return (EINVAL); if (name[0] == '\0') return (EINVAL); return (0); } /* * Creates non-existent objects referenced by rule. * * Return 0 on success. */ int create_objects_compat(struct ip_fw_chain *ch, ipfw_insn *cmd, struct obj_idx *oib, struct obj_idx *pidx, struct tid_info *ti) { struct opcode_obj_rewrite *rw; struct obj_idx *p; uint16_t kidx; int error; /* * Compatibility stuff: do actual creation for non-existing, * but referenced objects. */ for (p = oib; p < pidx; p++) { if (p->kidx != 0) continue; ti->uidx = p->uidx; ti->type = p->type; ti->atype = 0; rw = find_op_rw(cmd + p->off, NULL, NULL); KASSERT(rw != NULL, ("Unable to find handler for op %d", (cmd + p->off)->opcode)); if (rw->create_object == NULL) error = EOPNOTSUPP; else error = rw->create_object(ch, ti, &kidx); if (error == 0) { p->kidx = kidx; continue; } /* * Error happened. We have to rollback everything. * Drop all already acquired references. */ IPFW_UH_WLOCK(ch); unref_oib_objects(ch, cmd, oib, pidx); IPFW_UH_WUNLOCK(ch); return (error); } return (0); } /* * Compatibility function for old ipfw(8) binaries. * Rewrites table/nat kernel indices with userland ones. * Convert tables matching '/^\d+$/' to their atoi() value. * Use number 65535 for other tables. * * Returns 0 on success. 
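 *
 * The name-to-number mapping reduces roughly to the following user-space
 * sketch (legacy_table_number() is a made-up name, not part of this file):
 *
 *	#include <stdlib.h>
 *
 *	static unsigned
 *	legacy_table_number(const char *name)
 *	{
 *		char *end;
 *		long val;
 *
 *		val = strtol(name, &end, 10);
 *		if (*end == '\0' && val >= 0 && val < 65535)
 *			return ((unsigned)val);	/* purely numeric name */
 *		return (65535);	/* fake number, keeps old ipfw(8) alive */
 *	}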
*/ static int set_legacy_obj_kidx(struct ip_fw_chain *ch, struct ip_fw_rule0 *rule) { struct opcode_obj_rewrite *rw; struct named_object *no; ipfw_insn *cmd; char *end; long val; int cmdlen, error, l; uint16_t kidx, uidx; uint8_t subtype; error = 0; l = rule->cmd_len; cmd = rule->cmd; cmdlen = 0; for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); /* Check if is index in given opcode */ rw = find_op_rw(cmd, &kidx, &subtype); if (rw == NULL) continue; /* Try to find referenced kernel object */ no = rw->find_bykidx(ch, kidx); if (no == NULL) continue; val = strtol(no->name, &end, 10); if (*end == '\0' && val < 65535) { uidx = val; } else { /* * We are called via legacy opcode. * Save error and show table as fake number * not to make ipfw(8) hang. */ uidx = 65535; error = 2; } rw->update(cmd, uidx); } return (error); } /* * Unreferences all already-referenced objects in given @cmd rule, * using information in @oib. * * Used to rollback partially converted rule on error. */ static void unref_oib_objects(struct ip_fw_chain *ch, ipfw_insn *cmd, struct obj_idx *oib, struct obj_idx *end) { struct opcode_obj_rewrite *rw; struct named_object *no; struct obj_idx *p; IPFW_UH_WLOCK_ASSERT(ch); for (p = oib; p < end; p++) { if (p->kidx == 0) continue; rw = find_op_rw(cmd + p->off, NULL, NULL); KASSERT(rw != NULL, ("Unable to find handler for op %d", (cmd + p->off)->opcode)); /* Find & unref by existing idx */ no = rw->find_bykidx(ch, p->kidx); KASSERT(no != NULL, ("Ref'd object %d disappeared", p->kidx)); no->refcnt--; } } /* * Remove references from every object used in @rule. * Used at rule removal code. */ static void unref_rule_objects(struct ip_fw_chain *ch, struct ip_fw *rule) { struct opcode_obj_rewrite *rw; struct named_object *no; ipfw_insn *cmd; int cmdlen, l; uint16_t kidx; uint8_t subtype; IPFW_UH_WLOCK_ASSERT(ch); l = rule->cmd_len; cmd = rule->cmd; cmdlen = 0; for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); rw = find_op_rw(cmd, &kidx, &subtype); if (rw == NULL) continue; no = rw->find_bykidx(ch, kidx); KASSERT(no != NULL, ("table id %d not found", kidx)); KASSERT(no->subtype == subtype, ("wrong type %d (%d) for table id %d", no->subtype, subtype, kidx)); KASSERT(no->refcnt > 0, ("refcount for table %d is %d", kidx, no->refcnt)); if (no->refcnt == 1 && rw->destroy_object != NULL) rw->destroy_object(ch, no); else no->refcnt--; } } /* * Find and reference object (if any) stored in instruction @cmd. * * Saves object info in @pidx, sets * - @unresolved to 1 if object should exists but not found * * Returns non-zero value in case of error. */ static int ref_opcode_object(struct ip_fw_chain *ch, ipfw_insn *cmd, struct tid_info *ti, struct obj_idx *pidx, int *unresolved) { struct named_object *no; struct opcode_obj_rewrite *rw; int error; /* Check if this opcode is candidate for rewrite */ rw = find_op_rw(cmd, &ti->uidx, &ti->type); if (rw == NULL) return (0); /* Need to rewrite. Save necessary fields */ pidx->uidx = ti->uidx; pidx->type = ti->type; /* Try to find referenced kernel object */ error = rw->find_byname(ch, ti, &no); if (error != 0) return (error); if (no == NULL) { /* * Report about unresolved object for automaic * creation. */ *unresolved = 1; return (0); } /* Found. Bump refcount and update kidx. */ no->refcnt++; rw->update(cmd, no->kidx); return (0); } /* * Finds and bumps refcount for objects referenced by given @rule. * Auto-creates non-existing tables. * Fills in @oib array with userland/kernel indexes. * * Returns 0 on success. 
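 *
 * The reference/rollback discipline is the usual all-or-nothing pattern;
 * roughly, with take_ref()/drop_ref() standing in for the per-opcode
 * rewriter callbacks:
 *
 *	for (i = 0; i < n; i++) {
 *		error = take_ref(obj[i]);
 *		if (error != 0) {
 *			while (i-- > 0)
 *				drop_ref(obj[i]);	/* undo partial work */
 *			return (error);
 *		}
 *	}
 *	return (0);
 *
 * unref_oib_objects() plays the role of the undo loop, driven by the
 * @oib array instead of a plain index.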
*/ static int ref_rule_objects(struct ip_fw_chain *ch, struct ip_fw *rule, struct rule_check_info *ci, struct obj_idx *oib, struct tid_info *ti) { struct obj_idx *pidx; ipfw_insn *cmd; int cmdlen, error, l, unresolved; pidx = oib; l = rule->cmd_len; cmd = rule->cmd; cmdlen = 0; error = 0; IPFW_UH_WLOCK(ch); /* Increase refcount on each existing referenced table. */ for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); unresolved = 0; error = ref_opcode_object(ch, cmd, ti, pidx, &unresolved); if (error != 0) break; /* * Compatibility stuff for old clients: * prepare to automaitcally create non-existing objects. */ if (unresolved != 0) { pidx->off = rule->cmd_len - l; pidx++; } } if (error != 0) { /* Unref everything we have already done */ unref_oib_objects(ch, rule->cmd, oib, pidx); IPFW_UH_WUNLOCK(ch); return (error); } IPFW_UH_WUNLOCK(ch); /* Perform auto-creation for non-existing objects */ if (pidx != oib) error = create_objects_compat(ch, rule->cmd, oib, pidx, ti); /* Calculate real number of dynamic objects */ ci->object_opcodes = (uint16_t)(pidx - oib); return (error); } /* * Checks is opcode is referencing table of appropriate type. * Adds reference count for found table if true. * Rewrites user-supplied opcode values with kernel ones. * * Returns 0 on success and appropriate error code otherwise. */ static int rewrite_rule_uidx(struct ip_fw_chain *chain, struct rule_check_info *ci) { int error; ipfw_insn *cmd; uint8_t type; struct obj_idx *p, *pidx_first, *pidx_last; struct tid_info ti; /* * Prepare an array for storing opcode indices. * Use stack allocation by default. */ if (ci->object_opcodes <= (sizeof(ci->obuf)/sizeof(ci->obuf[0]))) { /* Stack */ pidx_first = ci->obuf; } else pidx_first = malloc( ci->object_opcodes * sizeof(struct obj_idx), M_IPFW, M_WAITOK | M_ZERO); error = 0; type = 0; memset(&ti, 0, sizeof(ti)); /* Use set rule is assigned to. */ ti.set = ci->krule->set; if (ci->ctlv != NULL) { ti.tlvs = (void *)(ci->ctlv + 1); ti.tlen = ci->ctlv->head.length - sizeof(ipfw_obj_ctlv); } /* Reference all used tables and other objects */ error = ref_rule_objects(chain, ci->krule, ci, pidx_first, &ti); if (error != 0) goto free; /* * Note that ref_rule_objects() might have updated ci->object_opcodes * to reflect actual number of object opcodes. */ /* Perform rewrite of remaining opcodes */ p = pidx_first; pidx_last = pidx_first + ci->object_opcodes; for (p = pidx_first; p < pidx_last; p++) { cmd = ci->krule->cmd + p->off; update_opcode_kidx(cmd, p->kidx); } free: if (pidx_first != ci->obuf) free(pidx_first, M_IPFW); return (error); } /* * Adds one or more rules to ipfw @chain. * Data layout (version 0)(current): * Request: * [ * ip_fw3_opheader * [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional *1) * [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) ip_fw x N ] (*2) (*3) * ] * Reply: * [ * ip_fw3_opheader * [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional) * [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) ip_fw x N ] * ] * * Rules in reply are modified to store their actual ruleset number. * * (*1) TLVs inside IPFW_TLV_TBL_LIST needs to be sorted ascending * according to their idx field and there has to be no duplicates. * (*2) Numbered rules inside IPFW_TLV_RULE_LIST needs to be sorted ascending. * (*3) Each ip_fw structure needs to be aligned to u64 boundary. * * Returns 0 on success. 
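 *
 * A user-space sketch of the smallest request this parser accepts (one
 * rule, no name TLVs); buffer setup only, with the rule body, socket
 * call and the opcode this handler is registered under all elided, and
 * rule_len standing in for the rule size computed by the caller:
 *
 *	size_t rsz = roundup2(rule_len, sizeof(uint64_t));
 *	size_t len = sizeof(ip_fw3_opheader) + sizeof(ipfw_obj_ctlv) + rsz;
 *	char *buf = calloc(1, len);
 *	ip_fw3_opheader *op = (ip_fw3_opheader *)buf;
 *	ipfw_obj_ctlv *ctlv = (ipfw_obj_ctlv *)(op + 1);
 *
 *	ctlv->head.type = IPFW_TLV_RULE_LIST;
 *	ctlv->head.length = sizeof(*ctlv) + rsz;
 *	ctlv->count = 1;
 *	memcpy(ctlv + 1, rule, rule_len);	/* 64-bit aligned slot */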
*/ static int add_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_ctlv *ctlv, *rtlv, *tstate; ipfw_obj_ntlv *ntlv; int clen, error, idx; uint32_t count, read; struct ip_fw_rule *r; struct rule_check_info rci, *ci, *cbuf; int i, rsize; op3 = (ip_fw3_opheader *)ipfw_get_sopt_space(sd, sd->valsize); ctlv = (ipfw_obj_ctlv *)(op3 + 1); read = sizeof(ip_fw3_opheader); rtlv = NULL; tstate = NULL; cbuf = NULL; memset(&rci, 0, sizeof(struct rule_check_info)); if (read + sizeof(*ctlv) > sd->valsize) return (EINVAL); if (ctlv->head.type == IPFW_TLV_TBLNAME_LIST) { clen = ctlv->head.length; /* Check size and alignment */ if (clen > sd->valsize || clen < sizeof(*ctlv)) return (EINVAL); if ((clen % sizeof(uint64_t)) != 0) return (EINVAL); /* * Some table names or other named objects. * Check for validness. */ count = (ctlv->head.length - sizeof(*ctlv)) / sizeof(*ntlv); if (ctlv->count != count || ctlv->objsize != sizeof(*ntlv)) return (EINVAL); /* * Check each TLV. * Ensure TLVs are sorted ascending and * there are no duplicates. */ idx = -1; ntlv = (ipfw_obj_ntlv *)(ctlv + 1); while (count > 0) { if (ntlv->head.length != sizeof(ipfw_obj_ntlv)) return (EINVAL); error = ipfw_check_object_name_generic(ntlv->name); if (error != 0) return (error); if (ntlv->idx <= idx) return (EINVAL); idx = ntlv->idx; count--; ntlv++; } tstate = ctlv; read += ctlv->head.length; ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + ctlv->head.length); } if (read + sizeof(*ctlv) > sd->valsize) return (EINVAL); if (ctlv->head.type == IPFW_TLV_RULE_LIST) { clen = ctlv->head.length; if (clen + read > sd->valsize || clen < sizeof(*ctlv)) return (EINVAL); if ((clen % sizeof(uint64_t)) != 0) return (EINVAL); /* * TODO: Permit adding multiple rules at once */ if (ctlv->count != 1) return (ENOTSUP); clen -= sizeof(*ctlv); if (ctlv->count > clen / sizeof(struct ip_fw_rule)) return (EINVAL); /* Allocate state for each rule or use stack */ if (ctlv->count == 1) { memset(&rci, 0, sizeof(struct rule_check_info)); cbuf = &rci; } else cbuf = malloc(ctlv->count * sizeof(*ci), M_TEMP, M_WAITOK | M_ZERO); ci = cbuf; /* * Check each rule for validness. * Ensure numbered rules are sorted ascending * and properly aligned */ idx = 0; r = (struct ip_fw_rule *)(ctlv + 1); count = 0; error = 0; while (clen > 0) { rsize = roundup2(RULESIZE(r), sizeof(uint64_t)); if (rsize > clen || ctlv->count <= count) { error = EINVAL; break; } ci->ctlv = tstate; error = check_ipfw_rule1(r, rsize, ci); if (error != 0) break; /* Check sorting */ if (r->rulenum != 0 && r->rulenum < idx) { printf("rulenum %d idx %d\n", r->rulenum, idx); error = EINVAL; break; } idx = r->rulenum; ci->urule = (caddr_t)r; rsize = roundup2(rsize, sizeof(uint64_t)); clen -= rsize; r = (struct ip_fw_rule *)((caddr_t)r + rsize); count++; ci++; } if (ctlv->count != count || error != 0) { if (cbuf != &rci) free(cbuf, M_TEMP); return (EINVAL); } rtlv = ctlv; read += ctlv->head.length; ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + ctlv->head.length); } if (read != sd->valsize || rtlv == NULL || rtlv->count == 0) { if (cbuf != NULL && cbuf != &rci) free(cbuf, M_TEMP); return (EINVAL); } /* * Passed rules seems to be valid. * Allocate storage and try to add them to chain. 
*/ for (i = 0, ci = cbuf; i < rtlv->count; i++, ci++) { clen = RULEKSIZE1((struct ip_fw_rule *)ci->urule); ci->krule = ipfw_alloc_rule(chain, clen); import_rule1(ci); } if ((error = commit_rules(chain, cbuf, rtlv->count)) != 0) { /* Free allocate krules */ for (i = 0, ci = cbuf; i < rtlv->count; i++, ci++) free_rule(ci->krule); } if (cbuf != NULL && cbuf != &rci) free(cbuf, M_TEMP); return (error); } /* * Lists all sopts currently registered. * Data layout (v0)(current): * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size * Reply: [ ipfw_obj_lheader ipfw_sopt_info x N ] * * Returns 0 on success */ static int dump_soptcodes(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { struct _ipfw_obj_lheader *olh; ipfw_sopt_info *i; struct ipfw_sopt_handler *sh; uint32_t count, n, size; olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh)); if (olh == NULL) return (EINVAL); if (sd->valsize < olh->size) return (EINVAL); CTL3_LOCK(); count = ctl3_hsize; size = count * sizeof(ipfw_sopt_info) + sizeof(ipfw_obj_lheader); /* Fill in header regadless of buffer size */ olh->count = count; olh->objsize = sizeof(ipfw_sopt_info); if (size > olh->size) { olh->size = size; CTL3_UNLOCK(); return (ENOMEM); } olh->size = size; for (n = 1; n <= count; n++) { i = (ipfw_sopt_info *)ipfw_get_sopt_space(sd, sizeof(*i)); KASSERT(i != NULL, ("previously checked buffer is not enough")); sh = &ctl3_handlers[n]; i->opcode = sh->opcode; i->version = sh->version; i->refcnt = sh->refcnt; } CTL3_UNLOCK(); return (0); } /* * Compares two opcodes. * Used both in qsort() and bsearch(). * * Returns 0 if match is found. */ static int compare_opcodes(const void *_a, const void *_b) { const struct opcode_obj_rewrite *a, *b; a = (const struct opcode_obj_rewrite *)_a; b = (const struct opcode_obj_rewrite *)_b; if (a->opcode < b->opcode) return (-1); else if (a->opcode > b->opcode) return (1); return (0); } /* * XXX: Rewrite bsearch() */ static int find_op_rw_range(uint16_t op, struct opcode_obj_rewrite **plo, struct opcode_obj_rewrite **phi) { struct opcode_obj_rewrite *ctl3_max, *lo, *hi, h, *rw; memset(&h, 0, sizeof(h)); h.opcode = op; rw = (struct opcode_obj_rewrite *)bsearch(&h, ctl3_rewriters, ctl3_rsize, sizeof(h), compare_opcodes); if (rw == NULL) return (1); /* Find the first element matching the same opcode */ lo = rw; for ( ; lo > ctl3_rewriters && (lo - 1)->opcode == op; lo--) ; /* Find the last element matching the same opcode */ hi = rw; ctl3_max = ctl3_rewriters + ctl3_rsize; for ( ; (hi + 1) < ctl3_max && (hi + 1)->opcode == op; hi++) ; *plo = lo; *phi = hi; return (0); } /* * Finds opcode object rewriter based on @code. * * Returns pointer to handler or NULL. 
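 *
 * Several rewriters may share one opcode (distinguished by their
 * classifier), so find_op_rw_range() above widens the bsearch(3) hit to
 * the whole run of equal keys; the idiom in isolation:
 *
 *	hit = bsearch(&key, arr, n, sizeof(arr[0]), cmp);
 *	if (hit == NULL)
 *		return (1);
 *	lo = hi = hit;
 *	while (lo > arr && cmp(&key, lo - 1) == 0)
 *		lo--;
 *	while (hi + 1 < arr + n && cmp(&key, hi + 1) == 0)
 *		hi++;
 *	/* [lo, hi] now spans every entry with this key */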
*/ static struct opcode_obj_rewrite * find_op_rw(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype) { struct opcode_obj_rewrite *rw, *lo, *hi; uint16_t uidx; uint8_t subtype; if (find_op_rw_range(cmd->opcode, &lo, &hi) != 0) return (NULL); for (rw = lo; rw <= hi; rw++) { if (rw->classifier(cmd, &uidx, &subtype) == 0) { if (puidx != NULL) *puidx = uidx; if (ptype != NULL) *ptype = subtype; return (rw); } } return (NULL); } int classify_opcode_kidx(ipfw_insn *cmd, uint16_t *puidx) { - if (find_op_rw(cmd, puidx, NULL) == 0) + if (find_op_rw(cmd, puidx, NULL) == NULL) return (1); return (0); } void update_opcode_kidx(ipfw_insn *cmd, uint16_t idx) { struct opcode_obj_rewrite *rw; rw = find_op_rw(cmd, NULL, NULL); KASSERT(rw != NULL, ("No handler to update opcode %d", cmd->opcode)); rw->update(cmd, idx); } void ipfw_init_obj_rewriter() { ctl3_rewriters = NULL; ctl3_rsize = 0; } void ipfw_destroy_obj_rewriter() { if (ctl3_rewriters != NULL) free(ctl3_rewriters, M_IPFW); ctl3_rewriters = NULL; ctl3_rsize = 0; } /* * Adds one or more opcode object rewrite handlers to the global array. * Function may sleep. */ void ipfw_add_obj_rewriter(struct opcode_obj_rewrite *rw, size_t count) { size_t sz; struct opcode_obj_rewrite *tmp; CTL3_LOCK(); for (;;) { sz = ctl3_rsize + count; CTL3_UNLOCK(); tmp = malloc(sizeof(*rw) * sz, M_IPFW, M_WAITOK | M_ZERO); CTL3_LOCK(); if (ctl3_rsize + count <= sz) break; /* Retry */ free(tmp, M_IPFW); } /* Merge old & new arrays */ sz = ctl3_rsize + count; memcpy(tmp, ctl3_rewriters, ctl3_rsize * sizeof(*rw)); memcpy(&tmp[ctl3_rsize], rw, count * sizeof(*rw)); qsort(tmp, sz, sizeof(*rw), compare_opcodes); /* Switch new and free old */ if (ctl3_rewriters != NULL) free(ctl3_rewriters, M_IPFW); ctl3_rewriters = tmp; ctl3_rsize = sz; CTL3_UNLOCK(); } /* * Removes one or more object rewrite handlers from the global array. */ int ipfw_del_obj_rewriter(struct opcode_obj_rewrite *rw, size_t count) { size_t sz; struct opcode_obj_rewrite *ctl3_max, *ktmp, *lo, *hi; int i; CTL3_LOCK(); for (i = 0; i < count; i++) { if (find_op_rw_range(rw[i].opcode, &lo, &hi) != 0) continue; for (ktmp = lo; ktmp <= hi; ktmp++) { if (ktmp->classifier != rw[i].classifier) continue; ctl3_max = ctl3_rewriters + ctl3_rsize; sz = (ctl3_max - (ktmp + 1)) * sizeof(*ktmp); memmove(ktmp, ktmp + 1, sz); ctl3_rsize--; break; } } if (ctl3_rsize == 0) { if (ctl3_rewriters != NULL) free(ctl3_rewriters, M_IPFW); ctl3_rewriters = NULL; } CTL3_UNLOCK(); return (0); } static int export_objhash_ntlv_internal(struct namedobj_instance *ni, struct named_object *no, void *arg) { struct sockopt_data *sd; ipfw_obj_ntlv *ntlv; sd = (struct sockopt_data *)arg; ntlv = (ipfw_obj_ntlv *)ipfw_get_sopt_space(sd, sizeof(*ntlv)); if (ntlv == NULL) return (ENOMEM); ipfw_export_obj_ntlv(no, ntlv); return (0); } /* * Lists all service objects. 
* Data layout (v0)(current): * Request: [ ipfw_obj_lheader ] size = ipfw_obj_lheader.size * Reply: [ ipfw_obj_lheader [ ipfw_obj_ntlv x N ] (optional) ] * Returns 0 on success */ static int dump_srvobjects(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_lheader *hdr; int count; hdr = (ipfw_obj_lheader *)ipfw_get_sopt_header(sd, sizeof(*hdr)); if (hdr == NULL) return (EINVAL); IPFW_UH_RLOCK(chain); count = ipfw_objhash_count(CHAIN_TO_SRV(chain)); hdr->size = sizeof(ipfw_obj_lheader) + count * sizeof(ipfw_obj_ntlv); if (sd->valsize < hdr->size) { IPFW_UH_RUNLOCK(chain); return (ENOMEM); } hdr->count = count; hdr->objsize = sizeof(ipfw_obj_ntlv); if (count > 0) ipfw_objhash_foreach(CHAIN_TO_SRV(chain), export_objhash_ntlv_internal, sd); IPFW_UH_RUNLOCK(chain); return (0); } /* * Compares two sopt handlers (code, version and handler ptr). * Used both as qsort() and bsearch(). * Does not compare handler for latter case. * * Returns 0 if match is found. */ static int compare_sh(const void *_a, const void *_b) { const struct ipfw_sopt_handler *a, *b; a = (const struct ipfw_sopt_handler *)_a; b = (const struct ipfw_sopt_handler *)_b; if (a->opcode < b->opcode) return (-1); else if (a->opcode > b->opcode) return (1); if (a->version < b->version) return (-1); else if (a->version > b->version) return (1); /* bsearch helper */ if (a->handler == NULL) return (0); if ((uintptr_t)a->handler < (uintptr_t)b->handler) return (-1); else if ((uintptr_t)a->handler > (uintptr_t)b->handler) return (1); return (0); } /* * Finds sopt handler based on @code and @version. * * Returns pointer to handler or NULL. */ static struct ipfw_sopt_handler * find_sh(uint16_t code, uint8_t version, sopt_handler_f *handler) { struct ipfw_sopt_handler *sh, h; memset(&h, 0, sizeof(h)); h.opcode = code; h.version = version; h.handler = handler; sh = (struct ipfw_sopt_handler *)bsearch(&h, ctl3_handlers, ctl3_hsize, sizeof(h), compare_sh); return (sh); } static int find_ref_sh(uint16_t opcode, uint8_t version, struct ipfw_sopt_handler *psh) { struct ipfw_sopt_handler *sh; CTL3_LOCK(); if ((sh = find_sh(opcode, version, NULL)) == NULL) { CTL3_UNLOCK(); printf("ipfw: ipfw_ctl3 invalid option %d""v""%d\n", opcode, version); return (EINVAL); } sh->refcnt++; ctl3_refct++; /* Copy handler data to requested buffer */ *psh = *sh; CTL3_UNLOCK(); return (0); } static void find_unref_sh(struct ipfw_sopt_handler *psh) { struct ipfw_sopt_handler *sh; CTL3_LOCK(); sh = find_sh(psh->opcode, psh->version, NULL); KASSERT(sh != NULL, ("ctl3 handler disappeared")); sh->refcnt--; ctl3_refct--; CTL3_UNLOCK(); } void ipfw_init_sopt_handler() { CTL3_LOCK_INIT(); IPFW_ADD_SOPT_HANDLER(1, scodes); } void ipfw_destroy_sopt_handler() { IPFW_DEL_SOPT_HANDLER(1, scodes); CTL3_LOCK_DESTROY(); } /* * Adds one or more sockopt handlers to the global array. * Function may sleep. 
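 *
 * Because malloc(M_WAITOK) may sleep, the array cannot be grown while
 * CTL3_LOCK is held; the loop below guesses the size, allocates with the
 * lock dropped, then re-checks under the lock and retries if the array
 * grew in the meantime.  Schematically:
 *
 *	for (;;) {
 *		sz = cur + count;
 *		UNLOCK();
 *		tmp = malloc(sz * sizeof(elem), M_IPFW, M_WAITOK | M_ZERO);
 *		LOCK();
 *		if (cur + count <= sz)
 *			break;		/* guess is still big enough */
 *		free(tmp, M_IPFW);	/* raced with another grower; retry */
 *	}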
*/ void ipfw_add_sopt_handler(struct ipfw_sopt_handler *sh, size_t count) { size_t sz; struct ipfw_sopt_handler *tmp; CTL3_LOCK(); for (;;) { sz = ctl3_hsize + count; CTL3_UNLOCK(); tmp = malloc(sizeof(*sh) * sz, M_IPFW, M_WAITOK | M_ZERO); CTL3_LOCK(); if (ctl3_hsize + count <= sz) break; /* Retry */ free(tmp, M_IPFW); } /* Merge old & new arrays */ sz = ctl3_hsize + count; memcpy(tmp, ctl3_handlers, ctl3_hsize * sizeof(*sh)); memcpy(&tmp[ctl3_hsize], sh, count * sizeof(*sh)); qsort(tmp, sz, sizeof(*sh), compare_sh); /* Switch new and free old */ if (ctl3_handlers != NULL) free(ctl3_handlers, M_IPFW); ctl3_handlers = tmp; ctl3_hsize = sz; ctl3_gencnt++; CTL3_UNLOCK(); } /* * Removes one or more sockopt handlers from the global array. */ int ipfw_del_sopt_handler(struct ipfw_sopt_handler *sh, size_t count) { size_t sz; struct ipfw_sopt_handler *tmp, *h; int i; CTL3_LOCK(); for (i = 0; i < count; i++) { tmp = &sh[i]; h = find_sh(tmp->opcode, tmp->version, tmp->handler); if (h == NULL) continue; sz = (ctl3_handlers + ctl3_hsize - (h + 1)) * sizeof(*h); memmove(h, h + 1, sz); ctl3_hsize--; } if (ctl3_hsize == 0) { if (ctl3_handlers != NULL) free(ctl3_handlers, M_IPFW); ctl3_handlers = NULL; } ctl3_gencnt++; CTL3_UNLOCK(); return (0); } /* * Writes data accumulated in @sd to sockopt buffer. * Zeroes internal @sd buffer. */ static int ipfw_flush_sopt_data(struct sockopt_data *sd) { struct sockopt *sopt; int error; size_t sz; sz = sd->koff; if (sz == 0) return (0); sopt = sd->sopt; if (sopt->sopt_dir == SOPT_GET) { error = copyout(sd->kbuf, sopt->sopt_val, sz); if (error != 0) return (error); } memset(sd->kbuf, 0, sd->ksize); sd->ktotal += sz; sd->koff = 0; if (sd->ktotal + sd->ksize < sd->valsize) sd->kavail = sd->ksize; else sd->kavail = sd->valsize - sd->ktotal; /* Update sopt buffer data */ sopt->sopt_valsize = sd->ktotal; sopt->sopt_val = sd->sopt_val + sd->ktotal; return (0); } /* * Ensures that @sd buffer has contiguous @neeeded number of * bytes. * * Returns pointer to requested space or NULL. */ caddr_t ipfw_get_sopt_space(struct sockopt_data *sd, size_t needed) { int error; caddr_t addr; if (sd->kavail < needed) { /* * Flush data and try another time. */ error = ipfw_flush_sopt_data(sd); if (sd->kavail < needed || error != 0) return (NULL); } addr = sd->kbuf + sd->koff; sd->koff += needed; sd->kavail -= needed; return (addr); } /* * Requests @needed contiguous bytes from @sd buffer. * Function is used to notify subsystem that we are * interesed in first @needed bytes (request header) * and the rest buffer can be safely zeroed. * * Returns pointer to requested space or NULL. */ caddr_t ipfw_get_sopt_header(struct sockopt_data *sd, size_t needed) { caddr_t addr; if ((addr = ipfw_get_sopt_space(sd, needed)) == NULL) return (NULL); if (sd->kavail > 0) memset(sd->kbuf + sd->koff, 0, sd->kavail); return (addr); } /* * New sockopt handler. 
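 *
 * GET handlers invoked from here stream their reply through the sliding
 * window set up below: each ipfw_get_sopt_space() call hands out
 * contiguous bytes and, once the window is full, ipfw_flush_sopt_data()
 * copies the staged data to userland and rewinds it.  A typical handler
 * is roughly (fill_one() is a placeholder):
 *
 *	hdr = ipfw_get_sopt_header(sd, sizeof(*hdr));
 *	if (hdr == NULL)
 *		return (EINVAL);
 *	for (each object) {
 *		dst = ipfw_get_sopt_space(sd, objlen);
 *		if (dst == NULL)
 *			return (ENOMEM);	/* reply does not fit */
 *		fill_one(dst, object);
 *	}
 *	return (0);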
*/ int ipfw_ctl3(struct sockopt *sopt) { int error, locked; size_t size, valsize; struct ip_fw_chain *chain; char xbuf[256]; struct sockopt_data sdata; struct ipfw_sopt_handler h; ip_fw3_opheader *op3 = NULL; error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW); if (error != 0) return (error); if (sopt->sopt_name != IP_FW3) return (ipfw_ctl(sopt)); chain = &V_layer3_chain; error = 0; /* Save original valsize before it is altered via sooptcopyin() */ valsize = sopt->sopt_valsize; memset(&sdata, 0, sizeof(sdata)); /* Read op3 header first to determine actual operation */ op3 = (ip_fw3_opheader *)xbuf; error = sooptcopyin(sopt, op3, sizeof(*op3), sizeof(*op3)); if (error != 0) return (error); sopt->sopt_valsize = valsize; /* * Find and reference command. */ error = find_ref_sh(op3->opcode, op3->version, &h); if (error != 0) return (error); /* * Disallow modifications in really-really secure mode, but still allow * the logging counters to be reset. */ if ((h.dir & HDIR_SET) != 0 && h.opcode != IP_FW_XRESETLOG) { error = securelevel_ge(sopt->sopt_td->td_ucred, 3); if (error != 0) { find_unref_sh(&h); return (error); } } /* * Fill in sockopt_data structure that may be useful for * IP_FW3 get requests. */ locked = 0; if (valsize <= sizeof(xbuf)) { /* use on-stack buffer */ sdata.kbuf = xbuf; sdata.ksize = sizeof(xbuf); sdata.kavail = valsize; } else { /* * Determine opcode type/buffer size: * allocate sliding-window buf for data export or * contiguous buffer for special ops. */ if ((h.dir & HDIR_SET) != 0) { /* Set request. Allocate contiguous buffer. */ if (valsize > CTL3_LARGEBUF) { find_unref_sh(&h); return (EFBIG); } size = valsize; } else { /* Get request. Allocate sliding window buffer */ size = (valsize < CTL3_SMALLBUF) ? valsize : CTL3_SMALLBUF; if (size < valsize) { /* We have to wire user buffer */ error = vslock(sopt->sopt_val, valsize); if (error != 0) return (error); locked = 1; } } sdata.kbuf = malloc(size, M_TEMP, M_WAITOK | M_ZERO); sdata.ksize = size; sdata.kavail = size; } sdata.sopt = sopt; sdata.sopt_val = sopt->sopt_val; sdata.valsize = valsize; /* * Copy either all request (if valsize < bsize_max) * or first bsize_max bytes to guarantee most consumers * that all necessary data has been copied). * Anyway, copy not less than sizeof(ip_fw3_opheader). */ if ((error = sooptcopyin(sopt, sdata.kbuf, sdata.ksize, sizeof(ip_fw3_opheader))) != 0) return (error); op3 = (ip_fw3_opheader *)sdata.kbuf; /* Finally, run handler */ error = h.handler(chain, op3, &sdata); find_unref_sh(&h); /* Flush state and free buffers */ if (error == 0) error = ipfw_flush_sopt_data(&sdata); else ipfw_flush_sopt_data(&sdata); if (locked != 0) vsunlock(sdata.sopt_val, valsize); /* Restore original pointer and set number of bytes written */ sopt->sopt_val = sdata.sopt_val; sopt->sopt_valsize = sdata.ktotal; if (sdata.kbuf != xbuf) free(sdata.kbuf, M_TEMP); return (error); } /** * {set|get}sockopt parser. */ int ipfw_ctl(struct sockopt *sopt) { #define RULE_MAXSIZE (512*sizeof(u_int32_t)) int error; size_t size, valsize; struct ip_fw *buf; struct ip_fw_rule0 *rule; struct ip_fw_chain *chain; u_int32_t rulenum[2]; uint32_t opt; struct rule_check_info ci; IPFW_RLOCK_TRACKER; chain = &V_layer3_chain; error = 0; /* Save original valsize before it is altered via sooptcopyin() */ valsize = sopt->sopt_valsize; opt = sopt->sopt_name; /* * Disallow modifications in really-really secure mode, but still allow * the logging counters to be reset.
*/ if (opt == IP_FW_ADD || (sopt->sopt_dir == SOPT_SET && opt != IP_FW_RESETLOG)) { error = securelevel_ge(sopt->sopt_td->td_ucred, 3); if (error != 0) return (error); } switch (opt) { case IP_FW_GET: /* * pass up a copy of the current rules. Static rules * come first (the last of which has number IPFW_DEFAULT_RULE), * followed by a possibly empty list of dynamic rule. * The last dynamic rule has NULL in the "next" field. * * Note that the calculated size is used to bound the * amount of data returned to the user. The rule set may * change between calculating the size and returning the * data in which case we'll just return what fits. */ for (;;) { int len = 0, want; size = chain->static_len; size += ipfw_dyn_len(); if (size >= sopt->sopt_valsize) break; buf = malloc(size, M_TEMP, M_WAITOK | M_ZERO); IPFW_UH_RLOCK(chain); /* check again how much space we need */ want = chain->static_len + ipfw_dyn_len(); if (size >= want) len = ipfw_getrules(chain, buf, size); IPFW_UH_RUNLOCK(chain); if (size >= want) error = sooptcopyout(sopt, buf, len); free(buf, M_TEMP); if (size >= want) break; } break; case IP_FW_FLUSH: /* locking is done within del_entry() */ error = del_entry(chain, 0); /* special case, rule=0, cmd=0 means all */ break; case IP_FW_ADD: rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK); error = sooptcopyin(sopt, rule, RULE_MAXSIZE, sizeof(struct ip_fw7) ); memset(&ci, 0, sizeof(struct rule_check_info)); /* * If the size of commands equals RULESIZE7 then we assume * a FreeBSD7.2 binary is talking to us (set is7=1). * is7 is persistent so the next 'ipfw list' command * will use this format. * NOTE: If wrong version is guessed (this can happen if * the first ipfw command is 'ipfw [pipe] list') * the ipfw binary may crash or loop infinitly... */ size = sopt->sopt_valsize; if (size == RULESIZE7(rule)) { is7 = 1; error = convert_rule_to_8(rule); if (error) { free(rule, M_TEMP); return error; } size = RULESIZE(rule); } else is7 = 0; if (error == 0) error = check_ipfw_rule0(rule, size, &ci); if (error == 0) { /* locking is done within add_rule() */ struct ip_fw *krule; krule = ipfw_alloc_rule(chain, RULEKSIZE0(rule)); ci.urule = (caddr_t)rule; ci.krule = krule; import_rule0(&ci); error = commit_rules(chain, &ci, 1); if (error != 0) free_rule(ci.krule); else if (sopt->sopt_dir == SOPT_GET) { if (is7) { error = convert_rule_to_7(rule); size = RULESIZE7(rule); if (error) { free(rule, M_TEMP); return error; } } error = sooptcopyout(sopt, rule, size); } } free(rule, M_TEMP); break; case IP_FW_DEL: /* * IP_FW_DEL is used for deleting single rules or sets, * and (ab)used to atomically manipulate sets. Argument size * is used to distinguish between the two: * sizeof(u_int32_t) * delete single rule or set of rules, * or reassign rules (or sets) to a different set. * 2*sizeof(u_int32_t) * atomic disable/enable sets. * first u_int32_t contains sets to be disabled, * second u_int32_t contains sets to be enabled. 
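 *
 * From userland the two forms differ only in optlen; e.g. atomically
 * disabling set 2 and enabling set 1 (a sketch, with raw socket setup
 * and error handling omitted):
 *
 *	u_int32_t masks[2];
 *
 *	masks[0] = 1u << 2;	/* sets to disable */
 *	masks[1] = 1u << 1;	/* sets to enable */
 *	setsockopt(s, IPPROTO_IP, IP_FW_DEL, masks, sizeof(masks));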
*/ error = sooptcopyin(sopt, rulenum, 2*sizeof(u_int32_t), sizeof(u_int32_t)); if (error) break; size = sopt->sopt_valsize; if (size == sizeof(u_int32_t) && rulenum[0] != 0) { /* delete or reassign, locking done in del_entry() */ error = del_entry(chain, rulenum[0]); } else if (size == 2*sizeof(u_int32_t)) { /* set enable/disable */ IPFW_UH_WLOCK(chain); V_set_disable = (V_set_disable | rulenum[0]) & ~rulenum[1] & ~(1<<RESVD_SET); /* set RESVD_SET always enabled */ IPFW_UH_WUNLOCK(chain); } else error = EINVAL; break; case IP_FW_ZERO: case IP_FW_RESETLOG: /* argument is an u_int_32, the rule number */ rulenum[0] = 0; if (sopt->sopt_val != 0) { error = sooptcopyin(sopt, rulenum, sizeof(u_int32_t), sizeof(u_int32_t)); if (error) break; } error = zero_entry(chain, rulenum[0], sopt->sopt_name == IP_FW_RESETLOG); break; /*--- TABLE opcodes ---*/ case IP_FW_TABLE_ADD: case IP_FW_TABLE_DEL: { ipfw_table_entry ent; struct tentry_info tei; struct tid_info ti; struct table_value v; error = sooptcopyin(sopt, &ent, sizeof(ent), sizeof(ent)); if (error) break; memset(&tei, 0, sizeof(tei)); tei.paddr = &ent.addr; tei.subtype = AF_INET; tei.masklen = ent.masklen; ipfw_import_table_value_legacy(ent.value, &v); tei.pvalue = &v; memset(&ti, 0, sizeof(ti)); ti.uidx = ent.tbl; ti.type = IPFW_TABLE_CIDR; error = (opt == IP_FW_TABLE_ADD) ? add_table_entry(chain, &ti, &tei, 0, 1) : del_table_entry(chain, &ti, &tei, 0, 1); } break; case IP_FW_TABLE_FLUSH: { u_int16_t tbl; struct tid_info ti; error = sooptcopyin(sopt, &tbl, sizeof(tbl), sizeof(tbl)); if (error) break; memset(&ti, 0, sizeof(ti)); ti.uidx = tbl; error = flush_table(chain, &ti); } break; case IP_FW_TABLE_GETSIZE: { u_int32_t tbl, cnt; struct tid_info ti; if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl), sizeof(tbl)))) break; memset(&ti, 0, sizeof(ti)); ti.uidx = tbl; IPFW_RLOCK(chain); error = ipfw_count_table(chain, &ti, &cnt); IPFW_RUNLOCK(chain); if (error) break; error = sooptcopyout(sopt, &cnt, sizeof(cnt)); } break; case IP_FW_TABLE_LIST: { ipfw_table *tbl; struct tid_info ti; if (sopt->sopt_valsize < sizeof(*tbl)) { error = EINVAL; break; } size = sopt->sopt_valsize; tbl = malloc(size, M_TEMP, M_WAITOK); error = sooptcopyin(sopt, tbl, size, sizeof(*tbl)); if (error) { free(tbl, M_TEMP); break; } tbl->size = (size - sizeof(*tbl)) / sizeof(ipfw_table_entry); memset(&ti, 0, sizeof(ti)); ti.uidx = tbl->tbl; IPFW_RLOCK(chain); error = ipfw_dump_table_legacy(chain, &ti, tbl); IPFW_RUNLOCK(chain); if (error) { free(tbl, M_TEMP); break; } error = sooptcopyout(sopt, tbl, size); free(tbl, M_TEMP); } break; /*--- NAT operations are protected by the IPFW_LOCK ---*/ case IP_FW_NAT_CFG: if (IPFW_NAT_LOADED) error = ipfw_nat_cfg_ptr(sopt); else { printf("IP_FW_NAT_CFG: %s\n", "ipfw_nat not present, please load it"); error = EINVAL; } break; case IP_FW_NAT_DEL: if (IPFW_NAT_LOADED) error = ipfw_nat_del_ptr(sopt); else { printf("IP_FW_NAT_DEL: %s\n", "ipfw_nat not present, please load it"); error = EINVAL; } break; case IP_FW_NAT_GET_CONFIG: if (IPFW_NAT_LOADED) error = ipfw_nat_get_cfg_ptr(sopt); else { printf("IP_FW_NAT_GET_CFG: %s\n", "ipfw_nat not present, please load it"); error = EINVAL; } break; case IP_FW_NAT_GET_LOG: if (IPFW_NAT_LOADED) error = ipfw_nat_get_log_ptr(sopt); else { printf("IP_FW_NAT_GET_LOG: %s\n", "ipfw_nat not present, please load it"); error = EINVAL; } break; default: printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name); error = EINVAL; } return (error); #undef RULE_MAXSIZE } #define RULE_MAXSIZE (256*sizeof(u_int32_t)) /* Functions to convert rules 7.2 <==> 8.0 */ static int convert_rule_to_7(struct ip_fw_rule0 *rule) { /* Used to modify original rule */ struct ip_fw7 *rule7 = (struct ip_fw7 *)rule; /* copy of original rule,
version 8 */ struct ip_fw_rule0 *tmp; /* Used to copy commands */ ipfw_insn *ccmd, *dst; int ll = 0, ccmdlen = 0; tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO); if (tmp == NULL) { return 1; //XXX error } bcopy(rule, tmp, RULE_MAXSIZE); /* Copy fields */ //rule7->_pad = tmp->_pad; rule7->set = tmp->set; rule7->rulenum = tmp->rulenum; rule7->cmd_len = tmp->cmd_len; rule7->act_ofs = tmp->act_ofs; rule7->next_rule = (struct ip_fw7 *)tmp->next_rule; rule7->cmd_len = tmp->cmd_len; rule7->pcnt = tmp->pcnt; rule7->bcnt = tmp->bcnt; rule7->timestamp = tmp->timestamp; /* Copy commands */ for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule7->cmd ; ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) { ccmdlen = F_LEN(ccmd); bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t)); if (dst->opcode > O_NAT) /* O_REASS doesn't exists in 7.2 version, so * decrement opcode if it is after O_REASS */ dst->opcode--; if (ccmdlen > ll) { printf("ipfw: opcode %d size truncated\n", ccmd->opcode); return EINVAL; } } free(tmp, M_TEMP); return 0; } static int convert_rule_to_8(struct ip_fw_rule0 *rule) { /* Used to modify original rule */ struct ip_fw7 *rule7 = (struct ip_fw7 *) rule; /* Used to copy commands */ ipfw_insn *ccmd, *dst; int ll = 0, ccmdlen = 0; /* Copy of original rule */ struct ip_fw7 *tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO); if (tmp == NULL) { return 1; //XXX error } bcopy(rule7, tmp, RULE_MAXSIZE); for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule->cmd ; ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) { ccmdlen = F_LEN(ccmd); bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t)); if (dst->opcode > O_NAT) /* O_REASS doesn't exists in 7.2 version, so * increment opcode if it is after O_REASS */ dst->opcode++; if (ccmdlen > ll) { printf("ipfw: opcode %d size truncated\n", ccmd->opcode); return EINVAL; } } rule->_pad = tmp->_pad; rule->set = tmp->set; rule->rulenum = tmp->rulenum; rule->cmd_len = tmp->cmd_len; rule->act_ofs = tmp->act_ofs; rule->next_rule = (struct ip_fw *)tmp->next_rule; rule->cmd_len = tmp->cmd_len; rule->id = 0; /* XXX see if is ok = 0 */ rule->pcnt = tmp->pcnt; rule->bcnt = tmp->bcnt; rule->timestamp = tmp->timestamp; free (tmp, M_TEMP); return 0; } /* * Named object api * */ void ipfw_init_srv(struct ip_fw_chain *ch) { ch->srvmap = ipfw_objhash_create(IPFW_OBJECTS_DEFAULT); ch->srvstate = malloc(sizeof(void *) * IPFW_OBJECTS_DEFAULT, M_IPFW, M_WAITOK | M_ZERO); } void ipfw_destroy_srv(struct ip_fw_chain *ch) { free(ch->srvstate, M_IPFW); ipfw_objhash_destroy(ch->srvmap); } /* * Allocate new bitmask which can be used to enlarge/shrink * named instance index. */ void ipfw_objhash_bitmap_alloc(uint32_t items, void **idx, int *pblocks) { size_t size; int max_blocks; u_long *idx_mask; KASSERT((items % BLOCK_ITEMS) == 0, ("bitmask size needs to power of 2 and greater or equal to %zu", BLOCK_ITEMS)); max_blocks = items / BLOCK_ITEMS; size = items / 8; idx_mask = malloc(size * IPFW_MAX_SETS, M_IPFW, M_WAITOK); /* Mark all as free */ memset(idx_mask, 0xFF, size * IPFW_MAX_SETS); *idx_mask &= ~(u_long)1; /* Skip index 0 */ *idx = idx_mask; *pblocks = max_blocks; } /* * Copy current bitmask index to new one. 
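 *
 * The masks copied here follow the convention set up by
 * ipfw_objhash_bitmap_alloc() above: a set bit means "index free",
 * hence the 0xFF fill and the cleared bit 0 (index 0 is reserved).
 * A stand-alone model of the test (bitmap_idx_is_free() is a made-up
 * name):
 *
 *	int
 *	bitmap_idx_is_free(const u_long *mask, int idx)
 *	{
 *		int block = idx / (sizeof(u_long) * 8);
 *		int bit = idx % (sizeof(u_long) * 8);
 *
 *		return ((mask[block] & ((u_long)1 << bit)) != 0);
 *	}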
*/ void ipfw_objhash_bitmap_merge(struct namedobj_instance *ni, void **idx, int *blocks) { int old_blocks, new_blocks; u_long *old_idx, *new_idx; int i; old_idx = ni->idx_mask; old_blocks = ni->max_blocks; new_idx = *idx; new_blocks = *blocks; for (i = 0; i < IPFW_MAX_SETS; i++) { memcpy(&new_idx[new_blocks * i], &old_idx[old_blocks * i], old_blocks * sizeof(u_long)); } } /* * Swaps current @ni index with new one. */ void ipfw_objhash_bitmap_swap(struct namedobj_instance *ni, void **idx, int *blocks) { int old_blocks; u_long *old_idx; old_idx = ni->idx_mask; old_blocks = ni->max_blocks; ni->idx_mask = *idx; ni->max_blocks = *blocks; /* Save old values */ *idx = old_idx; *blocks = old_blocks; } void ipfw_objhash_bitmap_free(void *idx, int blocks) { free(idx, M_IPFW); } /* * Creates named hash instance. * Must be called without holding any locks. * Return pointer to new instance. */ struct namedobj_instance * ipfw_objhash_create(uint32_t items) { struct namedobj_instance *ni; int i; size_t size; size = sizeof(struct namedobj_instance) + sizeof(struct namedobjects_head) * NAMEDOBJ_HASH_SIZE + sizeof(struct namedobjects_head) * NAMEDOBJ_HASH_SIZE; ni = malloc(size, M_IPFW, M_WAITOK | M_ZERO); ni->nn_size = NAMEDOBJ_HASH_SIZE; ni->nv_size = NAMEDOBJ_HASH_SIZE; ni->names = (struct namedobjects_head *)(ni +1); ni->values = &ni->names[ni->nn_size]; for (i = 0; i < ni->nn_size; i++) TAILQ_INIT(&ni->names[i]); for (i = 0; i < ni->nv_size; i++) TAILQ_INIT(&ni->values[i]); /* Set default hashing/comparison functions */ ni->hash_f = objhash_hash_name; ni->cmp_f = objhash_cmp_name; /* Allocate bitmask separately due to possible resize */ ipfw_objhash_bitmap_alloc(items, (void*)&ni->idx_mask, &ni->max_blocks); return (ni); } void ipfw_objhash_destroy(struct namedobj_instance *ni) { free(ni->idx_mask, M_IPFW); free(ni, M_IPFW); } void ipfw_objhash_set_funcs(struct namedobj_instance *ni, objhash_hash_f *hash_f, objhash_cmp_f *cmp_f) { ni->hash_f = hash_f; ni->cmp_f = cmp_f; } static uint32_t objhash_hash_name(struct namedobj_instance *ni, const void *name, uint32_t set) { return (fnv_32_str((const char *)name, FNV1_32_INIT)); } static int objhash_cmp_name(struct named_object *no, const void *name, uint32_t set) { if ((strcmp(no->name, (const char *)name) == 0) && (no->set == set)) return (0); return (1); } static uint32_t objhash_hash_idx(struct namedobj_instance *ni, uint32_t val) { uint32_t v; v = val % (ni->nv_size - 1); return (v); } struct named_object * ipfw_objhash_lookup_name(struct namedobj_instance *ni, uint32_t set, char *name) { struct named_object *no; uint32_t hash; hash = ni->hash_f(ni, name, set) % ni->nn_size; TAILQ_FOREACH(no, &ni->names[hash], nn_next) { if (ni->cmp_f(no, name, set) == 0) return (no); } return (NULL); } /* * Find named object by @uid. * Check @tlvs for valid data inside. * * Returns pointer to found TLV or NULL. */ ipfw_obj_ntlv * ipfw_find_name_tlv_type(void *tlvs, int len, uint16_t uidx, uint32_t etlv) { ipfw_obj_ntlv *ntlv; uintptr_t pa, pe; int l; pa = (uintptr_t)tlvs; pe = pa + len; l = 0; for (; pa < pe; pa += l) { ntlv = (ipfw_obj_ntlv *)pa; l = ntlv->head.length; if (l != sizeof(*ntlv)) return (NULL); if (ntlv->idx != uidx) continue; /* * When userland has specified zero TLV type, do * not compare it with eltv. In some cases userland * doesn't know what type should it have. Use only * uidx and name for search named_object. 
*/ if (ntlv->head.type != 0 && ntlv->head.type != (uint16_t)etlv) continue; if (ipfw_check_object_name_generic(ntlv->name) != 0) return (NULL); return (ntlv); } return (NULL); } /* * Finds object config based on either legacy index * or name in ntlv. * Note @ti structure contains unchecked data from userland. * * Returns 0 in success and fills in @pno with found config */ int ipfw_objhash_find_type(struct namedobj_instance *ni, struct tid_info *ti, uint32_t etlv, struct named_object **pno) { char *name; ipfw_obj_ntlv *ntlv; uint32_t set; if (ti->tlvs == NULL) return (EINVAL); ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx, etlv); if (ntlv == NULL) return (EINVAL); name = ntlv->name; /* * Use set provided by @ti instead of @ntlv one. * This is needed due to different sets behavior * controlled by V_fw_tables_sets. */ set = ti->set; *pno = ipfw_objhash_lookup_name(ni, set, name); if (*pno == NULL) return (ESRCH); return (0); } /* * Find named object by name, considering also its TLV type. */ struct named_object * ipfw_objhash_lookup_name_type(struct namedobj_instance *ni, uint32_t set, uint32_t type, const char *name) { struct named_object *no; uint32_t hash; hash = ni->hash_f(ni, name, set) % ni->nn_size; TAILQ_FOREACH(no, &ni->names[hash], nn_next) { if (ni->cmp_f(no, name, set) == 0 && no->etlv == (uint16_t)type) return (no); } return (NULL); } struct named_object * ipfw_objhash_lookup_kidx(struct namedobj_instance *ni, uint16_t kidx) { struct named_object *no; uint32_t hash; hash = objhash_hash_idx(ni, kidx); TAILQ_FOREACH(no, &ni->values[hash], nv_next) { if (no->kidx == kidx) return (no); } return (NULL); } int ipfw_objhash_same_name(struct namedobj_instance *ni, struct named_object *a, struct named_object *b) { if ((strcmp(a->name, b->name) == 0) && a->set == b->set) return (1); return (0); } void ipfw_objhash_add(struct namedobj_instance *ni, struct named_object *no) { uint32_t hash; hash = ni->hash_f(ni, no->name, no->set) % ni->nn_size; TAILQ_INSERT_HEAD(&ni->names[hash], no, nn_next); hash = objhash_hash_idx(ni, no->kidx); TAILQ_INSERT_HEAD(&ni->values[hash], no, nv_next); ni->count++; } void ipfw_objhash_del(struct namedobj_instance *ni, struct named_object *no) { uint32_t hash; hash = ni->hash_f(ni, no->name, no->set) % ni->nn_size; TAILQ_REMOVE(&ni->names[hash], no, nn_next); hash = objhash_hash_idx(ni, no->kidx); TAILQ_REMOVE(&ni->values[hash], no, nv_next); ni->count--; } uint32_t ipfw_objhash_count(struct namedobj_instance *ni) { return (ni->count); } uint32_t ipfw_objhash_count_type(struct namedobj_instance *ni, uint16_t type) { struct named_object *no; uint32_t count; int i; count = 0; for (i = 0; i < ni->nn_size; i++) { TAILQ_FOREACH(no, &ni->names[i], nn_next) { if (no->etlv == type) count++; } } return (count); } /* * Runs @func for each found named object. * It is safe to delete objects from callback */ int ipfw_objhash_foreach(struct namedobj_instance *ni, objhash_cb_t *f, void *arg) { struct named_object *no, *no_tmp; int i, ret; for (i = 0; i < ni->nn_size; i++) { TAILQ_FOREACH_SAFE(no, &ni->names[i], nn_next, no_tmp) { ret = f(ni, no, arg); if (ret != 0) return (ret); } } return (0); } /* * Runs @f for each found named object with type @type. 
* It is safe to delete objects from callback */ int ipfw_objhash_foreach_type(struct namedobj_instance *ni, objhash_cb_t *f, void *arg, uint16_t type) { struct named_object *no, *no_tmp; int i, ret; for (i = 0; i < ni->nn_size; i++) { TAILQ_FOREACH_SAFE(no, &ni->names[i], nn_next, no_tmp) { if (no->etlv != type) continue; ret = f(ni, no, arg); if (ret != 0) return (ret); } } return (0); } /* * Removes index from given set. * Returns 0 on success. */ int ipfw_objhash_free_idx(struct namedobj_instance *ni, uint16_t idx) { u_long *mask; int i, v; i = idx / BLOCK_ITEMS; v = idx % BLOCK_ITEMS; if (i >= ni->max_blocks) return (1); mask = &ni->idx_mask[i]; if ((*mask & ((u_long)1 << v)) != 0) return (1); /* Mark as free */ *mask |= (u_long)1 << v; /* Update free offset */ if (ni->free_off[0] > i) ni->free_off[0] = i; return (0); } /* * Allocate new index in given instance and stores in in @pidx. * Returns 0 on success. */ int ipfw_objhash_alloc_idx(void *n, uint16_t *pidx) { struct namedobj_instance *ni; u_long *mask; int i, off, v; ni = (struct namedobj_instance *)n; off = ni->free_off[0]; mask = &ni->idx_mask[off]; for (i = off; i < ni->max_blocks; i++, mask++) { if ((v = ffsl(*mask)) == 0) continue; /* Mark as busy */ *mask &= ~ ((u_long)1 << (v - 1)); ni->free_off[0] = i; v = BLOCK_ITEMS * i + v - 1; *pidx = v; return (0); } return (1); } /* end of file */
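/*
 * For reference, a stand-alone model of the ffsl()-based allocator in
 * ipfw_objhash_alloc_idx() above, compilable in user space under the
 * same "set bit == free index" convention (bitmap_alloc_idx() is a
 * made-up name, not part of the kernel API, and the free_off hint is
 * omitted):
 *
 *	#include <strings.h>
 *	#include <stdint.h>
 *
 *	int
 *	bitmap_alloc_idx(unsigned long *mask, int nblocks, uint16_t *pidx)
 *	{
 *		int i, v;
 *
 *		for (i = 0; i < nblocks; i++) {
 *			v = ffsl((long)mask[i]);
 *			if (v == 0)
 *				continue;	/* block exhausted */
 *			mask[i] &= ~((unsigned long)1 << (v - 1));
 *			*pidx = i * sizeof(unsigned long) * 8 + v - 1;
 *			return (0);
 *		}
 *		return (1);		/* no free index left */
 *	}
 */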