Index: projects/clang900-import/Makefile.inc1 =================================================================== --- projects/clang900-import/Makefile.inc1 (revision 352586) +++ projects/clang900-import/Makefile.inc1 (revision 352587) @@ -1,3400 +1,3401 @@ # # $FreeBSD$ # # Make command line options: # -DNO_CLEANDIR run ${MAKE} clean, instead of ${MAKE} cleandir # -DNO_CLEAN do not clean at all # -DDB_FROM_SRC use the user/group databases in src/etc instead of # the system database when installing. # -DNO_SHARE do not go into share subdir # -DKERNFAST define NO_KERNEL{CONFIG,CLEAN,OBJ} # -DNO_KERNELCONFIG do not run config in ${MAKE} buildkernel # -DNO_KERNELCLEAN do not run ${MAKE} clean in ${MAKE} buildkernel # -DNO_KERNELOBJ do not run ${MAKE} obj in ${MAKE} buildkernel # -DNO_PORTSUPDATE do not update ports in ${MAKE} update # -DNO_ROOT install without using root privilege # -DNO_DOCUPDATE do not update doc in ${MAKE} update # -DWITHOUT_CTF do not run the DTrace CTF conversion tools on built objects # LOCAL_DIRS="list of dirs" to add additional dirs to the SUBDIR list # LOCAL_ITOOLS="list of tools" to add additional tools to the ITOOLS list # LOCAL_LIB_DIRS="list of dirs" to add additional dirs to libraries target # LOCAL_MTREE="list of mtree files" to process to allow local directories # to be created before files are installed # LOCAL_TOOL_DIRS="list of dirs" to add additional dirs to the build-tools # list # LOCAL_XTOOL_DIRS="list of dirs" to add additional dirs to the # cross-tools target # METALOG="path to metadata log" to write permission and ownership # when NO_ROOT is set. (default: ${DESTDIR}/METALOG) # TARGET="machine" to crossbuild world for a different machine type # TARGET_ARCH= may be required when a TARGET supports multiple endians # BUILDENV_SHELL= shell to launch for the buildenv target (def:${SHELL}) # WORLD_FLAGS= additional flags to pass to make(1) during buildworld # KERNEL_FLAGS= additional flags to pass to make(1) during buildkernel # SUBDIR_OVERRIDE="list of dirs" to build rather than everything. # All libraries and includes, and some build tools will still build. # # The intended user-driven targets are: # buildworld - rebuild *everything*, including glue to help do upgrades # installworld- install everything built by "buildworld" # checkworld - run test suite on installed world # doxygen - build API documentation of the kernel # update - convenient way to update your source tree (eg: svn/svnup) # # Standard targets (not defined here) are documented in the makefiles in # /usr/share/mk. These include: # obj depend all install clean cleandepend cleanobj .if !defined(TARGET) || !defined(TARGET_ARCH) .error "Both TARGET and TARGET_ARCH must be defined." .endif .if make(showconfig) || make(test-system-*) _MKSHOWCONFIG= t .endif SRCDIR?= ${.CURDIR} LOCALBASE?= /usr/local # Cross toolchain changes must be in effect before bsd.compiler.mk # so that gets the right CC, and pass CROSS_TOOLCHAIN to submakes. 
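#
# Illustrative sample invocations (the option and target names above are the
# authoritative ones; the architectures and the toolchain name below are only
# examples):
#
#	make buildworld buildkernel                        # native build
#	make TARGET=arm64 TARGET_ARCH=aarch64 buildworld   # cross-build world
#	make CROSS_TOOLCHAIN=foo buildworld                # include ${LOCALBASE}/share/toolchains/foo.mk
#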
.if defined(CROSS_TOOLCHAIN) .if exists(${LOCALBASE}/share/toolchains/${CROSS_TOOLCHAIN}.mk) .include "${LOCALBASE}/share/toolchains/${CROSS_TOOLCHAIN}.mk" .elif exists(${CROSS_TOOLCHAIN}) .include "${CROSS_TOOLCHAIN}" .else .error CROSS_TOOLCHAIN ${CROSS_TOOLCHAIN} not found .endif CROSSENV+=CROSS_TOOLCHAIN="${CROSS_TOOLCHAIN}" .endif .if defined(CROSS_TOOLCHAIN_PREFIX) CROSS_COMPILER_PREFIX?=${CROSS_TOOLCHAIN_PREFIX} .endif XCOMPILERS= CC CXX CPP .for COMPILER in ${XCOMPILERS} .if defined(CROSS_COMPILER_PREFIX) X${COMPILER}?= ${CROSS_COMPILER_PREFIX}${${COMPILER}} .else X${COMPILER}?= ${${COMPILER}} .endif .endfor # If a full path to an external cross compiler is given, don't build # a cross compiler. .if ${XCC:N${CCACHE_BIN}:M/*} MK_CLANG_BOOTSTRAP= no MK_GCC_BOOTSTRAP= no .endif # Pull in compiler metadata from buildworld/toolchain if possible to avoid # running CC from bsd.compiler.mk. .if make(installworld) || make(install) || make(distributeworld) || \ make(stageworld) .-include "${OBJTOP}/toolchain-metadata.mk" .if !defined(_LOADED_TOOLCHAIN_METADATA) .error A build is required first. You may have the wrong MAKEOBJDIRPREFIX set. .endif .endif # Pull in COMPILER_TYPE and COMPILER_FREEBSD_VERSION early. Pull it from the # tree to be friendlier to foreign OS builds. It's safe to do so unconditionally # here since we will always have the right make, unlike in src/Makefile # Don't include bsd.linker.mk yet until XBINUTILS is handled (after src.opts.mk) _NO_INCLUDE_LINKERMK= t # We also want the X_COMPILER* variables if we are using an external toolchain. _WANT_TOOLCHAIN_CROSS_VARS= t .include "share/mk/bsd.compiler.mk" .undef _NO_INCLUDE_LINKERMK .undef _WANT_TOOLCHAIN_CROSS_VARS # src.opts.mk depends on COMPILER_FEATURES .include "share/mk/src.opts.mk" .if ${TARGET} == ${MACHINE} TARGET_CPUTYPE?=${CPUTYPE} .else TARGET_CPUTYPE?= .endif .if !empty(TARGET_CPUTYPE) _TARGET_CPUTYPE=${TARGET_CPUTYPE} .else _TARGET_CPUTYPE=dummy .endif .if ${TARGET} == "arm" .if ${TARGET_ARCH:Marmv[67]*} != "" && ${TARGET_CPUTYPE:M*soft*} == "" TARGET_ABI= gnueabihf .else TARGET_ABI= gnueabi .endif .endif MACHINE_ABI?= unknown MACHINE_TRIPLE?=${MACHINE_ARCH:S/amd64/x86_64/:C/hf$//:S/mipsn32/mips64/}-${MACHINE_ABI}-freebsd13.0 TARGET_ABI?= unknown TARGET_TRIPLE?= ${TARGET_ARCH:S/amd64/x86_64/:C/hf$//:S/mipsn32/mips64/}-${TARGET_ABI}-freebsd13.0 KNOWN_ARCHES?= aarch64/arm64 \ amd64 \ arm \ armv6/arm \ armv7/arm \ i386 \ mips \ mipsel/mips \ mips64el/mips \ mipsn32el/mips \ mips64/mips \ mipsn32/mips \ mipshf/mips \ mipselhf/mips \ mips64elhf/mips \ mips64hf/mips \ powerpc \ powerpc64/powerpc \ powerpcspe/powerpc \ riscv64/riscv \ riscv64sf/riscv \ sparc64 .if ${TARGET} == ${TARGET_ARCH} _t= ${TARGET} .else _t= ${TARGET_ARCH}/${TARGET} .endif .for _t in ${_t} .if empty(KNOWN_ARCHES:M${_t}) .error Unknown target ${TARGET_ARCH}:${TARGET}. .endif .endfor # If all targets are disabled for system llvm then don't expect it to work # for cross-builds. .if !defined(TOOLS_PREFIX) && ${MK_LLVM_TARGET_ALL} == "no" && \ ${MACHINE} != ${TARGET} && ${MACHINE_ARCH} != ${TARGET_ARCH} && \ !make(showconfig) MK_SYSTEM_COMPILER= no MK_SYSTEM_LINKER= no .endif # Handle external binutils. .if defined(CROSS_TOOLCHAIN_PREFIX) CROSS_BINUTILS_PREFIX?=${CROSS_TOOLCHAIN_PREFIX} .endif # If we do not have a bootstrap binutils (because the in-tree one does not # support the target architecture), provide a default cross-binutils prefix. # This allows riscv64 builds, for example, to automatically use the # riscv64-binutils port or package. 
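# As a rough illustration (the prefix below is an assumed example; the real
# default is derived from TARGET_TRIPLE in the block that follows): a riscv64
# cross-build where the in-tree binutils cannot target the architecture and
# lld bootstrap is disabled would end up with something like
#	CROSS_BINUTILS_PREFIX=/usr/local/riscv64-unknown-freebsd13.0/bin/
# and XAS/XLD/XOBJCOPY etc. are then taken from that directory when the
# corresponding binaries exist there.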
.if !make(showconfig) && !defined(_NO_INCLUDE_COMPILERMK) .if !empty(BROKEN_OPTIONS:MBINUTILS_BOOTSTRAP) && \ ${MK_LLD_BOOTSTRAP} == "no" && \ !defined(CROSS_BINUTILS_PREFIX) CROSS_BINUTILS_PREFIX=/usr/local/${TARGET_TRIPLE}/bin/ .if !exists(${CROSS_BINUTILS_PREFIX}) .error In-tree binutils does not support the ${TARGET_ARCH} architecture. Install the ${TARGET_ARCH}-binutils port or package or set CROSS_BINUTILS_PREFIX. .endif .endif .endif XBINUTILS= AS AR LD NM OBJCOPY RANLIB SIZE STRINGS .for BINUTIL in ${XBINUTILS} .if defined(CROSS_BINUTILS_PREFIX) && \ exists(${CROSS_BINUTILS_PREFIX}/${${BINUTIL}}) X${BINUTIL}?= ${CROSS_BINUTILS_PREFIX:C,/*$,,}/${${BINUTIL}} .else X${BINUTIL}?= ${${BINUTIL}} .endif .endfor # If a full path to an external linker is given, don't build lld. .if ${XLD:M/*} MK_LLD_BOOTSTRAP= no .endif # We also want the X_LINKER* variables if we are using an external toolchain. _WANT_TOOLCHAIN_CROSS_VARS= t .include "share/mk/bsd.linker.mk" .undef _WANT_TOOLCHAIN_CROSS_VARS # Begin WITH_SYSTEM_COMPILER / WITH_SYSTEM_LD # WITH_SYSTEM_COMPILER - Pull in needed values and make a decision. # Check if there is a local compiler that can satisfy as an external compiler. # Which compiler is expected to be used? .if ${MK_CLANG_BOOTSTRAP} == "yes" WANT_COMPILER_TYPE= clang .elif ${MK_GCC_BOOTSTRAP} == "yes" WANT_COMPILER_TYPE= gcc .else WANT_COMPILER_TYPE= .endif .if !defined(WANT_COMPILER_FREEBSD_VERSION) && !make(showconfig) && \ !make(test-system-linker) .if ${WANT_COMPILER_TYPE} == "clang" WANT_COMPILER_FREEBSD_VERSION_FILE= lib/clang/freebsd_cc_version.h WANT_COMPILER_FREEBSD_VERSION!= \ awk '$$2 == "FREEBSD_CC_VERSION" {printf("%d\n", $$3)}' \ ${SRCDIR}/${WANT_COMPILER_FREEBSD_VERSION_FILE} || echo unknown WANT_COMPILER_VERSION_FILE= lib/clang/include/clang/Basic/Version.inc WANT_COMPILER_VERSION!= \ awk '$$2 == "CLANG_VERSION" {split($$3, a, "."); print a[1] * 10000 + a[2] * 100 + a[3]}' \ ${SRCDIR}/${WANT_COMPILER_VERSION_FILE} || echo unknown .elif ${WANT_COMPILER_TYPE} == "gcc" WANT_COMPILER_FREEBSD_VERSION_FILE= gnu/usr.bin/cc/cc_tools/freebsd-native.h WANT_COMPILER_FREEBSD_VERSION!= \ awk '$$2 == "FBSD_CC_VER" {printf("%d\n", $$3)}' \ ${SRCDIR}/${WANT_COMPILER_FREEBSD_VERSION_FILE} || echo unknown WANT_COMPILER_VERSION_FILE= contrib/gcc/BASE-VER WANT_COMPILER_VERSION!= \ awk -F. '{print $$1 * 10000 + $$2 * 100 + $$3}' \ ${SRCDIR}/${WANT_COMPILER_VERSION_FILE} || echo unknown .endif .export WANT_COMPILER_FREEBSD_VERSION WANT_COMPILER_VERSION .endif # !defined(WANT_COMPILER_FREEBSD_VERSION) # It needs to be the same revision as we would build for the bootstrap. # If the expected vs CC is different then we can't skip. # GCC cannot be used for cross-arch yet. For clang we pass -target later if # TARGET_ARCH!=MACHINE_ARCH. .if ${MK_SYSTEM_COMPILER} == "yes" && \ defined(WANT_COMPILER_FREEBSD_VERSION) && \ (${MK_CLANG_BOOTSTRAP} == "yes" || ${MK_GCC_BOOTSTRAP} == "yes") && \ !make(xdev*) && \ ${X_COMPILER_TYPE} == ${WANT_COMPILER_TYPE} && \ (${X_COMPILER_TYPE} == "clang" || ${TARGET_ARCH} == ${MACHINE_ARCH}) && \ ${X_COMPILER_VERSION} == ${WANT_COMPILER_VERSION} && \ ${X_COMPILER_FREEBSD_VERSION} == ${WANT_COMPILER_FREEBSD_VERSION} # Everything matches, disable the bootstrap compiler. MK_CLANG_BOOTSTRAP= no MK_GCC_BOOTSTRAP= no USING_SYSTEM_COMPILER= yes .endif # ${WANT_COMPILER_TYPE} == ${COMPILER_TYPE} # WITH_SYSTEM_LD - Pull in needed values and make a decision. # Check if there is a local linker that can satisfy as an external linker. 
# Which linker is expected to be used? .if ${MK_LLD_BOOTSTRAP} == "yes" WANT_LINKER_TYPE= lld .elif ${MK_BINUTILS_BOOTSTRAP} == "yes" # Note that there's no support for bfd in WITH_SYSTEM_LINKER. WANT_LINKER_TYPE= bfd .else WANT_LINKER_TYPE= .endif .if !defined(WANT_LINKER_FREEBSD_VERSION) && !make(showconfig) && \ !make(test-system-compiler) .if ${WANT_LINKER_TYPE} == "lld" WANT_LINKER_FREEBSD_VERSION_FILE= lib/clang/include/lld/Common/Version.inc WANT_LINKER_FREEBSD_VERSION!= \ awk '$$2 == "LLD_REVISION_STRING" {gsub(/"/, "", $$3); print $$3}' \ ${SRCDIR}/${WANT_LINKER_FREEBSD_VERSION_FILE} || echo unknown WANT_LINKER_VERSION_FILE= lib/clang/include/lld/Common/Version.inc WANT_LINKER_VERSION!= \ awk '$$2 == "LLD_VERSION" {split($$3, a, "."); print a[1] * 10000 + a[2] * 100 + a[3]}' \ ${SRCDIR}/${WANT_LINKER_VERSION_FILE} || echo unknown .else WANT_LINKER_FREEBSD_VERSION_FILE= WANT_LINKER_FREEBSD_VERSION= .endif .export WANT_LINKER_FREEBSD_VERSION WANT_LINKER_VERSION .endif # !defined(WANT_LINKER_FREEBSD_VERSION) .if ${MK_SYSTEM_LINKER} == "yes" && \ defined(WANT_LINKER_FREEBSD_VERSION) && \ (${MK_LLD_BOOTSTRAP} == "yes") && \ !make(xdev*) && \ ${X_LINKER_TYPE} == ${WANT_LINKER_TYPE} && \ ${X_LINKER_VERSION} == ${WANT_LINKER_VERSION} && \ ${X_LINKER_FREEBSD_VERSION} == ${WANT_LINKER_FREEBSD_VERSION} # Everything matches, disable the bootstrap linker. MK_LLD_BOOTSTRAP= no USING_SYSTEM_LINKER= yes .endif # ${WANT_LINKER_TYPE} == ${LINKER_TYPE} # WITH_SYSTEM_COMPILER / WITH_SYSTEM_LINKER - Handle defaults and debug. USING_SYSTEM_COMPILER?= no USING_SYSTEM_LINKER?= no TEST_SYSTEM_COMPILER_VARS= \ USING_SYSTEM_COMPILER MK_SYSTEM_COMPILER \ MK_CROSS_COMPILER MK_CLANG_BOOTSTRAP MK_GCC_BOOTSTRAP \ WANT_COMPILER_TYPE WANT_COMPILER_VERSION WANT_COMPILER_VERSION_FILE \ WANT_COMPILER_FREEBSD_VERSION WANT_COMPILER_FREEBSD_VERSION_FILE \ CC COMPILER_TYPE COMPILER_FEATURES COMPILER_VERSION \ COMPILER_FREEBSD_VERSION \ XCC X_COMPILER_TYPE X_COMPILER_FEATURES X_COMPILER_VERSION \ X_COMPILER_FREEBSD_VERSION TEST_SYSTEM_LINKER_VARS= \ USING_SYSTEM_LINKER MK_SYSTEM_LINKER \ MK_LLD_BOOTSTRAP MK_BINUTILS_BOOTSTRAP \ WANT_LINKER_TYPE WANT_LINKER_VERSION WANT_LINKER_VERSION_FILE \ WANT_LINKER_FREEBSD_VERSION WANT_LINKER_FREEBSD_VERSION_FILE \ LD LINKER_TYPE LINKER_FEATURES LINKER_VERSION \ LINKER_FREEBSD_VERSION \ XLD X_LINKER_TYPE X_LINKER_FEATURES X_LINKER_VERSION \ X_LINKER_FREEBSD_VERSION .for _t in compiler linker test-system-${_t}: .PHONY .for v in ${TEST_SYSTEM_${_t:tu}_VARS} ${_+_}@printf "%-35s= %s\n" "${v}" "${${v}}" .endfor .endfor .if (make(buildworld) || make(buildkernel) || make(kernel-toolchain) || \ make(toolchain) || make(_cross-tools)) .if ${USING_SYSTEM_COMPILER} == "yes" .info SYSTEM_COMPILER: Determined that CC=${CC} matches the source tree. Not bootstrapping a cross-compiler. .elif ${MK_CLANG_BOOTSTRAP} == "yes" .info SYSTEM_COMPILER: libclang will be built for bootstrapping a cross-compiler. .endif .if ${USING_SYSTEM_LINKER} == "yes" .info SYSTEM_LINKER: Determined that LD=${LD} matches the source tree. Not bootstrapping a cross-linker. .elif ${MK_LLD_BOOTSTRAP} == "yes" .info SYSTEM_LINKER: libclang will be built for bootstrapping a cross-linker. .endif .endif # End WITH_SYSTEM_COMPILER / WITH_SYSTEM_LD # Store some compiler metadata for use in installworld where we don't # want to invoke CC at all. 
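# Sketch of the generated toolchain-metadata.mk (values are invented examples;
# the real contents are written by the rule below):
#	_LOADED_TOOLCHAIN_METADATA=t
#	COMPILER_TYPE=clang
#	X_COMPILER_TYPE=clang
#	...
#	.export COMPILER_TYPE ...
# installworld and friends then include this file instead of probing CC.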
_TOOLCHAIN_METADATA_VARS= COMPILER_VERSION \ COMPILER_TYPE \ COMPILER_FEATURES \ COMPILER_FREEBSD_VERSION \ LINKER_VERSION \ LINKER_FEATURES \ LINKER_TYPE \ LINKER_FREEBSD_VERSION toolchain-metadata.mk: .PHONY .META @: > ${.TARGET} @echo ".info Using cached toolchain metadata from build at $$(hostname) on $$(date)" \ > ${.TARGET} @echo "_LOADED_TOOLCHAIN_METADATA=t" >> ${.TARGET} .for v in ${_TOOLCHAIN_METADATA_VARS} @echo "${v}=${${v}}" >> ${.TARGET} @echo "X_${v}=${X_${v}}" >> ${.TARGET} .endfor @echo ".export ${_TOOLCHAIN_METADATA_VARS}" >> ${.TARGET} @echo ".export ${_TOOLCHAIN_METADATA_VARS:C,^,X_,}" >> ${.TARGET} # We must do lib/ and libexec/ before bin/ in case of a mid-install error to # keep the user's system reasonably usable. For static->dynamic root upgrades, # we don't want to install a dynamic binary without rtld and the needed # libraries. More commonly, for dynamic root, we don't want to install a # binary that requires a newer library version that hasn't been installed yet. # This ordering is not a guarantee though. The only guarantee of a working # system here would require fine-grained ordering of all components based # on their dependencies. .if !empty(SUBDIR_OVERRIDE) SUBDIR= ${SUBDIR_OVERRIDE} .else SUBDIR= lib libexec # Add LOCAL_LIB_DIRS, but only if they will not be picked up as a SUBDIR # of a LOCAL_DIRS directory. This allows LOCAL_DIRS=foo and # LOCAL_LIB_DIRS=foo/lib to behave as expected. .for _DIR in ${LOCAL_DIRS:M*/} ${LOCAL_DIRS:N*/:S|$|/|} _REDUNDANT_LIB_DIRS+= ${LOCAL_LIB_DIRS:M${_DIR}*} .endfor .for _DIR in ${LOCAL_LIB_DIRS} .if ${_DIR} == ".WAIT" || (empty(_REDUNDANT_LIB_DIRS:M${_DIR}) && exists(${.CURDIR}/${_DIR}/Makefile)) SUBDIR+= ${_DIR} .endif .endfor .if !defined(NO_ROOT) && (make(installworld) || make(install)) # Ensure libraries are installed before progressing. SUBDIR+=.WAIT .endif SUBDIR+=bin .if ${MK_CDDL} != "no" SUBDIR+=cddl .endif SUBDIR+=gnu include .if ${MK_KERBEROS} != "no" SUBDIR+=kerberos5 .endif .if ${MK_RESCUE} != "no" SUBDIR+=rescue .endif SUBDIR+=sbin .if ${MK_CRYPT} != "no" SUBDIR+=secure .endif .if !defined(NO_SHARE) SUBDIR+=share .endif .if ${MK_BOOT} != "no" SUBDIR+=stand .endif SUBDIR+=sys usr.bin usr.sbin .if ${MK_TESTS} != "no" SUBDIR+= tests .endif # Local directories are built in parallel with the base system directories. # Users may insert a .WAIT directive at the beginning or elsewhere within # the LOCAL_DIRS and LOCAL_LIB_DIRS lists as needed. .for _DIR in ${LOCAL_DIRS} .if ${_DIR} == ".WAIT" || exists(${.CURDIR}/${_DIR}/Makefile) SUBDIR+= ${_DIR} .endif .endfor # We must do etc/ last as it hooks into building the man whatis file # by calling 'makedb' in share/man. This is only relevant for # install/distribute so they build the whatis file after every manpage is # installed. .if make(installworld) || make(install) SUBDIR+=.WAIT .endif SUBDIR+=etc .endif # !empty(SUBDIR_OVERRIDE) .if defined(NOCLEAN) .warning NOCLEAN option is deprecated. Use NO_CLEAN instead. NO_CLEAN= ${NOCLEAN} .endif .if defined(NO_CLEANDIR) CLEANDIR= clean cleandepend .else CLEANDIR= cleandir .endif .if defined(WORLDFAST) NO_CLEAN= t NO_OBJWALK= t .endif .if ${MK_META_MODE} == "yes" # If filemon is used then we can rely on the build being incremental-safe. # The .meta files will also track the build command and rebuild should # it change.
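# (Aside, describing typical usage rather than anything defined here: META_MODE
# is usually enabled with WITH_META_MODE=yes in src-env.conf, and the
# incremental-safe case below additionally needs the filemon(4) module loaded
# so that .MAKE.MODE does not end up containing "nofilemon".)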
.if empty(.MAKE.MODE:Mnofilemon) NO_CLEAN= t .endif .endif .if defined(NO_OBJWALK) || ${MK_AUTO_OBJ} == "yes" NO_OBJWALK= t NO_KERNELOBJ= t .endif .if !defined(NO_OBJWALK) _obj= obj .endif LOCAL_TOOL_DIRS?= PACKAGEDIR?= ${DESTDIR}/${DISTDIR} .if empty(SHELL:M*csh*) BUILDENV_SHELL?=${SHELL} .else BUILDENV_SHELL?=/bin/sh .endif .if !defined(_MKSHOWCONFIG) .if !defined(SVN_CMD) || empty(SVN_CMD) . for _P in /usr/bin /usr/local/bin . for _S in svn svnlite . if exists(${_P}/${_S}) SVN_CMD= ${_P}/${_S} . endif . endfor . endfor .export SVN_CMD .endif SVNFLAGS?= -r HEAD .if !defined(VCS_REVISION) || empty(VCS_REVISION) .if !defined(SVNVERSION_CMD) || empty(SVNVERSION_CMD) . for _D in ${PATH:S,:, ,g} . if exists(${_D}/svnversion) SVNVERSION_CMD?=${_D}/svnversion . endif . if exists(${_D}/svnliteversion) SVNVERSION_CMD?=${_D}/svnliteversion . endif . endfor .endif _VCS_REVISION?= $$(eval ${SVNVERSION_CMD} ${SRCDIR}) . if !empty(_VCS_REVISION) VCS_REVISION= $$(echo r${_VCS_REVISION}) . endif .export VCS_REVISION .endif .if !defined(OSRELDATE) .if exists(/usr/include/osreldate.h) OSRELDATE!= awk '/^\#define[[:space:]]*__FreeBSD_version/ { print $$3 }' \ /usr/include/osreldate.h .else OSRELDATE= 0 .endif .export OSRELDATE .endif # Set VERSION for CTFMERGE to use via the default CTFFLAGS=-L VERSION. .if !defined(_REVISION) _REVISION!= ${MAKE} -C ${SRCDIR}/release MK_AUTO_OBJ=no -V REVISION .export _REVISION .endif .if !defined(_BRANCH) _BRANCH!= ${MAKE} -C ${SRCDIR}/release MK_AUTO_OBJ=no -V BRANCH .export _BRANCH .endif .if !defined(SRCRELDATE) SRCRELDATE!= awk '/^\#define[[:space:]]*__FreeBSD_version/ { print $$3 }' \ ${SRCDIR}/sys/sys/param.h .export SRCRELDATE .endif .if !defined(VERSION) VERSION= FreeBSD ${_REVISION}-${_BRANCH:C/-p[0-9]+$//} ${TARGET_ARCH} ${SRCRELDATE} .export VERSION .endif .if !defined(PKG_VERSION) .if ${_BRANCH:MSTABLE*} || ${_BRANCH:MCURRENT*} TIMENOW= %Y%m%d%H%M%S EXTRA_REVISION= .s${TIMENOW:gmtime} .elif ${_BRANCH:MALPHA*} EXTRA_REVISION= _${_BRANCH:C/-ALPHA/.a/} .elif ${_BRANCH:MBETA*} EXTRA_REVISION= _${_BRANCH:C/-BETA/.b/} .elif ${_BRANCH:MRC*} EXTRA_REVISION= _${_BRANCH:C/-RC/.r/} .elif ${_BRANCH:MPRERELEASE*} EXTRA_REVISION= _${_BRANCH:C/-PRERELEASE/.p/} .elif ${_BRANCH:M*-p*} EXTRA_REVISION= _${_BRANCH:C/.*-p([0-9]+$)/\1/} .endif PKG_VERSION= ${_REVISION}${EXTRA_REVISION} .endif .endif # !defined(PKG_VERSION) .if !defined(_MKSHOWCONFIG) _CPUTYPE!= MAKEFLAGS= CPUTYPE=${_TARGET_CPUTYPE} ${MAKE} -f /dev/null \ -m ${.CURDIR}/share/mk MK_AUTO_OBJ=no -V CPUTYPE .if ${_CPUTYPE} != ${_TARGET_CPUTYPE} .error CPUTYPE global should be set with ?=. .endif .endif .if make(buildworld) BUILD_ARCH!= uname -p .if ${MACHINE_ARCH} != ${BUILD_ARCH} .error To cross-build, set TARGET_ARCH. .endif .endif WORLDTMP?= ${OBJTOP}/tmp BPATH= ${CCACHE_WRAPPER_PATH_PFX}${WORLDTMP}/legacy/usr/sbin:${WORLDTMP}/legacy/usr/bin:${WORLDTMP}/legacy/bin XPATH= ${WORLDTMP}/usr/sbin:${WORLDTMP}/usr/bin # When building we want to find the cross tools before the host tools in ${BPATH}. # We also need to add UNIVERSE_TOOLCHAIN_PATH so that we can find the shared # toolchain files (clang, lld, etc.) during make universe/tinderbox STRICTTMPPATH= ${XPATH}:${BPATH}:${UNIVERSE_TOOLCHAIN_PATH} # We should not be using tools from /usr/bin accidentally since this could cause # the build to break on other systems that don't have that tool. For now we # still allow using the old behaviour (inheriting $PATH) if # BUILD_WITH_STRICT_TMPPATH is set to 0 but this will eventually be removed. 
# Currently strict $PATH can cause build failures and does not work yet with # USING_SYSTEM_LINKER/USING_SYSTEM_COMPILER. Once these issues have been # resolved it will be turned on by default. BUILD_WITH_STRICT_TMPPATH?=0 .if ${BUILD_WITH_STRICT_TMPPATH} != 0 TMPPATH= ${STRICTTMPPATH} .else TMPPATH= ${STRICTTMPPATH}:${PATH} .endif # # Avoid running mktemp(1) unless actually needed. # It may not be functional, e.g., due to new ABI # when in the middle of installing over this system. # .if make(distributeworld) || make(installworld) || make(stageworld) .if ${BUILD_WITH_STRICT_TMPPATH} != 0 MKTEMP=${WORLDTMP}/legacy/usr/bin/mktemp .if !exists(${MKTEMP}) .error "mktemp binary doesn't exist in expected location: ${MKTEMP}" .endif .else MKTEMP=mktemp .endif INSTALLTMP!= ${MKTEMP} -d -u -t install .endif .if make(stagekernel) || make(distributekernel) TAGS+= kernel PACKAGE= kernel .endif # # Building a world goes through the following stages # # 1. legacy stage [BMAKE] # This stage is responsible for creating compatibility # shims that are needed by the bootstrap-tools, # build-tools and cross-tools stages. These are generally # APIs that tools from one of those three stages need to # build that aren't present on the host. # 1. bootstrap-tools stage [BMAKE] # This stage is responsible for creating programs that # are needed for backward compatibility reasons. They # are not built as cross-tools. # 2. build-tools stage [TMAKE] # This stage is responsible for creating the object # tree and building any tools that are needed during # the build process. Some programs are listed during # this phase because they build binaries to generate # files needed to build these programs. This stage also # builds the 'build-tools' target rather than 'all'. # 3. cross-tools stage [XMAKE] # This stage is responsible for creating any tools that # are needed for building the system. A cross-compiler is one # of them. This differs from build tools in two ways: # 1. the 'all' target is built rather than 'build-tools' # 2. these tools are installed into TMPPATH for stage 4. # 4. world stage [WMAKE] # This stage actually builds the world. # 5. install stage (optional) [IMAKE] # This stage installs a previously built world. # BOOTSTRAPPING?= 0 # Keep these in sync MINIMUM_SUPPORTED_OSREL?= 1002501 MINIMUM_SUPPORTED_REL?= 10.3 # Common environment for world related stages CROSSENV+= \ MACHINE_ARCH=${TARGET_ARCH} \ MACHINE=${TARGET} \ CPUTYPE=${TARGET_CPUTYPE} .if ${MK_META_MODE} != "no" # Don't rebuild build-tools targets during normal build. 
CROSSENV+= BUILD_TOOLS_META=.NOMETA .endif .if defined(TARGET_CFLAGS) CROSSENV+= ${TARGET_CFLAGS} .endif BOOTSTRAPPING_OSRELDATE?=${OSRELDATE} # bootstrap-tools stage BMAKEENV= INSTALL="sh ${.CURDIR}/tools/install.sh" \ TOOLS_PREFIX=${TOOLS_PREFIX_UNDEF:U${WORLDTMP}} \ PATH=${BPATH}:${PATH} \ WORLDTMP=${WORLDTMP} \ MAKEFLAGS="-m ${.CURDIR}/tools/build/mk ${.MAKEFLAGS}" # need to keep this in sync with targets/pseudo/bootstrap-tools/Makefile BSARGS= DESTDIR= \ OBJTOP='${WORLDTMP}/obj-tools' \ OBJROOT='$${OBJTOP}/' \ MAKEOBJDIRPREFIX= \ BOOTSTRAPPING=${BOOTSTRAPPING_OSRELDATE} \ BWPHASE=${.TARGET:C,^_,,} \ SSP_CFLAGS= \ MK_HTML=no NO_LINT=yes MK_MAN=no \ -DNO_PIC MK_PROFILE=no -DNO_SHARED \ -DNO_CPU_CFLAGS MK_WARNS=no MK_CTF=no \ MK_CLANG_EXTRAS=no MK_CLANG_FULL=no \ MK_LLDB=no MK_RETPOLINE=no MK_TESTS=no \ MK_INCLUDES=yes BMAKE= \ ${BMAKEENV} ${MAKE} ${WORLD_FLAGS} -f Makefile.inc1 \ ${BSARGS} .if empty(.MAKEOVERRIDES:MMK_LLVM_TARGET_ALL) BMAKE+= MK_LLVM_TARGET_ALL=no .endif # build-tools stage TMAKE= \ ${BMAKEENV} ${MAKE} ${WORLD_FLAGS} -f Makefile.inc1 \ TARGET=${TARGET} TARGET_ARCH=${TARGET_ARCH} \ DESTDIR= \ BOOTSTRAPPING=${BOOTSTRAPPING_OSRELDATE} \ BWPHASE=${.TARGET:C,^_,,} \ SSP_CFLAGS= \ -DNO_LINT \ -DNO_CPU_CFLAGS MK_WARNS=no MK_CTF=no \ MK_CLANG_EXTRAS=no MK_CLANG_FULL=no \ MK_LLDB=no MK_RETPOLINE=no MK_TESTS=no # cross-tools stage # TOOLS_PREFIX set in BMAKE XMAKE= ${BMAKE} \ TARGET=${TARGET} TARGET_ARCH=${TARGET_ARCH} \ MK_GDB=no MK_TESTS=no # kernel-tools stage KTMAKEENV= INSTALL="sh ${.CURDIR}/tools/install.sh" \ PATH=${BPATH}:${PATH} \ WORLDTMP=${WORLDTMP} KTMAKE= \ TOOLS_PREFIX=${TOOLS_PREFIX_UNDEF:U${WORLDTMP}} \ ${KTMAKEENV} ${MAKE} ${WORLD_FLAGS} -f Makefile.inc1 \ DESTDIR= \ OBJTOP='${WORLDTMP}/obj-kernel-tools' \ OBJROOT='$${OBJTOP}/' \ MAKEOBJDIRPREFIX= \ BOOTSTRAPPING=${BOOTSTRAPPING_OSRELDATE} \ SSP_CFLAGS= \ MK_HTML=no -DNO_LINT MK_MAN=no \ -DNO_PIC MK_PROFILE=no -DNO_SHARED \ -DNO_CPU_CFLAGS MK_RETPOLINE=no MK_WARNS=no MK_CTF=no # world stage WMAKEENV= ${CROSSENV} \ INSTALL="sh ${.CURDIR}/tools/install.sh" \ PATH=${TMPPATH} \ SYSROOT=${WORLDTMP} # make hierarchy HMAKE= PATH=${TMPPATH} ${MAKE} LOCAL_MTREE=${LOCAL_MTREE:Q} .if defined(NO_ROOT) HMAKE+= PATH=${TMPPATH} METALOG=${METALOG} -DNO_ROOT .endif CROSSENV+= CC="${XCC} ${XCFLAGS}" CXX="${XCXX} ${XCXXFLAGS} ${XCFLAGS}" \ CPP="${XCPP} ${XCFLAGS}" \ AS="${XAS}" AR="${XAR}" LD="${XLD}" LLVM_LINK="${XLLVM_LINK}" \ NM=${XNM} OBJCOPY="${XOBJCOPY}" \ RANLIB=${XRANLIB} STRINGS=${XSTRINGS} \ SIZE="${XSIZE}" .if defined(CROSS_BINUTILS_PREFIX) && exists(${CROSS_BINUTILS_PREFIX}) # In the case of xdev-build tools, CROSS_BINUTILS_PREFIX won't be a # directory, but the compiler will look in the right place for its # tools so we don't need to tell it where to look. BFLAGS+= -B${CROSS_BINUTILS_PREFIX} .endif # The internal bootstrap compiler has a default sysroot set by TOOLS_PREFIX # and target set by TARGET/TARGET_ARCH. However, there are several needs to # always pass an explicit --sysroot and -target. # - External compiler needs sysroot and target flags. # - External ld needs sysroot. # - To be clear about the use of a sysroot when using the internal compiler. # - Easier debugging. # - Allowing WITH_SYSTEM_COMPILER+WITH_META_MODE to work together due to # the flip-flopping build command when sometimes using external and # sometimes using internal. # - Allow using lld which has no support for default paths. 
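# Rough illustration only (the exact flags are assembled just below and via
# BFLAGS): with clang as the bootstrap/external compiler the world compiler
# ends up invoked along the lines of
#	${XCC} -target ${TARGET_TRIPLE} --sysroot=${WORLDTMP} -B${WORLDTMP}/usr/bin ...
# so it never silently falls back to host headers or host binutils.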
.if !defined(CROSS_BINUTILS_PREFIX) || !exists(${CROSS_BINUTILS_PREFIX}) BFLAGS+= -B${WORLDTMP}/usr/bin .endif .if ${WANT_COMPILER_TYPE} == gcc || \ (defined(X_COMPILER_TYPE) && ${X_COMPILER_TYPE} == gcc) .elif ${WANT_COMPILER_TYPE} == clang || \ (defined(X_COMPILER_TYPE) && ${X_COMPILER_TYPE} == clang) XCFLAGS+= -target ${TARGET_TRIPLE} .endif XCFLAGS+= --sysroot=${WORLDTMP} .if !empty(BFLAGS) XCFLAGS+= ${BFLAGS} .endif .if ${MK_LIB32} != "no" && (${TARGET_ARCH} == "amd64" || \ ${TARGET_ARCH} == "powerpc64" || ${TARGET_ARCH:Mmips64*} != "") LIBCOMPAT= 32 .include "Makefile.libcompat" .elif ${MK_LIBSOFT} != "no" && ${TARGET_ARCH:Marmv[67]*} != "" LIBCOMPAT= SOFT .include "Makefile.libcompat" .endif # META_MODE normally ignores host file changes since every build updates # timestamps (see NO_META_IGNORE_HOST in sys.mk). There are known times # when the ABI breaks though that we want to force rebuilding WORLDTMP # to get updated host tools. .if ${MK_META_MODE} == "yes" && defined(NO_CLEAN) && \ !defined(NO_META_IGNORE_HOST) && !defined(NO_META_IGNORE_HOST_HEADERS) && \ !defined(_MKSHOWCONFIG) # r318736 - ino64 major ABI breakage META_MODE_BAD_ABI_VERS+= 1200031 .if !defined(OBJDIR_HOST_OSRELDATE) .if exists(${OBJTOP}/host-osreldate.h) OBJDIR_HOST_OSRELDATE!= \ awk '/^\#define[[:space:]]*__FreeBSD_version/ { print $$3 }' \ ${OBJTOP}/host-osreldate.h .elif exists(${WORLDTMP}/usr/include/osreldate.h) OBJDIR_HOST_OSRELDATE= 0 .endif .export OBJDIR_HOST_OSRELDATE .endif # Note that this logic is the opposite of normal BOOTSTRAP handling. We want # to compare the WORLDTMP's OSRELDATE to the host's OSRELDATE. If the WORLDTMP # is older than the ABI-breakage OSRELDATE of the HOST then we rebuild. .if defined(OBJDIR_HOST_OSRELDATE) .for _ver in ${META_MODE_BAD_ABI_VERS} .if ${OSRELDATE} >= ${_ver} && ${OBJDIR_HOST_OSRELDATE} < ${_ver} _meta_mode_need_rebuild= ${_ver} .endif .endfor .if defined(_meta_mode_need_rebuild) .info META_MODE: Rebuilding host tools due to ABI breakage in __FreeBSD_version ${_meta_mode_need_rebuild}. NO_META_IGNORE_HOST_HEADERS= 1 .export NO_META_IGNORE_HOST_HEADERS .endif # defined(_meta_mode_need_rebuild) .endif # defined(OBJDIR_HOST_OSRELDATE) .endif # ${MK_META_MODE} == "yes" && defined(NO_CLEAN) ... # This is only used for META_MODE+filemon to track what the oldest # __FreeBSD_version is in WORLDTMP. This purposely does NOT have # a make dependency on /usr/include/osreldate.h as the file should # only be copied when it is missing or meta mode determines it has changed. # Since host files are normally ignored without NO_META_IGNORE_HOST # the file will never be updated unless that flag is specified. This # allows tracking the oldest osreldate to force rebuilds via # META_MODE_BADABI_REVS above. host-osreldate.h: # DO NOT ADD /usr/include/osreldate.h here @cp -f /usr/include/osreldate.h ${.TARGET} WMAKE= ${WMAKEENV} ${MAKE} ${WORLD_FLAGS} -f Makefile.inc1 \ BWPHASE=${.TARGET:C,^_,,} \ DESTDIR=${WORLDTMP} IMAKEENV= ${CROSSENV} IMAKE= ${IMAKEENV} ${MAKE} -f Makefile.inc1 \ ${IMAKE_INSTALL} ${IMAKE_MTREE} .if empty(.MAKEFLAGS:M-n) IMAKEENV+= PATH=${STRICTTMPPATH}:${INSTALLTMP} \ LD_LIBRARY_PATH=${INSTALLTMP} \ PATH_LOCALE=${INSTALLTMP}/locale IMAKE+= __MAKE_SHELL=${INSTALLTMP}/sh .else IMAKEENV+= PATH=${TMPPATH}:${INSTALLTMP} .endif # When generating install media, do not allow user and group information from # the build host to affect the contents of the distribution. 
.if make(distributeworld) || make(distrib-dirs) || make(distribution) DB_FROM_SRC= yes .endif .if defined(DB_FROM_SRC) INSTALLFLAGS+= -N ${.CURDIR}/etc MTREEFLAGS+= -N ${.CURDIR}/etc .endif _INSTALL_DDIR= ${DESTDIR}/${DISTDIR} INSTALL_DDIR= ${_INSTALL_DDIR:S://:/:g:C:/$::} .if defined(NO_ROOT) METALOG?= ${DESTDIR}/${DISTDIR}/METALOG METALOG:= ${METALOG:C,//+,/,g} IMAKE+= -DNO_ROOT METALOG=${METALOG} INSTALLFLAGS+= -U -M ${METALOG} -D ${INSTALL_DDIR} MTREEFLAGS+= -W .endif .if defined(BUILD_PKGS) INSTALLFLAGS+= -h sha256 .endif .if defined(DB_FROM_SRC) || defined(NO_ROOT) IMAKE_INSTALL= INSTALL="install ${INSTALLFLAGS}" IMAKE_MTREE= MTREE_CMD="mtree ${MTREEFLAGS}" .endif DESTDIR_MTREEFLAGS= -deU # When creating worldtmp we don't need to set the directories as owned by root # so we also pass -W WORLDTMP_MTREEFLAGS= -deUW .if defined(NO_ROOT) # When building with -DNO_ROOT we shouldn't be changing the directories # that are created by mtree to be owned by root/wheel. DESTDIR_MTREEFLAGS+= -W .endif MTREE?= mtree .if ${BUILD_WITH_STRICT_TMPPATH} != 0 MTREE= ${WORLDTMP}/legacy/usr/sbin/mtree .endif WORLDTMP_MTREE= ${MTREE} ${WORLDTMP_MTREEFLAGS} DESTDIR_MTREE= ${MTREE} ${DESTDIR_MTREEFLAGS} # kernel stage KMAKEENV= ${WMAKEENV:NSYSROOT=*} KMAKE= ${KMAKEENV} ${MAKE} ${.MAKEFLAGS} ${KERNEL_FLAGS} KERNEL=${INSTKERNNAME} # # buildworld # # Attempt to rebuild the entire system, with reasonable chance of # success, regardless of how old your existing system is. # _sanity_check: .PHONY .MAKE .if ${.CURDIR:C/[^,]//g} != "" # The m4 build of sendmail files doesn't like it if ',' is used # anywhere in the path of its files. @echo @echo "*** Error: path to source tree contains a comma ','" @echo @false .elif ${.CURDIR:M*\:*} != "" # Using ':' leaks into PATH and breaks finding cross-tools. @echo @echo "*** Error: path to source tree contains a colon ':'" @echo @false .endif # Our current approach to dependency tracking cannot cope with certain source # tree changes, particularly with respect to removing source files and # replacing generated files. Handle these cases here in an ad-hoc fashion.
_cleanobj_fast_depend_hack: .PHONY # Syscall stubs rewritten in C and obsolete MD assembly implementations # Date SVN Rev Syscalls # 20180604 r334626 brk sbrk .for f in brk sbrk @if [ -e "${OBJTOP}/lib/libc/.depend.${f}.o" ] && \ egrep -qw '${f}\.[sS]' ${OBJTOP}/lib/libc/.depend.${f}.o; then \ echo "Removing stale dependencies for ${f} syscall wrappers"; \ rm -f ${OBJTOP}/lib/libc/.depend.${f}.* \ ${LIBCOMPAT:D${LIBCOMPAT_OBJTOP}/lib/libc/.depend.${f}.*}; \ fi .endfor # 20181013 r339348 bcopy reimplemented as .c .for f in bcopy memcpy memmove @if [ -e "${OBJTOP}/lib/libc/.depend.${f}.o" ] && \ egrep -qw 'bcopy\.[sS]' ${OBJTOP}/lib/libc/.depend.${f}.o; then \ echo "Removing stale dependencies for bcopy"; \ rm -f ${OBJTOP}/lib/libc/.depend.${f}.* \ ${LIBCOMPAT:D${LIBCOMPAT_OBJTOP}/lib/libc/.depend.${f}.*}; \ fi .endfor # 20181115 r340463 bzero reimplemented as .c @if [ -e "${OBJTOP}/lib/libc/.depend.bzero.o" ] && \ egrep -qw 'bzero\.[sS]' ${OBJTOP}/lib/libc/.depend.bzero.o; then \ echo "Removing stale dependencies for bzero"; \ rm -f ${OBJTOP}/lib/libc/.depend.bzero.* \ ${LIBCOMPAT:D${LIBCOMPAT_OBJTOP}/lib/libc/.depend.bzero.*}; \ fi # 20181009 track migration from ntp's embedded libevent to updated one @if [ -e "${OBJTOP}/usr.sbin/ntp/libntpevent/.depend.bufferevent_openssl.o" ] && \ egrep -q 'contrib/ntp/sntp/libevent/bufferevent_openssl.c' \ ${OBJTOP}/usr.sbin/ntp/libntpevent/.depend.bufferevent_openssl.o ; then \ echo "Removing stale libevent dependencies"; \ rm -f ${OBJTOP}/usr.sbin/ntp/libntpevent/.depend.*; \ fi # 20181209 r341759 track migration across wpa update @if [ -e "${OBJTOP}/usr.sbin/wpa/wpa_supplicant/.depend.rrm.o" ] && \ egrep -q 'src/ap/rrm.c' \ ${OBJTOP}/usr.sbin/wpa/wpa_supplicant/.depend.rrm.o; then \ echo "Removing stale wpa dependencies"; \ rm -f ${OBJTOP}/usr.sbin/wpa/*/.depend*; \ fi _worldtmp: .PHONY @echo @echo "--------------------------------------------------------------" @echo ">>> Rebuilding the temporary build tree" @echo "--------------------------------------------------------------" .if !defined(NO_CLEAN) rm -rf ${WORLDTMP} .else # Note: for delete-old we need to set $PATH to also include the host $PATH # since otherwise a partial build with missing symlinks in ${WORLDTMP}/legacy/ # will fail to run due to missing binaries. $WMAKE sets PATH to only ${TMPPATH} # so we remove that assignment from $WMAKE and prepend the new $PATH ${_+_}@if [ -e "${WORLDTMP}" ]; then \ echo ">>> Deleting stale files in build tree..."; \ cd ${.CURDIR}; env PATH=${TMPPATH}:${PATH} ${WMAKE:NPATH=*} \ _NO_INCLUDE_COMPILERMK=t -DBATCH_DELETE_OLD_FILES delete-old \ delete-old-libs >/dev/null; \ fi rm -rf ${WORLDTMP}/legacy/usr/include .if ${USING_SYSTEM_COMPILER} == "yes" .for cc in cc c++ if [ -x ${WORLDTMP}/usr/bin/${cc} ]; then \ inum=$$(stat -f %i ${WORLDTMP}/usr/bin/${cc}); \ find ${WORLDTMP}/usr/bin -inum $${inum} -delete; \ fi .endfor .endif # ${USING_SYSTEM_COMPILER} == "yes" .if ${USING_SYSTEM_LINKER} == "yes" @rm -f ${WORLDTMP}/usr/bin/ld ${WORLDTMP}/usr/bin/ld.lld .endif # ${USING_SYSTEM_LINKER} == "yes" .endif # !defined(NO_CLEAN) @mkdir -p ${WORLDTMP} @touch ${WORLDTMP}/${.TARGET} # We can't use mtree to create the worldtmp directories since it may not be # available on the target system (this happens e.g.
when building on non-FreeBSD) cd ${.CURDIR}/tools/build; \ ${MAKE} DIRPRFX=tools/build/ DESTDIR=${WORLDTMP}/legacy installdirs # In order to build without inheriting $PATH we need to add symlinks to the host # tools in $WORLDTMP for the tools that we don't build during bootstrap-tools cd ${.CURDIR}/tools/build; \ ${MAKE} DIRPRFX=tools/build/ DESTDIR=${WORLDTMP}/legacy host-symlinks _legacy: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 1.1: legacy release compatibility shims" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${BMAKE} legacy _bootstrap-tools: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 1.2: bootstrap tools" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${BMAKE} bootstrap-tools mkdir -p ${WORLDTMP}/usr ${WORLDTMP}/lib/casper ${WORLDTMP}/lib/geom ${WORLDTMP_MTREE} -f ${.CURDIR}/etc/mtree/BSD.usr.dist \ -p ${WORLDTMP}/usr >/dev/null ${WORLDTMP_MTREE} -f ${.CURDIR}/etc/mtree/BSD.include.dist \ -p ${WORLDTMP}/usr/include >/dev/null ln -sf ${.CURDIR}/sys ${WORLDTMP} .if ${MK_DEBUG_FILES} != "no" ${WORLDTMP_MTREE} -f ${.CURDIR}/etc/mtree/BSD.debug.dist \ -p ${WORLDTMP}/usr/lib >/dev/null .endif .for _mtree in ${LOCAL_MTREE} ${WORLDTMP_MTREE} -f ${.CURDIR}/${_mtree} -p ${WORLDTMP} > /dev/null .endfor _cleanobj: .if !defined(NO_CLEAN) @echo @echo "--------------------------------------------------------------" @echo ">>> stage 2.1: cleaning up the object tree" @echo "--------------------------------------------------------------" # Avoid including bsd.compiler.mk in clean and obj with _NO_INCLUDE_COMPILERMK # since the restricted $PATH might not contain a valid cc binary ${_+_}cd ${.CURDIR}; ${WMAKE} _NO_INCLUDE_COMPILERMK=t ${CLEANDIR} .if defined(LIBCOMPAT) ${_+_}cd ${.CURDIR}; ${LIBCOMPATWMAKE} _NO_INCLUDE_COMPILERMK=t -f Makefile.inc1 ${CLEANDIR} .endif .else ${_+_}cd ${.CURDIR}; ${WMAKE} _NO_INCLUDE_COMPILERMK=t _cleanobj_fast_depend_hack .endif # !defined(NO_CLEAN) _obj: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 2.2: rebuilding the object tree" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${WMAKE} _NO_INCLUDE_COMPILERMK=t obj _build-tools: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 2.3: build tools" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${TMAKE} build-tools _cross-tools: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 3: cross tools" @echo "--------------------------------------------------------------" @rm -f ${OBJTOP}/toolchain-metadata.mk ${_+_}cd ${.CURDIR}; ${XMAKE} cross-tools ${_+_}cd ${.CURDIR}; ${XMAKE} kernel-tools _build-metadata: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 3.1: recording build metadata" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${WMAKE} toolchain-metadata.mk ${_+_}cd ${.CURDIR}; ${WMAKE} host-osreldate.h _includes: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 4.1: building includes" @echo "--------------------------------------------------------------" # Special handling for SUBDIR_OVERRIDE in buildworld as they most likely need # headers from default SUBDIR. 
Do SUBDIR_OVERRIDE includes last. ${_+_}cd ${.CURDIR}; ${WMAKE} SUBDIR_OVERRIDE= SHARED=symlinks \ MK_INCLUDES=yes includes .if !empty(SUBDIR_OVERRIDE) && make(buildworld) ${_+_}cd ${.CURDIR}; ${WMAKE} MK_INCLUDES=yes SHARED=symlinks includes .endif _libraries: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 4.2: building libraries" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; \ ${WMAKE} -DNO_FSCHG MK_HTML=no -DNO_LINT MK_MAN=no \ MK_PROFILE=no MK_TESTS=no MK_TESTS_SUPPORT=${MK_TESTS} libraries everything: .PHONY @echo @echo "--------------------------------------------------------------" @echo ">>> stage 4.3: building everything" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; _PARALLEL_SUBDIR_OK=1 ${WMAKE} all WMAKE_TGTS= .if !defined(WORLDFAST) WMAKE_TGTS+= _sanity_check _worldtmp _legacy .if empty(SUBDIR_OVERRIDE) WMAKE_TGTS+= _bootstrap-tools .endif WMAKE_TGTS+= _cleanobj .if !defined(NO_OBJWALK) WMAKE_TGTS+= _obj .endif WMAKE_TGTS+= _build-tools _cross-tools WMAKE_TGTS+= _build-metadata WMAKE_TGTS+= _includes .endif .if !defined(NO_LIBS) WMAKE_TGTS+= _libraries .endif WMAKE_TGTS+= everything .if defined(LIBCOMPAT) && empty(SUBDIR_OVERRIDE) WMAKE_TGTS+= build${libcompat} .endif # record buildworld time in seconds .if make(buildworld) _BUILDWORLD_START!= date '+%s' .export _BUILDWORLD_START .endif buildworld: buildworld_prologue ${WMAKE_TGTS} buildworld_epilogue .PHONY .ORDER: buildworld_prologue ${WMAKE_TGTS} buildworld_epilogue buildworld_prologue: .PHONY @echo "--------------------------------------------------------------" @echo ">>> World build started on `LC_ALL=C date`" @echo "--------------------------------------------------------------" buildworld_epilogue: .PHONY @echo @echo "--------------------------------------------------------------" @echo ">>> World build completed on `LC_ALL=C date`" @seconds=$$(($$(date '+%s') - ${_BUILDWORLD_START})); \ echo -n ">>> World built in $$seconds seconds, "; \ echo "ncpu: $$(sysctl -n hw.ncpu)${.MAKE.JOBS:S/^/, make -j/}" @echo "--------------------------------------------------------------" # # We need to have this as a target because the indirection between Makefile # and Makefile.inc1 causes the correct PATH to be used, rather than a # modification of the current environment's PATH. In addition, we need # to quote multiword values. # buildenvvars: .PHONY @echo ${WMAKEENV:Q} ${.MAKE.EXPORTED:@v@$v=\"${$v}\"@} .if ${.TARGETS:Mbuildenv} .if ${.MAKEFLAGS:M-j} .error The buildenv target is incompatible with -j .endif .endif BUILDENV_DIR?= ${.CURDIR} # # Note: make will report any errors the shell reports. This can # be odd if the last command in an interactive shell generates an # error or is terminated by SIGINT. These reported errors look bad, # but are harmless. Allowing them also allows BUILDENV_SHELL to # be a complex command whose status will be returned to the caller. # Some scripts in tools rely on this behavior to report build errors.
# buildenv: .PHONY @echo Entering world for ${TARGET_ARCH}:${TARGET} .if ${BUILDENV_SHELL:M*zsh*} @echo For ZSH you must run: export CPUTYPE=${TARGET_CPUTYPE} .endif @cd ${BUILDENV_DIR} && env ${WMAKEENV} BUILDENV=1 ${BUILDENV_SHELL} TOOLCHAIN_TGTS= ${WMAKE_TGTS:Neverything:Nbuild${libcompat}} toolchain: ${TOOLCHAIN_TGTS} .PHONY KERNEL_TOOLCHAIN_TGTS= ${TOOLCHAIN_TGTS:N_obj:N_cleanobj:N_includes:N_libraries} .if make(kernel-toolchain) .ORDER: ${KERNEL_TOOLCHAIN_TGTS} .endif kernel-toolchain: ${KERNEL_TOOLCHAIN_TGTS} .PHONY # # installcheck # # Checks to be sure system is ready for installworld/installkernel. # installcheck: _installcheck_world _installcheck_kernel .PHONY _installcheck_world: .PHONY @echo "--------------------------------------------------------------" @echo ">>> Install check world" @echo "--------------------------------------------------------------" _installcheck_kernel: .PHONY @echo "--------------------------------------------------------------" @echo ">>> Install check kernel" @echo "--------------------------------------------------------------" # # Require DESTDIR to be set if installing for a different architecture or # using the user/group database in the source tree. # .if ${TARGET_ARCH} != ${MACHINE_ARCH} || ${TARGET} != ${MACHINE} || \ defined(DB_FROM_SRC) .if !make(distributeworld) _installcheck_world: __installcheck_DESTDIR _installcheck_kernel: __installcheck_DESTDIR __installcheck_DESTDIR: .PHONY .if !defined(DESTDIR) || empty(DESTDIR) @echo "ERROR: Please set DESTDIR!"; \ false .endif .endif .endif .if !defined(DB_FROM_SRC) # # Check for missing UIDs/GIDs. # CHECK_UIDS= auditdistd CHECK_GIDS= audit CHECK_UIDS+= ntpd CHECK_GIDS+= ntpd CHECK_UIDS+= proxy CHECK_GIDS+= proxy authpf CHECK_UIDS+= smmsp CHECK_GIDS+= smmsp CHECK_UIDS+= unbound CHECK_GIDS+= unbound _installcheck_world: __installcheck_UGID __installcheck_UGID: .PHONY .for uid in ${CHECK_UIDS} @if ! `id -u ${uid} >/dev/null 2>&1`; then \ echo "ERROR: Required ${uid} user is missing, see /usr/src/UPDATING."; \ false; \ fi .endfor .for gid in ${CHECK_GIDS} @if ! `find / -prune -group ${gid} >/dev/null 2>&1`; then \ echo "ERROR: Required ${gid} group is missing, see /usr/src/UPDATING."; \ false; \ fi .endfor .endif # # If installing over the running system (DESTDIR is / or unset) and the install # includes rescue, try running rescue from the objdir as a sanity check. If # rescue is not functional (e.g., because it depends on a system call not # supported by the currently running kernel), abort the installation. # .if !make(distributeworld) && ${MK_RESCUE} != "no" && \ (empty(DESTDIR) || ${DESTDIR} == "/") && empty(BYPASS_INSTALLCHECK_SH) _installcheck_world: __installcheck_sh_check __installcheck_sh_check: .PHONY @if [ "`${OBJTOP}/rescue/rescue/rescue sh -c 'echo OK'`" != \ OK ]; then \ echo "rescue/sh check failed, installation aborted" >&2; \ false; \ fi .endif # # Required install tools to be saved in a scratch dir for safety. # .if ${MK_ZONEINFO} != "no" _zoneinfo= zic tzsetup .endif ITOOLS= [ awk cap_mkdb cat chflags chmod chown cmp cp \ date echo egrep find grep id install ${_install-info} \ ln make mkdir mtree mv pwd_mkdb \ rm sed services_mkdb sh sort strip sysctl test true uname wc ${_zoneinfo} \ ${LOCAL_ITOOLS} # Needed for share/man .if ${MK_MAN_UTILS} != "no" ITOOLS+=makewhatis .endif # # distributeworld # # Distributes everything compiled by a `buildworld'. # # installworld # # Installs everything compiled by a 'buildworld'. 
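#
# A typical full upgrade sequence (illustrative only; build(7) and UPDATING
# are authoritative and include the etcupdate/mergemaster steps omitted here):
#	make buildworld
#	make buildkernel
#	make installkernel
#	(reboot)
#	make installworld
#	make delete-old
#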
# # Non-base distributions produced by the base system EXTRA_DISTRIBUTIONS= .if defined(LIBCOMPAT) EXTRA_DISTRIBUTIONS+= lib${libcompat} .endif .if ${MK_TESTS} != "no" EXTRA_DISTRIBUTIONS+= tests .endif DEBUG_DISTRIBUTIONS= .if ${MK_DEBUG_FILES} != "no" DEBUG_DISTRIBUTIONS+= base ${EXTRA_DISTRIBUTIONS:S,tests,,} .endif MTREE_MAGIC?= mtree 2.0 distributeworld installworld stageworld: _installcheck_world .PHONY mkdir -p ${INSTALLTMP} progs=$$(for prog in ${ITOOLS}; do \ if progpath=`which $$prog`; then \ echo $$progpath; \ else \ echo "Required tool $$prog not found in PATH." >&2; \ exit 1; \ fi; \ done); \ libs=$$(ldd -f "%o %p\n" -f "%o %p\n" $$progs 2>/dev/null | sort -u | \ while read line; do \ set -- $$line; \ if [ "$$2 $$3" != "not found" ]; then \ echo $$2; \ else \ echo "Required library $$1 not found." >&2; \ exit 1; \ fi; \ done); \ cp $$libs $$progs ${INSTALLTMP} cp -R $${PATH_LOCALE:-"/usr/share/locale"} ${INSTALLTMP}/locale .if defined(NO_ROOT) -mkdir -p ${METALOG:H} echo "#${MTREE_MAGIC}" > ${METALOG} .endif .if make(distributeworld) .for dist in ${EXTRA_DISTRIBUTIONS} -mkdir ${DESTDIR}/${DISTDIR}/${dist} ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.root.dist \ -p ${DESTDIR}/${DISTDIR}/${dist} >/dev/null ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.usr.dist \ -p ${DESTDIR}/${DISTDIR}/${dist}/usr >/dev/null ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.include.dist \ -p ${DESTDIR}/${DISTDIR}/${dist}/usr/include >/dev/null .if ${MK_DEBUG_FILES} != "no" ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.debug.dist \ -p ${DESTDIR}/${DISTDIR}/${dist}/usr/lib >/dev/null .endif .if defined(LIBCOMPAT) ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.lib${libcompat}.dist \ -p ${DESTDIR}/${DISTDIR}/${dist}/usr >/dev/null .if ${MK_DEBUG_FILES} != "no" ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.lib${libcompat}.dist \ -p ${DESTDIR}/${DISTDIR}/${dist}/usr/lib/debug/usr >/dev/null .endif .endif .if ${MK_TESTS} != "no" && ${dist} == "tests" -mkdir -p ${DESTDIR}/${DISTDIR}/${dist}${TESTSBASE} ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.tests.dist \ -p ${DESTDIR}/${DISTDIR}/${dist}${TESTSBASE} >/dev/null .if ${MK_DEBUG_FILES} != "no" ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.tests.dist \ -p ${DESTDIR}/${DISTDIR}/${dist}/usr/lib/debug/${TESTSBASE} >/dev/null .endif .endif .if defined(NO_ROOT) ${IMAKEENV} ${MTREE} -C -f ${.CURDIR}/etc/mtree/BSD.root.dist | \ sed -e 's#^\./#./${dist}/#' >> ${METALOG} ${IMAKEENV} ${MTREE} -C -f ${.CURDIR}/etc/mtree/BSD.usr.dist | \ sed -e 's#^\./#./${dist}/usr/#' >> ${METALOG} ${IMAKEENV} ${MTREE} -C -f ${.CURDIR}/etc/mtree/BSD.include.dist | \ sed -e 's#^\./#./${dist}/usr/include/#' >> ${METALOG} .if defined(LIBCOMPAT) ${IMAKEENV} ${MTREE} -C -f ${.CURDIR}/etc/mtree/BSD.lib${libcompat}.dist | \ sed -e 's#^\./#./${dist}/usr/#' >> ${METALOG} .endif .endif .endfor -mkdir ${DESTDIR}/${DISTDIR}/base ${_+_}cd ${.CURDIR}/etc; ${CROSSENV} PATH=${TMPPATH} ${MAKE} \ METALOG=${METALOG} ${IMAKE_INSTALL} ${IMAKE_MTREE} \ DISTBASE=/base DESTDIR=${DESTDIR}/${DISTDIR}/base \ LOCAL_MTREE=${LOCAL_MTREE:Q} distrib-dirs ${INSTALL_SYMLINK} ${INSTALLFLAGS} usr/src/sys ${INSTALL_DDIR}/base/sys .endif ${_+_}cd ${.CURDIR}; ${IMAKE} re${.TARGET:S/world$//}; \ ${IMAKEENV} rm -rf ${INSTALLTMP} .if make(distributeworld) .for dist in ${EXTRA_DISTRIBUTIONS} find ${DESTDIR}/${DISTDIR}/${dist} -mindepth 1 -type d -empty -delete .endfor .if defined(NO_ROOT) .for dist in base ${EXTRA_DISTRIBUTIONS} @# For each file that exists in this dist, print the corresponding @# line from the METALOG. 
This relies on the fact that @# a line containing only the filename will sort immediately before @# the relevant mtree line. cd ${DESTDIR}/${DISTDIR}; \ find ./${dist} | sort -u ${METALOG} - | \ awk 'BEGIN { print "#${MTREE_MAGIC}" } !/ type=/ { file = $$1 } / type=/ { if ($$1 == file) { sub(/^\.\/${dist}\//, "./"); print } }' > \ ${DESTDIR}/${DISTDIR}/${dist}.meta .endfor .for dist in ${DEBUG_DISTRIBUTIONS} @# For each file that exists in this dist, print the corresponding @# line from the METALOG. This relies on the fact that @# a line containing only the filename will sort immediately before @# the relevant mtree line. cd ${DESTDIR}/${DISTDIR}; \ find ./${dist}/usr/lib/debug | sort -u ${METALOG} - | \ awk 'BEGIN { print "#${MTREE_MAGIC}" } !/ type=/ { file = $$1 } / type=/ { if ($$1 == file) { sub(/^\.\/${dist}\//, "./"); print } }' > \ ${DESTDIR}/${DISTDIR}/${dist}.debug.meta .endfor .endif .endif packageworld: .PHONY .for dist in base ${EXTRA_DISTRIBUTIONS} .if defined(NO_ROOT) ${_+_}cd ${DESTDIR}/${DISTDIR}/${dist}; \ tar cvf - --exclude usr/lib/debug \ @${DESTDIR}/${DISTDIR}/${dist}.meta | \ ${XZ_CMD} > ${PACKAGEDIR}/${dist}.txz .else ${_+_}cd ${DESTDIR}/${DISTDIR}/${dist}; \ tar cvf - --exclude usr/lib/debug . | \ ${XZ_CMD} > ${PACKAGEDIR}/${dist}.txz .endif .endfor .for dist in ${DEBUG_DISTRIBUTIONS} . if defined(NO_ROOT) ${_+_}cd ${DESTDIR}/${DISTDIR}/${dist}; \ tar cvf - @${DESTDIR}/${DISTDIR}/${dist}.debug.meta | \ ${XZ_CMD} > ${PACKAGEDIR}/${dist}-dbg.txz . else ${_+_}cd ${DESTDIR}/${DISTDIR}/${dist}; \ tar cvLf - usr/lib/debug | \ ${XZ_CMD} > ${PACKAGEDIR}/${dist}-dbg.txz . endif .endfor _sysent_dirs= sys/kern _sysent_dirs+= sys/compat/freebsd32 _sysent_dirs+= sys/amd64/linux \ sys/amd64/linux32 \ sys/arm64/linux \ sys/i386/linux sysent: .PHONY .for _dir in ${_sysent_dirs} ${_+_}${MAKE} -C ${.CURDIR}/${_dir} sysent .endfor # # reinstall # # If you have a build server, you can NFS mount the source and obj directories # and do a 'make reinstall' on the *client* to install new binaries from the # most recent server build. 
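# Illustrative client-side example (hostname and paths are placeholders):
#	mount buildserver:/usr/src /usr/src
#	mount buildserver:/usr/obj /usr/obj
#	cd /usr/src && make reinstall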
# restage reinstall: .MAKE .PHONY @echo "--------------------------------------------------------------" @echo ">>> Making hierarchy" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${MAKE} -f Makefile.inc1 \ LOCAL_MTREE=${LOCAL_MTREE:Q} hierarchy .if make(restage) @echo "--------------------------------------------------------------" @echo ">>> Making distribution" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${MAKE} -f Makefile.inc1 \ LOCAL_MTREE=${LOCAL_MTREE:Q} distribution .endif @echo @echo "--------------------------------------------------------------" @echo ">>> Installing everything started on `LC_ALL=C date`" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${MAKE} -f Makefile.inc1 install .if defined(LIBCOMPAT) ${_+_}cd ${.CURDIR}; ${MAKE} -f Makefile.inc1 install${libcompat} .endif @echo "--------------------------------------------------------------" @echo ">>> Installing everything completed on `LC_ALL=C date`" @echo "--------------------------------------------------------------" redistribute: .MAKE .PHONY @echo "--------------------------------------------------------------" @echo ">>> Distributing everything" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${MAKE} -f Makefile.inc1 distribute .if defined(LIBCOMPAT) ${_+_}cd ${.CURDIR}; ${MAKE} -f Makefile.inc1 distribute${libcompat} \ DISTRIBUTION=lib${libcompat} .endif distrib-dirs distribution: .MAKE .PHONY ${_+_}cd ${.CURDIR}/etc; ${CROSSENV} PATH=${TMPPATH} ${MAKE} \ ${IMAKE_INSTALL} ${IMAKE_MTREE} METALOG=${METALOG} ${.TARGET} .if make(distribution) ${_+_}cd ${.CURDIR}; ${CROSSENV} PATH=${TMPPATH} \ ${MAKE} -f Makefile.inc1 ${IMAKE_INSTALL} \ METALOG=${METALOG} MK_TESTS=no installconfig .endif # # buildkernel and installkernel # # Which kernels to build and/or install is specified by setting # KERNCONF. If not defined a GENERIC kernel is built/installed. # Only the existing (depending TARGET) config files are used # for building kernels and only the first of these is designated # as the one being installed. # # Note that we have to use TARGET instead of TARGET_ARCH when # we're in kernel-land. Since only TARGET_ARCH is (expected) to # be set to cross-build, we have to make sure TARGET is set # properly. .if defined(KERNFAST) NO_KERNELCLEAN= t NO_KERNELCONFIG= t NO_KERNELOBJ= t # Shortcut for KERNCONF=Blah -DKERNFAST is now KERNFAST=Blah .if !defined(KERNCONF) && ${KERNFAST} != "1" KERNCONF=${KERNFAST} .endif .endif .if ${TARGET_ARCH} == "powerpc64" KERNCONF?= GENERIC64 .else KERNCONF?= GENERIC .endif INSTKERNNAME?= kernel KERNSRCDIR?= ${.CURDIR}/sys KRNLCONFDIR= ${KERNSRCDIR}/${TARGET}/conf KRNLOBJDIR= ${OBJTOP}${KERNSRCDIR:C,^${.CURDIR},,} KERNCONFDIR?= ${KRNLCONFDIR} BUILDKERNELS= INSTALLKERNEL= .if defined(NO_INSTALLKERNEL) # All of the BUILDKERNELS loops start at index 1. 
BUILDKERNELS+= dummy .endif .for _kernel in ${KERNCONF} .if !defined(_MKSHOWCONFIG) && exists(${KERNCONFDIR}/${_kernel}) BUILDKERNELS+= ${_kernel} .if empty(INSTALLKERNEL) && !defined(NO_INSTALLKERNEL) INSTALLKERNEL= ${_kernel} .endif .else .if make(buildkernel) .error Missing KERNCONF ${KERNCONFDIR}/${_kernel} .endif .endif .endfor _cleankernobj_fast_depend_hack: .PHONY # 20180320 remove stale generated assym.s after renaming to .inc in r331254 @if [ -e "${OBJTOP}/sys/${KERNCONF}/assym.s" ]; then \ echo "Removing stale generated assym files"; \ rm -f ${OBJTOP}/sys/${KERNCONF}/assym.* \ ${OBJTOP}/sys/${KERNCONF}/.depend.assym.*; \ fi ${WMAKE_TGTS:N_worldtmp:Nbuild${libcompat}} ${.ALLTARGETS:M_*:N_worldtmp}: .MAKE .PHONY # record kernel(s) build time in seconds .if make(buildkernel) _BUILDKERNEL_START!= date '+%s' .endif # # buildkernel # # Builds all kernels defined by BUILDKERNELS. # buildkernel: .MAKE .PHONY .if empty(BUILDKERNELS:Ndummy) @echo "ERROR: Missing kernel configuration file(s) (${KERNCONF})."; \ false .endif @echo .for _kernel in ${BUILDKERNELS:Ndummy} @echo "--------------------------------------------------------------" @echo ">>> Kernel build for ${_kernel} started on `LC_ALL=C date`" @echo "--------------------------------------------------------------" @echo "===> ${_kernel}" mkdir -p ${KRNLOBJDIR} .if !defined(NO_KERNELCONFIG) @echo @echo "--------------------------------------------------------------" @echo ">>> stage 1: configuring the kernel" @echo "--------------------------------------------------------------" cd ${KRNLCONFDIR}; \ PATH=${TMPPATH} \ config ${CONFIGARGS} -d ${KRNLOBJDIR}/${_kernel} \ -I '${KERNCONFDIR}' '${KERNCONFDIR}/${_kernel}' .endif .if !defined(NO_CLEAN) && !defined(NO_KERNELCLEAN) @echo @echo "--------------------------------------------------------------" @echo ">>> stage 2.1: cleaning up the object tree" @echo "--------------------------------------------------------------" ${_+_}cd ${KRNLOBJDIR}/${_kernel}; ${KMAKE} ${CLEANDIR} .else ${_+_}cd ${.CURDIR}; ${WMAKE} _cleankernobj_fast_depend_hack .endif .if !defined(NO_KERNELOBJ) @echo @echo "--------------------------------------------------------------" @echo ">>> stage 2.2: rebuilding the object tree" @echo "--------------------------------------------------------------" ${_+_}cd ${KRNLOBJDIR}/${_kernel}; ${KMAKE} obj .endif @echo @echo "--------------------------------------------------------------" @echo ">>> stage 2.3: build tools" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${KTMAKE} kernel-tools @echo @echo "--------------------------------------------------------------" @echo ">>> stage 3.1: building everything" @echo "--------------------------------------------------------------" ${_+_}cd ${KRNLOBJDIR}/${_kernel}; ${KMAKE} all -DNO_MODULES_OBJ @echo "--------------------------------------------------------------" @echo ">>> Kernel build for ${_kernel} completed on `LC_ALL=C date`" @echo "--------------------------------------------------------------" .endfor @seconds=$$(($$(date '+%s') - ${_BUILDKERNEL_START})); \ echo -n ">>> Kernel(s) ${BUILDKERNELS} built in $$seconds seconds, "; \ echo "ncpu: $$(sysctl -n hw.ncpu)${.MAKE.JOBS:S/^/, make -j/}" @echo "--------------------------------------------------------------" NO_INSTALLEXTRAKERNELS?= yes # # installkernel, etc. 
# # Install the kernel defined by INSTALLKERNEL # installkernel installkernel.debug \ reinstallkernel reinstallkernel.debug: _installcheck_kernel .PHONY .if !defined(NO_INSTALLKERNEL) .if empty(INSTALLKERNEL) @echo "ERROR: No kernel \"${KERNCONF}\" to install."; \ false .endif @echo "--------------------------------------------------------------" @echo ">>> Installing kernel ${INSTALLKERNEL} on $$(LC_ALL=C date)" @echo "--------------------------------------------------------------" ${_+_}cd ${KRNLOBJDIR}/${INSTALLKERNEL}; \ ${CROSSENV} PATH=${TMPPATH} \ ${MAKE} ${IMAKE_INSTALL} KERNEL=${INSTKERNNAME} ${.TARGET:S/kernel//} @echo "--------------------------------------------------------------" @echo ">>> Installing kernel ${INSTALLKERNEL} completed on $$(LC_ALL=C date)" @echo "--------------------------------------------------------------" .endif .if ${BUILDKERNELS:[#]} > 1 && ${NO_INSTALLEXTRAKERNELS} != "yes" .for _kernel in ${BUILDKERNELS:[2..-1]} @echo "--------------------------------------------------------------" @echo ">>> Installing kernel ${_kernel} $$(LC_ALL=C date)" @echo "--------------------------------------------------------------" ${_+_}cd ${KRNLOBJDIR}/${_kernel}; \ ${CROSSENV} PATH=${TMPPATH} \ ${MAKE} ${IMAKE_INSTALL} KERNEL=${INSTKERNNAME}.${_kernel} ${.TARGET:S/kernel//} @echo "--------------------------------------------------------------" @echo ">>> Installing kernel ${_kernel} completed on $$(LC_ALL=C date)" @echo "--------------------------------------------------------------" .endfor .endif distributekernel distributekernel.debug: .PHONY .if !defined(NO_INSTALLKERNEL) .if empty(INSTALLKERNEL) @echo "ERROR: No kernel \"${KERNCONF}\" to install."; \ false .endif mkdir -p ${DESTDIR}/${DISTDIR} .if defined(NO_ROOT) @echo "#${MTREE_MAGIC}" > ${DESTDIR}/${DISTDIR}/kernel.premeta .endif ${_+_}cd ${KRNLOBJDIR}/${INSTALLKERNEL}; \ ${IMAKEENV} ${IMAKE_INSTALL:S/METALOG/kernel.premeta/} \ ${IMAKE_MTREE} PATH=${TMPPATH} ${MAKE} KERNEL=${INSTKERNNAME} \ DESTDIR=${INSTALL_DDIR}/kernel \ ${.TARGET:S/distributekernel/install/} .if defined(NO_ROOT) @sed -e 's|^./kernel|.|' ${DESTDIR}/${DISTDIR}/kernel.premeta > \ ${DESTDIR}/${DISTDIR}/kernel.meta .endif .endif .if ${BUILDKERNELS:[#]} > 1 && ${NO_INSTALLEXTRAKERNELS} != "yes" .for _kernel in ${BUILDKERNELS:[2..-1]} .if defined(NO_ROOT) @echo "#${MTREE_MAGIC}" > ${DESTDIR}/${DISTDIR}/kernel.${_kernel}.premeta .endif ${_+_}cd ${KRNLOBJDIR}/${_kernel}; \ ${IMAKEENV} ${IMAKE_INSTALL:S/METALOG/kernel.${_kernel}.premeta/} \ ${IMAKE_MTREE} PATH=${TMPPATH} ${MAKE} \ KERNEL=${INSTKERNNAME}.${_kernel} \ DESTDIR=${INSTALL_DDIR}/kernel.${_kernel} \ ${.TARGET:S/distributekernel/install/} .if defined(NO_ROOT) @sed -e "s|^./kernel.${_kernel}|.|" \ ${DESTDIR}/${DISTDIR}/kernel.${_kernel}.premeta > \ ${DESTDIR}/${DISTDIR}/kernel.${_kernel}.meta .endif .endfor .endif packagekernel: .PHONY .if defined(NO_ROOT) .if !defined(NO_INSTALLKERNEL) cd ${DESTDIR}/${DISTDIR}/kernel; \ tar cvf - --exclude '*.debug' \ @${DESTDIR}/${DISTDIR}/kernel.meta | \ ${XZ_CMD} > ${PACKAGEDIR}/kernel.txz .endif .if ${MK_DEBUG_FILES} != "no" cd ${DESTDIR}/${DISTDIR}/kernel; \ tar cvf - --include '*/*/*.debug' \ @${DESTDIR}/${DISTDIR}/kernel.meta | \ ${XZ_CMD} > ${DESTDIR}/${DISTDIR}/kernel-dbg.txz .endif .if ${BUILDKERNELS:[#]} > 1 && ${NO_INSTALLEXTRAKERNELS} != "yes" .for _kernel in ${BUILDKERNELS:[2..-1]} cd ${DESTDIR}/${DISTDIR}/kernel.${_kernel}; \ tar cvf - --exclude '*.debug' \ @${DESTDIR}/${DISTDIR}/kernel.${_kernel}.meta | \ ${XZ_CMD} > 
${PACKAGEDIR}/kernel.${_kernel}.txz .if ${MK_DEBUG_FILES} != "no" cd ${DESTDIR}/${DISTDIR}/kernel.${_kernel}; \ tar cvf - --include '*/*/*.debug' \ @${DESTDIR}/${DISTDIR}/kernel.${_kernel}.meta | \ ${XZ_CMD} > ${DESTDIR}/${DISTDIR}/kernel.${_kernel}-dbg.txz .endif .endfor .endif .else .if !defined(NO_INSTALLKERNEL) cd ${DESTDIR}/${DISTDIR}/kernel; \ tar cvf - --exclude '*.debug' . | \ ${XZ_CMD} > ${PACKAGEDIR}/kernel.txz .endif .if ${MK_DEBUG_FILES} != "no" cd ${DESTDIR}/${DISTDIR}/kernel; \ tar cvf - --include '*/*/*.debug' $$(eval find .) | \ ${XZ_CMD} > ${DESTDIR}/${DISTDIR}/kernel-dbg.txz .endif .if ${BUILDKERNELS:[#]} > 1 && ${NO_INSTALLEXTRAKERNELS} != "yes" .for _kernel in ${BUILDKERNELS:[2..-1]} cd ${DESTDIR}/${DISTDIR}/kernel.${_kernel}; \ tar cvf - --exclude '*.debug' . | \ ${XZ_CMD} > ${PACKAGEDIR}/kernel.${_kernel}.txz .if ${MK_DEBUG_FILES} != "no" cd ${DESTDIR}/${DISTDIR}/kernel.${_kernel}; \ tar cvf - --include '*/*/*.debug' $$(eval find .) | \ ${XZ_CMD} > ${DESTDIR}/${DISTDIR}/kernel.${_kernel}-dbg.txz .endif .endfor .endif .endif stagekernel: .PHONY ${_+_}${MAKE} -C ${.CURDIR} ${.MAKEFLAGS} distributekernel PORTSDIR?= /usr/ports WSTAGEDIR?= ${OBJTOP}/worldstage KSTAGEDIR?= ${OBJTOP}/kernelstage REPODIR?= ${OBJROOT}repo PKGSIGNKEY?= # empty .ORDER: stage-packages create-packages .ORDER: create-packages create-world-packages .ORDER: create-packages create-kernel-packages .ORDER: create-packages sign-packages _pkgbootstrap: .PHONY .if make(*package*) && !exists(${LOCALBASE}/sbin/pkg) @env ASSUME_ALWAYS_YES=YES pkg bootstrap .endif packages: .PHONY ${_+_}${MAKE} -C ${.CURDIR} PKG_VERSION=${PKG_VERSION} real-packages package-pkg: .PHONY rm -rf /tmp/ports.${TARGET} || : env ${WMAKEENV:Q} SRCDIR=${.CURDIR} PORTSDIR=${PORTSDIR} REVISION=${_REVISION} \ PKG_CMD=${PKG_CMD} PKG_VERSION=${PKG_VERSION} REPODIR=${REPODIR} \ WSTAGEDIR=${WSTAGEDIR} \ sh ${.CURDIR}/release/scripts/make-pkg-package.sh real-packages: stage-packages create-packages sign-packages .PHONY stage-packages-world: .PHONY @mkdir -p ${WSTAGEDIR} ${_+_}@cd ${.CURDIR}; \ ${MAKE} DESTDIR=${WSTAGEDIR} -DNO_ROOT stageworld stage-packages-kernel: .PHONY @mkdir -p ${KSTAGEDIR} ${_+_}@cd ${.CURDIR}; \ ${MAKE} DESTDIR=${KSTAGEDIR} -DNO_ROOT stagekernel stage-packages: .PHONY stage-packages-world stage-packages-kernel _repodir: .PHONY @mkdir -p ${REPODIR} create-packages-world: _pkgbootstrap _repodir .PHONY ${_+_}@cd ${.CURDIR}; \ ${MAKE} -f Makefile.inc1 \ DESTDIR=${WSTAGEDIR} \ PKG_VERSION=${PKG_VERSION} create-world-packages create-packages-kernel: _pkgbootstrap _repodir .PHONY ${_+_}@cd ${.CURDIR}; \ ${MAKE} -f Makefile.inc1 \ DESTDIR=${KSTAGEDIR} \ PKG_VERSION=${PKG_VERSION} DISTDIR=kernel \ create-kernel-packages create-packages: .PHONY create-packages-world create-packages-kernel create-world-packages: _pkgbootstrap .PHONY @rm -f ${WSTAGEDIR}/*.plist 2>/dev/null || : @cd ${WSTAGEDIR} ; \ env -i LC_COLLATE=C sort ${WSTAGEDIR}/${DISTDIR}/METALOG | \ awk -f ${SRCDIR}/release/scripts/mtree-to-plist.awk @for plist in ${WSTAGEDIR}/*.plist; do \ plist=$${plist##*/} ; \ pkgname=$${plist%.plist} ; \ echo "_PKGS+= $${pkgname}" ; \ done > ${WSTAGEDIR}/packages.mk ${_+_}@cd ${.CURDIR}; \ ${MAKE} -f Makefile.inc1 create-world-packages-jobs \ .MAKE.JOB.PREFIX= .if make(create-world-packages-jobs) .include "${WSTAGEDIR}/packages.mk" .endif create-world-packages-jobs: .PHONY .for pkgname in ${_PKGS} create-world-packages-jobs: create-world-package-${pkgname} create-world-package-${pkgname}: .PHONY @sh 
${SRCDIR}/release/packages/generate-ucl.sh -o ${pkgname} \ -s ${SRCDIR} -u ${WSTAGEDIR}/${pkgname}.ucl @awk -F\" ' \ /^name/ { printf("===> Creating %s-", $$2); next } \ /^version/ { print $$2; next } \ ' ${WSTAGEDIR}/${pkgname}.ucl @if [ "${pkgname}" == "runtime" ]; then \ sed -i '' -e "s/%VCS_REVISION%/${VCS_REVISION}/" ${WSTAGEDIR}/${pkgname}.ucl ; \ fi ${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname -o ALLOW_BASE_SHLIBS=yes \ create -M ${WSTAGEDIR}/${pkgname}.ucl \ -p ${WSTAGEDIR}/${pkgname}.plist \ -r ${WSTAGEDIR} \ -o ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname config ABI)/${PKG_VERSION} .endfor _default_flavor= -default .if make(*package*) && exists(${KSTAGEDIR}/kernel.meta) . if ${MK_DEBUG_FILES} != "no" _debug=-debug . endif create-kernel-packages: .PHONY . for flavor in "" ${_debug} create-kernel-packages: create-kernel-packages-flavor${flavor:C,^""$,${_default_flavor},} create-kernel-packages-flavor${flavor:C,^""$,${_default_flavor},}: _pkgbootstrap .PHONY @cd ${KSTAGEDIR}/${DISTDIR} ; \ env -i LC_COLLATE=C sort ${KSTAGEDIR}/kernel.meta | \ awk -f ${SRCDIR}/release/scripts/mtree-to-plist.awk \ -v kernel=yes -v _kernconf=${INSTALLKERNEL} ; \ sed -e "s/%VERSION%/${PKG_VERSION}/" \ -e "s/%PKGNAME%/kernel-${INSTALLKERNEL:tl}${flavor}/" \ -e "s/%KERNELDIR%/kernel/" \ -e "s/%COMMENT%/FreeBSD ${INSTALLKERNEL} kernel ${flavor}/" \ -e "s/%DESC%/FreeBSD ${INSTALLKERNEL} kernel ${flavor}/" \ -e "s/ %VCS_REVISION%/${VCS_REVISION}/" \ ${SRCDIR}/release/packages/kernel.ucl \ > ${KSTAGEDIR}/${DISTDIR}/kernel.${INSTALLKERNEL}${flavor}.ucl ; \ awk -F\" ' \ /name/ { printf("===> Creating %s-", $$2); next } \ /version/ {print $$2; next } ' \ ${KSTAGEDIR}/${DISTDIR}/kernel.${INSTALLKERNEL}${flavor}.ucl ; \ ${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname -o ALLOW_BASE_SHLIBS=yes \ create -M ${KSTAGEDIR}/${DISTDIR}/kernel.${INSTALLKERNEL}${flavor}.ucl \ -p ${KSTAGEDIR}/${DISTDIR}/kernel.${INSTALLKERNEL}${flavor}.plist \ -r ${KSTAGEDIR}/${DISTDIR} \ -o ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname config ABI)/${PKG_VERSION} . endfor .endif .if ${BUILDKERNELS:[#]} > 1 && ${NO_INSTALLEXTRAKERNELS} != "yes" . for _kernel in ${BUILDKERNELS:[2..-1]} . if exists(${KSTAGEDIR}/kernel.${_kernel}.meta) . if ${MK_DEBUG_FILES} != "no" _debug=-debug . endif . 
for flavor in "" ${_debug} create-kernel-packages: create-kernel-packages-extra-flavor${flavor:C,^""$,${_default_flavor},}-${_kernel} create-kernel-packages-extra-flavor${flavor:C,^""$,${_default_flavor},}-${_kernel}: _pkgbootstrap .PHONY @cd ${KSTAGEDIR}/kernel.${_kernel} ; \ env -i LC_COLLATE=C sort ${KSTAGEDIR}/kernel.${_kernel}.meta | \ awk -f ${SRCDIR}/release/scripts/mtree-to-plist.awk \ -v kernel=yes -v _kernconf=${_kernel} ; \ sed -e "s/%VERSION%/${PKG_VERSION}/" \ -e "s/%PKGNAME%/kernel-${_kernel:tl}${flavor}/" \ -e "s/%KERNELDIR%/kernel.${_kernel}/" \ -e "s/%COMMENT%/FreeBSD ${_kernel} kernel ${flavor}/" \ -e "s/%DESC%/FreeBSD ${_kernel} kernel ${flavor}/" \ -e "s/ %VCS_REVISION%/${VCS_REVISION}/" \ ${SRCDIR}/release/packages/kernel.ucl \ > ${KSTAGEDIR}/kernel.${_kernel}/kernel.${_kernel}${flavor}.ucl ; \ awk -F\" ' \ /name/ { printf("===> Creating %s-", $$2); next } \ /version/ {print $$2; next } ' \ ${KSTAGEDIR}/kernel.${_kernel}/kernel.${_kernel}${flavor}.ucl ; \ ${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname -o ALLOW_BASE_SHLIBS=yes \ create -M ${KSTAGEDIR}/kernel.${_kernel}/kernel.${_kernel}${flavor}.ucl \ -p ${KSTAGEDIR}/kernel.${_kernel}/kernel.${_kernel}${flavor}.plist \ -r ${KSTAGEDIR}/kernel.${_kernel} \ -o ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname config ABI)/${PKG_VERSION} . endfor . endif . endfor .endif sign-packages: _pkgbootstrap .PHONY @[ -L "${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname config ABI)/latest" ] && \ unlink ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname config ABI)/latest ; \ ${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname repo \ -o ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname config ABI)/${PKG_VERSION} \ ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname config ABI)/${PKG_VERSION} \ ${PKGSIGNKEY} ; \ cd ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname config ABI); \ ln -s ${PKG_VERSION} latest # # # checkworld # # Run test suite on installed world. # checkworld: .PHONY @if [ ! -x "${LOCALBASE}/bin/kyua" ]; then \ echo "You need kyua (devel/kyua) to run the test suite." | /usr/bin/fmt; \ exit 1; \ fi ${_+_}PATH="$$PATH:${LOCALBASE}/bin" kyua test -k ${TESTSBASE}/Kyuafile # # # doxygen # # Build the API documentation with doxygen # doxygen: .PHONY @if [ ! -x "${LOCALBASE}/bin/doxygen" ]; then \ echo "You need doxygen (devel/doxygen) to generate the API documentation of the kernel." | /usr/bin/fmt; \ exit 1; \ fi ${_+_}cd ${.CURDIR}/tools/kerneldoc/subsys; ${MAKE} obj all # # update # # Update the source tree(s), by running svn/svnup to update to the # latest copy. # update: .PHONY .if defined(SVN_UPDATE) @echo "--------------------------------------------------------------" @echo ">>> Updating ${.CURDIR} using Subversion" @echo "--------------------------------------------------------------" @(cd ${.CURDIR}; ${SVN_CMD} update ${SVNFLAGS}) .endif # # ------------------------------------------------------------------------ # # From here onwards are utility targets used by the 'make world' and # related targets. If your 'world' breaks, you may like to try to fix # the problem and manually run the following targets to attempt to # complete the build. Beware, this is *not* guaranteed to work, you # need to have a pretty good grip on the current state of the system # to attempt to manually finish it. If in doubt, 'make world' again. # # # legacy: Build compatibility shims for the next three targets. 
This is a # minimal set of tools and shims necessary to compensate for older systems # which don't have the APIs required by the targets built in bootstrap-tools, # build-tools or cross-tools. # # libnv and libl are both requirements for config(8), which is an unconditional # bootstrap-tool. _config_deps= lib/libnv usr.bin/lex/lib legacy: .PHONY .if ${BOOTSTRAPPING} < ${MINIMUM_SUPPORTED_OSREL} && ${BOOTSTRAPPING} != 0 @echo "ERROR: Source upgrades from versions prior to ${MINIMUM_SUPPORTED_REL} are not supported."; \ false .endif .for _tool in tools/build ${_config_deps} ${_+_}@${ECHODIR} "===> ${_tool} (obj,includes,all,install)"; \ cd ${.CURDIR}/${_tool}; \ if [ -z "${NO_OBJWALK}" ]; then ${MAKE} DIRPRFX=${_tool}/ obj; fi; \ ${MAKE} DIRPRFX=${_tool}/ DESTDIR=${WORLDTMP}/legacy includes; \ ${MAKE} DIRPRFX=${_tool}/ MK_INCLUDES=no all; \ ${MAKE} DIRPRFX=${_tool}/ MK_INCLUDES=no \ DESTDIR=${WORLDTMP}/legacy install .endfor # # bootstrap-tools: Build tools needed for compatibility. These are binaries that # are built to build other binaries in the system. However, the focus of these # binaries is usually quite narrow. Bootstrap tools use the host's compiler and # libraries, augmented by -legacy, in addition to the libraries built during # bootstrap-tools. # _bt= _bootstrap-tools # We want to run the build with only ${WORLDTMP} in $PATH to ensure we don't # accidentally run tools that are incompatible but happen to be in $PATH. # This is especially important when building on Linux/MacOS where many of the # programs used during the build accept different flags or generate different # output. On those platforms we only symlink the tools known to be compatible # (e.g. basic utilities such as mkdir) into ${WORLDTMP} and build all others # from the FreeBSD sources during the bootstrap-tools stage. # We want to build without the user's $PATH starting in the bootstrap-tools # phase so the tools used in that phase (ln, cp, etc) must have already been # linked to $WORLDTMP. The tools are listed in the _host_tools_to_symlink # variable in tools/build/Makefile and are linked during the legacy phase. # Since they could be Linux or MacOS binaries, too we must only use flags that # are portable across operating systems. # If BOOTSTRAP_ALL_TOOLS is set we will build all the required tools from the # current source tree. Otherwise we create a symlink to the version found in # $PATH during the bootstrap-tools stage. .if defined(BOOTSTRAP_ALL_TOOLS) # BOOTSTRAPPING will be set on the command line so we can't override it here. # Instead set BOOTSTRAPPING_OSRELDATE so that the value 0 is set ${BSARGS} BOOTSTRAPPING_OSRELDATE:= 0 .endif .if ${MK_GAMES} != "no" _strfile= usr.bin/fortune/strfile .endif .if ${MK_GCC} != "no" && ${MK_CXX} != "no" _gperf= gnu/usr.bin/gperf .endif .if ${MK_VT} != "no" _vtfontcvt= usr.bin/vtfontcvt .endif # If we are not building the bootstrap because BOOTSTRAPPING is sufficient # we symlink the host version to $WORLDTMP instead. By doing this we can also # detect when a bootstrap tool is being used without the required MK_FOO. # If you add a new bootstrap tool where we could also use the host version, # please ensure that you also add a .else case where you add the tool to the # _bootstrap_tools_links variable. .if ${BOOTSTRAPPING} < 1000033 _m4= usr.bin/m4 _lex= usr.bin/lex # Note: lex needs m4 to build but m4 also depends on lex. However, lex can be # bootstrapped so we build lex first. 
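# Purely illustrative sketch of the pattern described above (the tool "foo"
# and the version cutoff 1300999 are hypothetical): bootstrap the tool on old
# hosts, otherwise symlink the host binary into WORLDTMP:
#   .if ${BOOTSTRAPPING} < 1300999
#   _foo=	usr.bin/foo		# host foo lacks the needed feature
#   .else
#   _bootstrap_tools_links+=foo	# host foo is new enough; just link it
#   .endif
# (${_foo} would then also be listed in the bootstrap-tools loop below.)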
${_bt}-usr.bin/m4: ${_bt}-lib/libopenbsd ${_bt}-usr.bin/yacc ${_bt}-${_lex} _bt_m4_depend=${_bt}-${_m4} _bt_lex_depend=${_bt}-${_lex} ${_bt_m4_depend} .else _bootstrap_tools_links+=m4 lex .endif # ELF Tool Chain libraries are needed for ELF tools and dtrace tools. # r296685 fix cross-endian objcopy # r310724 fixed PR 215350, a crash in libdwarf with objects built by GCC 6.2. # r334881 added libdwarf constants used by ctfconvert. # r338478 fixed a crash in objcopy for mips64el objects # r339083 libelf: correct mips64el test to use ELF header # r348347 Add missing powerpc64 relocation support to libdwarf .if ${BOOTSTRAPPING} < 1300030 _elftoolchain_libs= lib/libelf lib/libdwarf ${_bt}-lib/libelf: ${_bt_m4_depend} ${_bt}-lib/libdwarf: ${_bt_m4_depend} .endif # r245440 mtree -N support added # r313404 requires sha384.h for libnetbsd, added to libmd in r292782 .if ${BOOTSTRAPPING} < 1100093 _nmtree= lib/libmd \ lib/libnetbsd \ usr.sbin/nmtree ${_bt}-lib/libnetbsd: ${_bt}-lib/libmd ${_bt}-usr.sbin/nmtree: ${_bt}-lib/libnetbsd .else _bootstrap_tools_links+=mtree .endif # r246097: log addition login.conf.db, passwd, pwd.db, and spwd.db with cat -l .if ${BOOTSTRAPPING} < 1000027 _cat= bin/cat .else _bootstrap_tools_links+=cat .endif # r277259 crunchide: Correct 64-bit section header offset # r281674 crunchide: always include both 32- and 64-bit ELF support .if ${BOOTSTRAPPING} < 1100078 _crunchide= usr.sbin/crunch/crunchide .else _bootstrap_tools_links+=crunchide .endif # r285986 crunchen: use STRIPBIN rather than STRIP # 1100113: Support MK_AUTO_OBJ # 1200006: META_MODE fixes .if ${BOOTSTRAPPING} < 1100078 || \ (${MK_AUTO_OBJ} == "yes" && ${BOOTSTRAPPING} < 1100114) || \ (${MK_META_MODE} == "yes" && ${BOOTSTRAPPING} < 1200006) _crunchgen= usr.sbin/crunch/crunchgen .else _bootstrap_tools_links+=crunchgen .endif # r296926 -P keymap search path, MFC to stable/10 in r298297 .if ${BOOTSTRAPPING} < 1003501 || \ (${BOOTSTRAPPING} >= 1100000 && ${BOOTSTRAPPING} < 1100103) _kbdcontrol= usr.sbin/kbdcontrol .else _bootstrap_tools_links+=kbdcontrol .endif _yacc= lib/liby \ usr.bin/yacc ${_bt}-usr.bin/yacc: ${_bt}-lib/liby .if ${MK_BSNMP} != "no" _gensnmptree= usr.sbin/bsnmpd/gensnmptree .endif .if ${MK_LOCALES} != "no" _localedef= usr.bin/localedef .endif # We need to build tblgen when we're building clang or lld, either as # bootstrap tools, or as the part of the normal build. .if ${MK_CLANG_BOOTSTRAP} != "no" || ${MK_CLANG} != "no" || \ ${MK_LLD_BOOTSTRAP} != "no" || ${MK_LLD} != "no" _clang_tblgen= \ lib/clang/libllvmminimal \ usr.bin/clang/llvm-tblgen \ usr.bin/clang/clang-tblgen \ usr.bin/clang/lldb-tblgen # XXX: lldb-tblgen is not needed, if top-level MK_LLDB=no ${_bt}-usr.bin/clang/clang-tblgen: ${_bt}-lib/clang/libllvmminimal ${_bt}-usr.bin/clang/llvm-tblgen: ${_bt}-lib/clang/libllvmminimal ${_bt}-usr.bin/clang/lldb-tblgen: ${_bt}-lib/clang/libllvmminimal .endif # Default to building the GPL DTC, but build the BSDL one if users explicitly # request it. 
_dtc= usr.bin/dtc .if ${MK_GPL_DTC} != "no" _dtc= gnu/usr.bin/dtc .endif .if ${MK_LOCALES} != "no" _localedef= usr.bin/localedef .endif .if ${MK_KERBEROS} != "no" _kerberos5_bootstrap_tools= \ kerberos5/tools/make-roken \ kerberos5/lib/libroken \ kerberos5/lib/libvers \ kerberos5/tools/asn1_compile \ kerberos5/tools/slc \ usr.bin/compile_et .ORDER: ${_kerberos5_bootstrap_tools:C/^/${_bt}-/g} .for _tool in ${_kerberos5_bootstrap_tools} ${_bt}-${_tool}: ${_bt}-usr.bin/yacc ${_bt_lex_depend} .endfor .endif ${_bt}-usr.bin/mandoc: ${_bt}-lib/libopenbsd # The tools listed in _basic_bootstrap_tools will generally not be # bootstrapped unless BOOTSTRAP_ALL_TOOL is set. However, when building on a # Linux or MacOS host the host versions are incompatible so we need to build # them from the source tree. Usually the link name will be the same as the subdir, # but some directories such as grep or test install multiple binaries. In that # case we use the _basic_bootstrap_tools_multilink variable which is a list of # subdirectory and comma-separated list of files. _basic_bootstrap_tools_multilink=usr.bin/grep grep,egrep,fgrep _basic_bootstrap_tools_multilink+=bin/test test,[ # bootstrap tools needed by buildworld: _basic_bootstrap_tools=usr.bin/awk usr.bin/cut bin/expr usr.bin/gencat \ usr.bin/join usr.bin/mktemp bin/rmdir usr.bin/sed usr.bin/sort \ usr.bin/truncate usr.bin/tsort # elf2aout is required for sparc64 build _basic_bootstrap_tools+=usr.bin/elf2aout # file2c is required for building usr.sbin/config: _basic_bootstrap_tools+=usr.bin/file2c # uuencode/uudecode required for share/tabset _basic_bootstrap_tools+=usr.bin/uuencode usr.bin/uudecode # xargs is required by mkioctls _basic_bootstrap_tools+=usr.bin/xargs # cap_mkdb is required for share/termcap: _basic_bootstrap_tools+=usr.bin/cap_mkdb # ldd is required for installcheck (TODO: just always use /usr/bin/ldd instead?) _basic_bootstrap_tools+=usr.bin/ldd # services_mkdb/pwd_mkdb are required for installworld: _basic_bootstrap_tools+=usr.sbin/services_mkdb usr.sbin/pwd_mkdb # sysctl/chflags are required for installkernel: _basic_bootstrap_tools+=sbin/sysctl bin/chflags # mkfifo is used by sys/conf/newvers.sh _basic_bootstrap_tools+=usr.bin/mkfifo .if ${MK_AMD} != "no" # unifdef is only used by usr.sbin/amd/libamu/Makefile _basic_bootstrap_tools+=usr.bin/unifdef .endif .if ${MK_BOOT} != "no" _basic_bootstrap_tools+=bin/dd # xz/unxz is used by EFI _basic_bootstrap_tools_multilink+=usr.bin/xz xz,unxz # md5 is used by boot/beri (and possibly others) _basic_bootstrap_tools+=sbin/md5 .if defined(BOOTSTRAP_ALL_TOOLS) ${_bt}-sbin/md5: ${_bt}-lib/libmd .endif .endif .if ${MK_ZONEINFO} != "no" _basic_bootstrap_tools+=usr.sbin/zic usr.sbin/tzsetup .endif .if defined(BOOTSTRAP_ALL_TOOLS) _other_bootstrap_tools+=${_basic_bootstrap_tools} .for _subdir _links in ${_basic_bootstrap_tools_multilink} _other_bootstrap_tools+=${_subdir} .endfor ${_bt}-usr.bin/awk: ${_bt_lex_depend} ${_bt}-usr.bin/yacc ${_bt}-bin/expr: ${_bt_lex_depend} ${_bt}-usr.bin/yacc # If we are bootstrapping file2c, we have to build it before config: ${_bt}-usr.sbin/config: ${_bt}-usr.bin/file2c ${_bt_lex_depend} # Note: no symlink to make/bmake in the !BOOTSTRAP_ALL_TOOLS case here since # the links to make/bmake make links will have already have been created in the # `make legacy` step. Not adding a link to make is important on non-FreeBSD # since "make" will usually point to GNU make there. 
_other_bootstrap_tools+=usr.bin/bmake .else # All tools in _basic_bootstrap_tools have the same name as the subdirectory # so we can use :T to get the name of the symlinks that we need to create. _bootstrap_tools_links+=${_basic_bootstrap_tools:T} .for _subdir _links in ${_basic_bootstrap_tools_multilink} _bootstrap_tools_links+=${_links:S/,/ /g} .endfor .endif # defined(BOOTSTRAP_ALL_TOOLS) # Link the tools that we need for building but don't need to bootstrap because # the host version is known to be compatible into ${WORLDTMP}/legacy # We do this before building any of the bootstrap tools in case they depend on # the presence of any of the links (e.g. as m4/lex/awk) ${_bt}-links: .PHONY .for _tool in ${_bootstrap_tools_links} ${_bt}-link-${_tool}: .PHONY .MAKE @if [ ! -e "${WORLDTMP}/legacy/bin/${_tool}" ]; then \ source_path=`which ${_tool}`; \ if [ ! -e "$${source_path}" ] ; then \ echo "Cannot find host tool '${_tool}'"; false; \ fi; \ ln -sfnv "$${source_path}" "${WORLDTMP}/legacy/bin/${_tool}"; \ fi ${_bt}-links: ${_bt}-link-${_tool} .endfor bootstrap-tools: ${_bt}-links .PHONY # Please document (add comment) why something is in 'bootstrap-tools'. # Try to bound the building of the bootstrap-tool to just the # FreeBSD versions that need the tool built at this stage of the build. .for _tool in \ ${_clang_tblgen} \ ${_kerberos5_bootstrap_tools} \ ${_strfile} \ ${_gperf} \ ${_dtc} \ ${_cat} \ ${_kbdcontrol} \ ${_elftoolchain_libs} \ usr.bin/lorder \ lib/libopenbsd \ usr.bin/mandoc \ usr.bin/rpcgen \ ${_yacc} \ ${_m4} \ ${_lex} \ ${_other_bootstrap_tools} \ usr.bin/xinstall \ ${_gensnmptree} \ usr.sbin/config \ ${_crunchide} \ ${_crunchgen} \ ${_nmtree} \ ${_vtfontcvt} \ ${_localedef} ${_bt}-${_tool}: ${_bt}-links .PHONY .MAKE ${_+_}@${ECHODIR} "===> ${_tool} (obj,all,install)"; \ cd ${.CURDIR}/${_tool}; \ if [ -z "${NO_OBJWALK}" ]; then ${MAKE} DIRPRFX=${_tool}/ obj; fi; \ if [ "${_tool}" = "usr.bin/lex" ]; then \ ${MAKE} DIRPRFX=${_tool}/ bootstrap; \ fi; \ ${MAKE} DIRPRFX=${_tool}/ all; \ ${MAKE} DIRPRFX=${_tool}/ DESTDIR=${WORLDTMP}/legacy install bootstrap-tools: ${_bt}-${_tool} .endfor # # build-tools: Build special purpose build tools # .if !defined(NO_SHARE) && ${MK_SYSCONS} != "no" _share= share/syscons/scrnmaps .endif .if ${MK_GCC} != "no" _gcc_tools= gnu/usr.bin/cc/cc_tools .endif .if ${MK_RESCUE} != "no" # rescue includes programs that have build-tools targets _rescue=rescue/rescue .endif .if ${MK_TCSH} != "no" _tcsh=bin/csh .endif .if ${MK_FILE} != "no" _libmagic=lib/libmagic .endif .if ${MK_PMC} != "no" && \ (${TARGET_ARCH} == "amd64" || ${TARGET_ARCH} == "i386") _jevents=lib/libpmc/pmu-events .endif # kernel-toolchain skips _cleanobj, so handle cleaning up previous # build-tools directories if needed. 
.if !defined(NO_CLEAN) && make(kernel-toolchain) _bt_clean= ${CLEANDIR} .endif .for _tool in \ ${_tcsh} \ bin/sh \ ${LOCAL_TOOL_DIRS} \ ${_jevents} \ lib/ncurses/ncurses \ lib/ncurses/ncursesw \ ${_rescue} \ ${_share} \ usr.bin/awk \ ${_libmagic} \ usr.bin/mkesdb_static \ usr.bin/mkcsmapper_static \ usr.bin/vi/catalog \ ${_gcc_tools} build-tools_${_tool}: .PHONY ${_+_}@${ECHODIR} "===> ${_tool} (${_bt_clean:D${_bt_clean},}obj,build-tools)"; \ cd ${.CURDIR}/${_tool}; \ if [ -n "${_bt_clean}" ]; then ${MAKE} DIRPRFX=${_tool}/ ${_bt_clean}; fi; \ if [ -z "${NO_OBJWALK}" ]; then ${MAKE} DIRPRFX=${_tool}/ obj; fi; \ ${MAKE} DIRPRFX=${_tool}/ build-tools build-tools: build-tools_${_tool} .endfor # # kernel-tools: Build kernel-building tools # kernel-tools: .PHONY mkdir -p ${WORLDTMP}/usr ${WORLDTMP_MTREE} -f ${.CURDIR}/etc/mtree/BSD.usr.dist \ -p ${WORLDTMP}/usr >/dev/null # # cross-tools: All the tools needed to build the rest of the system after # we get done with the earlier stages. It is the last set of tools needed # to begin building the target binaries. # .if ${TARGET_ARCH} != ${MACHINE_ARCH} || ${BUILD_WITH_STRICT_TMPPATH} != 0 .if ${TARGET_ARCH} == "amd64" || ${TARGET_ARCH} == "i386" _btxld= usr.sbin/btxld .endif .endif # Rebuild ctfconvert and ctfmerge to avoid difficult-to-diagnose failures # resulting from missing bug fixes or ELF Toolchain updates. .if ${MK_CDDL} != "no" _dtrace_tools= cddl/lib/libctf cddl/usr.bin/ctfconvert \ cddl/usr.bin/ctfmerge .endif # If we're given an XAS, don't build binutils. .if ${XAS:M/*} == "" .if ${MK_BINUTILS_BOOTSTRAP} != "no" _binutils= gnu/usr.bin/binutils .endif .if ${MK_ELFTOOLCHAIN_BOOTSTRAP} != "no" _elftctools= lib/libelftc \ lib/libpe \ usr.bin/objcopy \ usr.bin/nm \ usr.bin/size \ usr.bin/strings # These are not required by the build, but can be useful for developers who # cross-build on a FreeBSD 10 host: _elftctools+= usr.bin/addr2line .endif .elif ${TARGET_ARCH} != ${MACHINE_ARCH} && ${MK_ELFTOOLCHAIN_BOOTSTRAP} != "no" # If cross-building with an external binutils we still need to build strip for # the target (for at least crunchide). _elftctools= lib/libelftc \ lib/libpe \ usr.bin/objcopy .endif .if ${MK_CLANG_BOOTSTRAP} != "no" _clang= usr.bin/clang .endif .if ${MK_LLD_BOOTSTRAP} != "no" _lld= usr.bin/clang/lld .endif .if ${MK_CLANG_BOOTSTRAP} != "no" || ${MK_LLD_BOOTSTRAP} != "no" _clang_libs= lib/clang .endif .if ${MK_GCC_BOOTSTRAP} != "no" _gcc= gnu/usr.bin/cc .endif .if ${MK_USB} != "no" _usb_tools= stand/usb/tools .endif .if ${BUILD_WITH_STRICT_TMPPATH} != 0 || defined(BOOTSTRAP_ALL_TOOLS) _ar=usr.bin/ar .endif cross-tools: .MAKE .PHONY .for _tool in \ ${LOCAL_XTOOL_DIRS} \ ${_ar} \ ${_clang_libs} \ ${_clang} \ ${_lld} \ ${_binutils} \ ${_elftctools} \ ${_dtrace_tools} \ ${_gcc} \ ${_btxld} \ ${_usb_tools} ${_+_}@${ECHODIR} "===> ${_tool} (obj,all,install)"; \ cd ${.CURDIR}/${_tool}; \ if [ -z "${NO_OBJWALK}" ]; then ${MAKE} DIRPRFX=${_tool}/ obj; fi; \ ${MAKE} DIRPRFX=${_tool}/ all; \ ${MAKE} DIRPRFX=${_tool}/ DESTDIR=${WORLDTMP} install .endfor # # native-xtools is the current target for qemu-user cross builds of ports # via poudriere and the imgact_binmisc kernel module. # This target merely builds a toolchan/sysroot, then builds the tools it wants # with the options it wants in a special MAKEOBJDIRPREFIX, using the toolchain # already built. It then installs the static tools to NXBDESTDIR for Poudriere # to pickup. 
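# Illustrative invocation (the target/arch values and jail path are examples
# only); native-xtools-install places the static tools under ${DESTDIR}${NXTP},
# i.e. <DESTDIR>/nxb-bin by default:
#   make native-xtools NXB_TARGET=mips NXB_TARGET_ARCH=mips64
#   make native-xtools-install DESTDIR=/path/to/target/jail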
# NXBOBJROOT= ${OBJROOT}${MACHINE}.${MACHINE_ARCH}/nxb/ NXBOBJTOP= ${NXBOBJROOT}${NXB_TARGET}.${NXB_TARGET_ARCH} NXTP?= /nxb-bin .if ${NXTP:N/*} .error NXTP variable should be an absolute path .endif NXBDESTDIR?= ${DESTDIR}${NXTP} # This is the list of tools to be built/installed as static and where # appropriate to build for the given TARGET.TARGET_ARCH. NXBDIRS+= \ bin/cat \ bin/chmod \ bin/cp \ ${_tcsh} \ bin/echo \ bin/expr \ bin/hostname \ bin/ln \ bin/ls \ bin/mkdir \ bin/mv \ bin/ps \ bin/realpath \ bin/rm \ bin/rmdir \ bin/sh \ bin/sleep \ sbin/md5 \ sbin/sysctl \ usr.bin/addr2line \ usr.bin/ar \ usr.bin/awk \ usr.bin/basename \ usr.bin/bmake \ usr.bin/bzip2 \ usr.bin/cmp \ usr.bin/diff \ usr.bin/dirname \ usr.bin/objcopy \ usr.bin/env \ usr.bin/fetch \ usr.bin/find \ usr.bin/grep \ usr.bin/gzip \ usr.bin/head \ usr.bin/id \ usr.bin/lex \ usr.bin/limits \ usr.bin/lorder \ usr.bin/mandoc \ usr.bin/mktemp \ usr.bin/mt \ usr.bin/nm \ usr.bin/patch \ usr.bin/readelf \ usr.bin/sed \ usr.bin/size \ usr.bin/sort \ usr.bin/strings \ usr.bin/tar \ usr.bin/touch \ usr.bin/tr \ usr.bin/true \ usr.bin/uniq \ usr.bin/unzip \ usr.bin/wc \ usr.bin/xargs \ usr.bin/xinstall \ usr.bin/xz \ usr.bin/yacc \ usr.sbin/chown SUBDIR_DEPEND_usr.bin/clang= lib/clang .if ${MK_CLANG} != "no" NXBDIRS+= lib/clang NXBDIRS+= usr.bin/clang .endif .if ${MK_GCC} != "no" NXBDIRS+= gnu/usr.bin/cc .endif .if ${MK_BINUTILS} != "no" NXBDIRS+= gnu/usr.bin/binutils .endif # XXX: native-xtools passes along ${NXBDIRS} in SUBDIR_OVERRIDE that needs # to be evaluated after NXBDIRS is set. .if make(install) && !empty(SUBDIR_OVERRIDE) SUBDIR= ${SUBDIR_OVERRIDE} .endif NXBMAKEARGS+= \ OBJTOP=${NXBOBJTOP:Q} \ OBJROOT=${NXBOBJROOT:Q} \ MAKEOBJDIRPREFIX= \ -DNO_SHARED \ -DNO_CPU_CFLAGS \ -DNO_PIC \ SSP_CFLAGS= \ MK_CASPER=no \ MK_CLANG_EXTRAS=no \ MK_CLANG_FULL=no \ MK_CTF=no \ MK_DEBUG_FILES=no \ MK_GDB=no \ MK_HTML=no \ MK_LLDB=no \ MK_MAN=no \ MK_MAN_UTILS=yes \ MK_OFED=no \ MK_OPENSSH=no \ MK_PROFILE=no \ MK_RETPOLINE=no \ MK_SENDMAIL=no \ MK_SVNLITE=no \ MK_TESTS=no \ MK_WARNS=no \ MK_ZFS=no .if make(native-xtools*) && \ (!defined(NXB_TARGET) || !defined(NXB_TARGET_ARCH)) .error Missing NXB_TARGET / NXB_TARGET_ARCH .endif # For 'toolchain' we want to produce native binaries that themselves generate # native binaries. NXBTMAKE= ${NXBMAKEENV} ${MAKE} ${NXBMAKEARGS:N-DNO_PIC:N-DNO_SHARED} \ TARGET=${MACHINE} TARGET_ARCH=${MACHINE_ARCH} # For 'everything' we want to produce native binaries (hence -target to # be MACHINE) that themselves generate TARGET.TARGET_ARCH binaries. # TARGET/TARGET_ARCH are still passed along from user. # # Use the toolchain we create as an external toolchain. .if ${USING_SYSTEM_COMPILER} == "yes" || ${XCC:N${CCACHE_BIN}:M/*} NXBMAKE+= XCC="${XCC}" \ XCXX="${XCXX}" \ XCPP="${XCPP}" .else NXBMAKE+= XCC="${NXBOBJTOP}/tmp/usr/bin/cc" \ XCXX="${NXBOBJTOP}/tmp/usr/bin/c++" \ XCPP="${NXBOBJTOP}/tmp/usr/bin/cpp" .endif NXBMAKE+= ${NXBMAKEENV} ${MAKE} -f Makefile.inc1 ${NXBMAKEARGS} \ TARGET=${NXB_TARGET} TARGET_ARCH=${NXB_TARGET_ARCH} \ TARGET_TRIPLE=${MACHINE_TRIPLE:Q} # NXBDIRS is improperly based on MACHINE rather than NXB_TARGET. Need to # invoke a sub-make to reevaluate MK_GCC, etc, for NXBDIRS. NXBMAKE+= SUBDIR_OVERRIDE='$${NXBDIRS:M*}' # Need to avoid the -isystem logic when using clang as an external toolchain # even if the TARGET being built for wants GCC. 
NXBMAKE+= WANT_COMPILER_TYPE='$${X_COMPILER_TYPE}' native-xtools: .PHONY ${_+_}cd ${.CURDIR}; ${NXBTMAKE} _cleanobj MK_GCC=yes # Build the bootstrap/host/cross tools that produce native binaries # Pass along MK_GCC=yes to ensure GCC-needed build tools are built. # We don't quite know what the NXB_TARGET wants so just build it. ${_+_}cd ${.CURDIR}; ${NXBTMAKE} kernel-toolchain MK_GCC=yes # Populate includes/libraries sysroot that produce native binaries. # This is split out from 'toolchain' above mostly so that target LLVM # libraries have a proper LLVM_DEFAULT_TARGET_TRIPLE without # polluting the cross-compiler build. The LLVM/GCC libs are skipped # here to avoid the problem but are kept in 'toolchain' so that # needed build tools are built. ${_+_}cd ${.CURDIR}; ${NXBTMAKE} _includes MK_CLANG=no MK_GCC=no ${_+_}cd ${.CURDIR}; ${NXBTMAKE} _libraries MK_CLANG=no MK_GCC=no # Clean out improper TARGET=MACHINE files ${_+_}cd ${.CURDIR}/gnu/usr.bin/cc/cc_tools; ${NXBTMAKE} cleandir .if !defined(NO_OBJWALK) ${_+_}cd ${.CURDIR}; ${NXBMAKE} _obj .endif ${_+_}cd ${.CURDIR}; ${NXBMAKE} everything @echo ">> native-xtools done. Use 'make native-xtools-install' to install to a given DESTDIR" native-xtools-install: .PHONY mkdir -p ${NXBDESTDIR}/bin ${NXBDESTDIR}/sbin ${NXBDESTDIR}/usr ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.usr.dist \ -p ${NXBDESTDIR}/usr >/dev/null ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.include.dist \ -p ${NXBDESTDIR}/usr/include >/dev/null ${_+_}cd ${.CURDIR}; ${NXBMAKE} \ DESTDIR=${NXBDESTDIR} \ -DNO_ROOT \ install # # hierarchy - ensure that all the needed directories are present # hierarchy hier: .MAKE .PHONY ${_+_}cd ${.CURDIR}/etc; ${HMAKE} distrib-dirs # # libraries - build all libraries, and install them under ${DESTDIR}. # # The list of libraries with dependents (${_prebuild_libs}) and their # interdependencies (__L) are built automatically by the # ${.CURDIR}/tools/make_libdeps.sh script. # libraries: .MAKE .PHONY ${_+_}cd ${.CURDIR}; \ ${MAKE} -f Makefile.inc1 _prereq_libs; \ ${MAKE} -f Makefile.inc1 _startup_libs; \ ${MAKE} -f Makefile.inc1 _prebuild_libs; \ ${MAKE} -f Makefile.inc1 _generic_libs # # static libgcc.a prerequisite for shared libc # _prereq_libs= lib/libcompiler_rt .if ${MK_SSP} != "no" _prereq_libs+= gnu/lib/libssp/libssp_nonshared .endif # These dependencies are not automatically generated: # # gnu/lib/csu, gnu/lib/libgcc, lib/csu and lib/libc must be built before # all shared libraries for ELF. 
# _startup_libs= lib/csu .if ${MK_BSD_CRTBEGIN} == "no" _startup_libs+= gnu/lib/csu .endif _startup_libs+= lib/libcompiler_rt _startup_libs+= lib/libc _startup_libs+= lib/libc_nonshared .if ${MK_LIBCPLUSPLUS} != "no" _startup_libs+= lib/libcxxrt .endif .if ${MK_LLVM_LIBUNWIND} != "no" _prereq_libs+= lib/libgcc_eh lib/libgcc_s _startup_libs+= lib/libgcc_eh lib/libgcc_s lib/libgcc_s__L: lib/libc__L lib/libgcc_s__L: lib/libc_nonshared__L .if ${MK_LIBCPLUSPLUS} != "no" lib/libcxxrt__L: lib/libgcc_s__L .endif .else # MK_LLVM_LIBUNWIND == no _prereq_libs+= gnu/lib/libgcc _startup_libs+= gnu/lib/libgcc gnu/lib/libgcc__L: lib/libc__L gnu/lib/libgcc__L: lib/libc_nonshared__L .if ${MK_LIBCPLUSPLUS} != "no" lib/libcxxrt__L: gnu/lib/libgcc__L .endif .endif _prebuild_libs= ${_kerberos5_lib_libasn1} \ ${_kerberos5_lib_libhdb} \ ${_kerberos5_lib_libheimbase} \ ${_kerberos5_lib_libheimntlm} \ ${_libsqlite3} \ ${_kerberos5_lib_libheimipcc} \ ${_kerberos5_lib_libhx509} ${_kerberos5_lib_libkrb5} \ ${_kerberos5_lib_libroken} \ ${_kerberos5_lib_libwind} \ lib/libbz2 ${_libcom_err} lib/libcrypt \ lib/libelf lib/libexpat \ lib/libfigpar \ ${_lib_libgssapi} \ lib/libkiconv lib/libkvm lib/liblzma lib/libmd lib/libnv \ lib/libzstd \ ${_lib_casper} \ lib/ncurses/ncurses lib/ncurses/ncursesw \ lib/libopie lib/libpam/libpam ${_lib_libthr} \ ${_lib_libradius} lib/libsbuf lib/libtacplus \ lib/libgeom \ ${_cddl_lib_libumem} ${_cddl_lib_libnvpair} \ ${_cddl_lib_libuutil} \ ${_cddl_lib_libavl} \ ${_cddl_lib_libzfs_core} ${_cddl_lib_libzfs} \ ${_cddl_lib_libctf} \ lib/libufs \ lib/libutil lib/libpjdlog ${_lib_libypclnt} lib/libz lib/msun \ ${_secure_lib_libcrypto} ${_secure_lib_libssl} \ ${_lib_libldns} ${_secure_lib_libssh} .if ${MK_GNUCXX} != "no" _prebuild_libs+= gnu/lib/libstdc++ gnu/lib/libsupc++ gnu/lib/libstdc++__L: lib/msun__L gnu/lib/libsupc++__L: gnu/lib/libstdc++__L .endif .if ${MK_DIALOG} != "no" _prebuild_libs+= gnu/lib/libdialog gnu/lib/libdialog__L: lib/msun__L lib/ncurses/ncursesw__L .endif .if ${MK_LIBCPLUSPLUS} != "no" _prebuild_libs+= lib/libc++ .endif lib/libgeom__L: lib/libexpat__L lib/libkvm__L: lib/libelf__L .if ${MK_LIBTHR} != "no" _lib_libthr= lib/libthr .endif .if ${MK_RADIUS_SUPPORT} != "no" _lib_libradius= lib/libradius .endif .if ${MK_OFED} != "no" _prebuild_libs+= \ lib/ofed/libibverbs \ lib/ofed/libibmad \ lib/ofed/libibumad \ lib/ofed/complib \ lib/ofed/libmlx5 lib/ofed/libibmad__L: lib/ofed/libibumad__L lib/ofed/complib__L: lib/libthr__L lib/ofed/libmlx5__L: lib/ofed/libibverbs__L lib/libthr__L .endif .if ${MK_CASPER} != "no" _lib_casper= lib/libcasper .endif lib/libpjdlog__L: lib/libutil__L lib/libcasper__L: lib/libnv__L lib/liblzma__L: lib/libthr__L +lib/libzstd__L: lib/libthr__L _generic_libs= ${_cddl_lib} gnu/lib ${_kerberos5_lib} lib ${_secure_lib} usr.bin/lex/lib .if ${MK_IPFILTER} != "no" _generic_libs+= sbin/ipf/libipf .endif .for _DIR in ${LOCAL_LIB_DIRS} .if ${_DIR} == ".WAIT" || (empty(_generic_libs:M${_DIR}) && exists(${.CURDIR}/${_DIR}/Makefile)) _generic_libs+= ${_DIR} .endif .endfor lib/libopie__L lib/libtacplus__L: lib/libmd__L .if ${MK_CDDL} != "no" _cddl_lib_libumem= cddl/lib/libumem _cddl_lib_libnvpair= cddl/lib/libnvpair _cddl_lib_libavl= cddl/lib/libavl _cddl_lib_libuutil= cddl/lib/libuutil .if ${MK_ZFS} != "no" _cddl_lib_libzfs_core= cddl/lib/libzfs_core _cddl_lib_libzfs= cddl/lib/libzfs cddl/lib/libzfs_core__L: cddl/lib/libnvpair__L cddl/lib/libzfs__L: cddl/lib/libzfs_core__L lib/msun__L lib/libutil__L cddl/lib/libzfs__L: lib/libthr__L lib/libmd__L lib/libz__L 
cddl/lib/libumem__L cddl/lib/libzfs__L: cddl/lib/libuutil__L cddl/lib/libavl__L lib/libgeom__L lib/libbe__L: cddl/lib/libzfs__L .endif _cddl_lib_libctf= cddl/lib/libctf _cddl_lib= cddl/lib cddl/lib/libctf__L: lib/libz__L .endif # cddl/lib/libdtrace requires lib/libproc and lib/librtld_db; it's only built # on select architectures though (see cddl/lib/Makefile) .if ${MACHINE_CPUARCH} != "sparc64" _prebuild_libs+= lib/libprocstat lib/libproc lib/librtld_db lib/libprocstat__L: lib/libelf__L lib/libkvm__L lib/libutil__L lib/libproc__L: lib/libprocstat__L lib/librtld_db__L: lib/libprocstat__L .endif .if ${MK_CRYPT} != "no" .if ${MK_OPENSSL} != "no" _secure_lib_libcrypto= secure/lib/libcrypto _secure_lib_libssl= secure/lib/libssl lib/libradius__L secure/lib/libssl__L: secure/lib/libcrypto__L secure/lib/libcrypto__L: lib/libthr__L .if ${MK_LDNS} != "no" _lib_libldns= lib/libldns lib/libldns__L: secure/lib/libssl__L .endif .if ${MK_OPENSSH} != "no" _secure_lib_libssh= secure/lib/libssh secure/lib/libssh__L: lib/libz__L secure/lib/libcrypto__L lib/libcrypt__L .if ${MK_LDNS} != "no" secure/lib/libssh__L: lib/libldns__L .endif .if ${MK_GSSAPI} != "no" && ${MK_KERBEROS_SUPPORT} != "no" secure/lib/libssh__L: lib/libgssapi__L kerberos5/lib/libkrb5__L \ kerberos5/lib/libhx509__L kerberos5/lib/libasn1__L lib/libcom_err__L \ lib/libmd__L kerberos5/lib/libroken__L .endif .endif .endif _secure_lib= secure/lib .endif .if ${MK_KERBEROS} != "no" kerberos5/lib/libasn1__L: lib/libcom_err__L kerberos5/lib/libroken__L kerberos5/lib/libhdb__L: kerberos5/lib/libasn1__L lib/libcom_err__L \ kerberos5/lib/libkrb5__L kerberos5/lib/libroken__L \ kerberos5/lib/libwind__L lib/libsqlite3__L kerberos5/lib/libheimntlm__L: secure/lib/libcrypto__L kerberos5/lib/libkrb5__L \ kerberos5/lib/libroken__L lib/libcom_err__L kerberos5/lib/libhx509__L: kerberos5/lib/libasn1__L lib/libcom_err__L \ secure/lib/libcrypto__L kerberos5/lib/libroken__L kerberos5/lib/libwind__L kerberos5/lib/libkrb5__L: kerberos5/lib/libasn1__L lib/libcom_err__L \ lib/libcrypt__L secure/lib/libcrypto__L kerberos5/lib/libhx509__L \ kerberos5/lib/libroken__L kerberos5/lib/libwind__L \ kerberos5/lib/libheimbase__L kerberos5/lib/libheimipcc__L kerberos5/lib/libroken__L: lib/libcrypt__L kerberos5/lib/libwind__L: kerberos5/lib/libroken__L lib/libcom_err__L kerberos5/lib/libheimbase__L: lib/libthr__L kerberos5/lib/libheimipcc__L: kerberos5/lib/libroken__L kerberos5/lib/libheimbase__L lib/libthr__L .endif lib/libsqlite3__L: lib/libthr__L .if ${MK_GSSAPI} != "no" _lib_libgssapi= lib/libgssapi .endif .if ${MK_KERBEROS} != "no" _kerberos5_lib= kerberos5/lib _kerberos5_lib_libasn1= kerberos5/lib/libasn1 _kerberos5_lib_libhdb= kerberos5/lib/libhdb _kerberos5_lib_libheimbase= kerberos5/lib/libheimbase _kerberos5_lib_libkrb5= kerberos5/lib/libkrb5 _kerberos5_lib_libhx509= kerberos5/lib/libhx509 _kerberos5_lib_libroken= kerberos5/lib/libroken _kerberos5_lib_libheimntlm= kerberos5/lib/libheimntlm _libsqlite3= lib/libsqlite3 _kerberos5_lib_libheimipcc= kerberos5/lib/libheimipcc _kerberos5_lib_libwind= kerberos5/lib/libwind _libcom_err= lib/libcom_err .endif .if ${MK_NIS} != "no" _lib_libypclnt= lib/libypclnt .endif .if ${MK_OPENSSL} == "no" lib/libradius__L: lib/libmd__L .endif lib/libproc__L: \ ${_cddl_lib_libctf:D${_cddl_lib_libctf}__L} lib/libelf__L lib/librtld_db__L lib/libutil__L .if ${MK_CXX} != "no" .if ${MK_LIBCPLUSPLUS} != "no" lib/libproc__L: lib/libcxxrt__L .else # This implies MK_GNUCXX != "no"; see lib/libproc lib/libproc__L: gnu/lib/libsupc++__L .endif .endif .for 
_lib in ${_prereq_libs} ${_lib}__PL: .PHONY .MAKE .if !defined(_MKSHOWCONFIG) && exists(${.CURDIR}/${_lib}) ${_+_}@${ECHODIR} "===> ${_lib} (obj,all,install)"; \ cd ${.CURDIR}/${_lib}; \ if [ -z "${NO_OBJWALK}" ]; then ${MAKE} MK_TESTS=no DIRPRFX=${_lib}/ obj; fi; \ ${MAKE} MK_TESTS=no MK_PROFILE=no -DNO_PIC \ DIRPRFX=${_lib}/ all; \ ${MAKE} MK_TESTS=no MK_PROFILE=no -DNO_PIC \ DIRPRFX=${_lib}/ install .endif .endfor .for _lib in ${_startup_libs} ${_prebuild_libs} ${_generic_libs} ${_lib}__L: .PHONY .MAKE .if !defined(_MKSHOWCONFIG) && exists(${.CURDIR}/${_lib}) ${_+_}@${ECHODIR} "===> ${_lib} (obj,all,install)"; \ cd ${.CURDIR}/${_lib}; \ if [ -z "${NO_OBJWALK}" ]; then ${MAKE} MK_TESTS=no DIRPRFX=${_lib}/ obj; fi; \ ${MAKE} MK_TESTS=no DIRPRFX=${_lib}/ all; \ ${MAKE} MK_TESTS=no DIRPRFX=${_lib}/ install .endif .endfor _prereq_libs: ${_prereq_libs:S/$/__PL/} _startup_libs: ${_startup_libs:S/$/__L/} _prebuild_libs: ${_prebuild_libs:S/$/__L/} _generic_libs: ${_generic_libs:S/$/__L/} # Enable SUBDIR_PARALLEL when not calling 'make all', unless called from # 'everything' with _PARALLEL_SUBDIR_OK set. This is because it is unlikely # that running 'make all' from the top-level, especially with a SUBDIR_OVERRIDE # or LOCAL_DIRS set, will have a reliable build if SUBDIRs are built in # parallel. This is safe for the world stage of buildworld though since it has # already built libraries in a proper order and installed includes into # WORLDTMP. Special handling is done for SUBDIR ordering for 'install*' to # avoid trashing a system if it crashes mid-install. .if !make(all) || defined(_PARALLEL_SUBDIR_OK) SUBDIR_PARALLEL= .endif .include .if make(check-old) || make(check-old-dirs) || \ make(check-old-files) || make(check-old-libs) || \ make(delete-old) || make(delete-old-dirs) || \ make(delete-old-files) || make(delete-old-libs) # # check for / delete old files section # .include "ObsoleteFiles.inc" OLD_LIBS_MESSAGE="Please be sure no application still uses those libraries, \ else you can not start such an application. Consult UPDATING for more \ information regarding how to cope with the removal/revision bump of a \ specific library." .if !defined(BATCH_DELETE_OLD_FILES) RM_I=-i .else RM_I=-v .endif delete-old-files: .PHONY @echo ">>> Removing old files (only deletes safe to delete libs)" # Ask for every old file if the user really wants to remove it. # It's annoying, but better safe than sorry. # NB: We cannot pass the list of OLD_FILES as a parameter because the # argument list will get too long. Using .for/.endfor make "loops" will make # the Makefile parser segfault. @exec 3<&0; \ cd ${.CURDIR}; \ ${MAKE} -f ${.CURDIR}/Makefile.inc1 ${.MAKEFLAGS} ${.TARGET} \ -V OLD_FILES -V "OLD_FILES:Musr/share/*.gz:R" | xargs -n1 | sort | \ while read file; do \ if [ -f "${DESTDIR}/$${file}" -o -L "${DESTDIR}/$${file}" ]; then \ chflags noschg "${DESTDIR}/$${file}" 2>/dev/null || true; \ rm ${RM_I} "${DESTDIR}/$${file}" <&3; \ fi; \ for ext in debug symbols; do \ if ! [ -e "${DESTDIR}/$${file}" ] && [ -f \ "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}" ]; then \ rm ${RM_I} "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}" \ <&3; \ fi; \ done; \ done # Remove catpages without corresponding manpages. @exec 3<&0; \ find ${DESTDIR}/usr/share/man/cat* ! -type d 2>/dev/null | sort | \ sed -ep -e's:${DESTDIR}/usr/share/man/cat:${DESTDIR}/usr/share/man/man:' | \ while read catpage; do \ read manpage; \ if [ ! 
-e "$${manpage}" ]; then \ rm ${RM_I} $${catpage} <&3; \ fi; \ done @echo ">>> Old files removed" check-old-files: .PHONY @echo ">>> Checking for old files" @cd ${.CURDIR}; \ ${MAKE} -f ${.CURDIR}/Makefile.inc1 ${.MAKEFLAGS} ${.TARGET} \ -V OLD_FILES -V "OLD_FILES:Musr/share/*.gz:R" | xargs -n1 | \ while read file; do \ if [ -f "${DESTDIR}/$${file}" -o -L "${DESTDIR}/$${file}" ]; then \ echo "${DESTDIR}/$${file}"; \ fi; \ for ext in debug symbols; do \ if [ -f "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}" ]; then \ echo "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}"; \ fi; \ done; \ done | sort # Check for catpages without corresponding manpages. @find ${DESTDIR}/usr/share/man/cat* ! -type d 2>/dev/null | \ sed -ep -e's:${DESTDIR}/usr/share/man/cat:${DESTDIR}/usr/share/man/man:' | \ while read catpage; do \ read manpage; \ if [ ! -e "$${manpage}" ]; then \ echo $${catpage}; \ fi; \ done | sort delete-old-libs: .PHONY @echo ">>> Removing old libraries" @echo "${OLD_LIBS_MESSAGE}" | fmt @exec 3<&0; \ cd ${.CURDIR}; \ ${MAKE} -f ${.CURDIR}/Makefile.inc1 ${.MAKEFLAGS} ${.TARGET} \ -V OLD_LIBS | xargs -n1 | sort | \ while read file; do \ if [ -f "${DESTDIR}/$${file}" -o -L "${DESTDIR}/$${file}" ]; then \ chflags noschg "${DESTDIR}/$${file}" 2>/dev/null || true; \ rm ${RM_I} "${DESTDIR}/$${file}" <&3; \ fi; \ for ext in debug symbols; do \ if ! [ -e "${DESTDIR}/$${file}" ] && [ -f \ "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}" ]; then \ rm ${RM_I} "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}" \ <&3; \ fi; \ done; \ done @echo ">>> Old libraries removed" check-old-libs: .PHONY @echo ">>> Checking for old libraries" @cd ${.CURDIR}; \ ${MAKE} -f ${.CURDIR}/Makefile.inc1 ${.MAKEFLAGS} ${.TARGET} \ -V OLD_LIBS | xargs -n1 | \ while read file; do \ if [ -f "${DESTDIR}/$${file}" -o -L "${DESTDIR}/$${file}" ]; then \ echo "${DESTDIR}/$${file}"; \ fi; \ for ext in debug symbols; do \ if [ -f "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}" ]; then \ echo "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}"; \ fi; \ done; \ done | sort delete-old-dirs: .PHONY @echo ">>> Removing old directories" @cd ${.CURDIR}; \ ${MAKE} -f ${.CURDIR}/Makefile.inc1 ${.MAKEFLAGS} ${.TARGET} \ -V OLD_DIRS | xargs -n1 | sort -r | \ while read dir; do \ if [ -d "${DESTDIR}/$${dir}" ]; then \ rmdir -v "${DESTDIR}/$${dir}" || true; \ elif [ -L "${DESTDIR}/$${dir}" ]; then \ echo "${DESTDIR}/$${dir} is a link, please remove everything manually."; \ fi; \ if [ -d "${DESTDIR}${DEBUGDIR}/$${dir}" ]; then \ rmdir -v "${DESTDIR}${DEBUGDIR}/$${dir}" || true; \ elif [ -L "${DESTDIR}${DEBUGDIR}/$${dir}" ]; then \ echo "${DESTDIR}${DEBUGDIR}/$${dir} is a link, please remove everything manually."; \ fi; \ done @echo ">>> Old directories removed" check-old-dirs: .PHONY @echo ">>> Checking for old directories" @cd ${.CURDIR}; \ ${MAKE} -f ${.CURDIR}/Makefile.inc1 ${.MAKEFLAGS} ${.TARGET} \ -V OLD_DIRS | xargs -n1 | sort -r | \ while read dir; do \ if [ -d "${DESTDIR}/$${dir}" ]; then \ echo "${DESTDIR}/$${dir}"; \ elif [ -L "${DESTDIR}/$${dir}" ]; then \ echo "${DESTDIR}/$${dir} is a link, please remove everything manually."; \ fi; \ if [ -d "${DESTDIR}${DEBUGDIR}/$${dir}" ]; then \ echo "${DESTDIR}${DEBUGDIR}/$${dir}"; \ elif [ -L "${DESTDIR}${DEBUGDIR}/$${dir}" ]; then \ echo "${DESTDIR}${DEBUGDIR}/$${dir} is a link, please remove everything manually."; \ fi; \ done delete-old: delete-old-files delete-old-dirs .PHONY @echo "To remove old libraries run '${MAKE_CMD} delete-old-libs'." 
check-old: check-old-files check-old-libs check-old-dirs .PHONY @echo "To remove old files and directories run '${MAKE_CMD} delete-old'." @echo "To remove old libraries run '${MAKE_CMD} delete-old-libs'." .endif # # showconfig - show build configuration. # showconfig: .PHONY @(${MAKE} -n -f ${.CURDIR}/sys/conf/kern.opts.mk -V dummy -dg1 UPDATE_DEPENDFILE=no NO_OBJ=yes; \ ${MAKE} -n -f ${.CURDIR}/share/mk/src.opts.mk -V dummy -dg1 UPDATE_DEPENDFILE=no NO_OBJ=yes) 2>&1 | grep ^MK_ | sort -u .if !empty(KRNLOBJDIR) && !empty(KERNCONF) DTBOUTPUTPATH= ${KRNLOBJDIR}/${KERNCONF}/ .if !defined(FDT_DTS_FILE) || empty(FDT_DTS_FILE) .if !defined(_MKSHOWCONFIG) && exists(${KERNCONFDIR}/${KERNCONF}) FDT_DTS_FILE!= awk 'BEGIN {FS="="} /^makeoptions[[:space:]]+FDT_DTS_FILE/ {print $$2}' \ '${KERNCONFDIR}/${KERNCONF}' ; echo .endif .endif .endif .if !defined(DTBOUTPUTPATH) || !exists(${DTBOUTPUTPATH}) DTBOUTPUTPATH= ${.CURDIR} .endif # # Build 'standalone' Device Tree Blob # builddtb: .PHONY @PATH=${TMPPATH} MACHINE=${TARGET} \ ${.CURDIR}/sys/tools/fdt/make_dtb.sh ${.CURDIR}/sys \ "${FDT_DTS_FILE}" ${DTBOUTPUTPATH} ############### # cleanworld # In the following, the first 'rm' in a series will usually remove all # files and directories. If it does not, then there are probably some # files with file flags set, so this unsets them and tries the 'rm' a # second time. There are situations where this target will be cleaning # some directories via more than one method, but that duplication is # needed to correctly handle all the possible situations. Removing all # files without file flags set in the first 'rm' instance saves time, # because 'chflags' will need to operate on fewer files afterwards. # # It is expected that BW_CANONICALOBJDIR == the CANONICALOBJDIR as would be # created by bsd.obj.mk, except that we don't want to .include that file # in this makefile. We don't do a cleandir walk if MK_AUTO_OBJ is yes # since it is not possible for files to land in the wrong place. # .if make(cleanworld) BW_CANONICALOBJDIR:=${OBJTOP}/ .elif make(cleanuniverse) BW_CANONICALOBJDIR:=${OBJROOT} .if ${MK_UNIFIED_OBJDIR} == "no" .error ${.TARGETS} only supported with WITH_UNIFIED_OBJDIR enabled. 
.endif .endif cleanworld cleanuniverse: .PHONY .if !empty(BW_CANONICALOBJDIR) && exists(${BW_CANONICALOBJDIR}) && \ ${.CURDIR:tA} != ${BW_CANONICALOBJDIR:tA} -rm -rf ${BW_CANONICALOBJDIR}* -chflags -R 0 ${BW_CANONICALOBJDIR} rm -rf ${BW_CANONICALOBJDIR}* .endif .if make(cleanworld) && ${MK_AUTO_OBJ} == "no" && \ (empty(BW_CANONICALOBJDIR) || ${.CURDIR:tA} == ${BW_CANONICALOBJDIR:tA}) .if ${.CURDIR} == ${.OBJDIR} || ${.CURDIR}/obj == ${.OBJDIR} # To be safe in this case, fall back to a 'make cleandir' ${_+_}@cd ${.CURDIR}; ${MAKE} cleandir .endif .endif .if ${TARGET} == ${MACHINE} && ${TARGET_ARCH} == ${MACHINE_ARCH} XDEV_CPUTYPE?=${CPUTYPE} .else XDEV_CPUTYPE?=${TARGET_CPUTYPE} .endif NOFUN=-DNO_FSCHG MK_HTML=no -DNO_LINT \ MK_MAN=no MK_NLS=no MK_PROFILE=no \ MK_KERBEROS=no MK_RESCUE=no MK_TESTS=no MK_WARNS=no \ TARGET=${TARGET} TARGET_ARCH=${TARGET_ARCH} \ CPUTYPE=${XDEV_CPUTYPE} XDDIR=${TARGET_ARCH}-freebsd XDTP?=/usr/${XDDIR} .if ${XDTP:N/*} .error XDTP variable should be an absolute path .endif CDBOBJROOT= ${OBJROOT}${MACHINE}.${MACHINE_ARCH}/xdev/ CDBOBJTOP= ${CDBOBJROOT}${XDDIR} CDBENV= \ INSTALL="sh ${.CURDIR}/tools/install.sh" CDENV= ${CDBENV} \ TOOLS_PREFIX=${XDTP} CDMAKEARGS= \ OBJTOP=${CDBOBJTOP:Q} \ OBJROOT=${CDBOBJROOT:Q} CD2MAKEARGS= ${CDMAKEARGS} .if ${WANT_COMPILER_TYPE} == gcc || \ (defined(X_COMPILER_TYPE) && ${X_COMPILER_TYPE} == gcc) # GCC requires -isystem and -L when using a cross-compiler. --sysroot # won't set header path and -L is used to ensure the base library path # is added before the port PREFIX library path. CD2CFLAGS+= -isystem ${XDDESTDIR}/usr/include -L${XDDESTDIR}/usr/lib # GCC requires -B to find /usr/lib/crti.o when using a cross-compiler # combined with --sysroot. CD2CFLAGS+= -B${XDDESTDIR}/usr/lib # Force using libc++ for external GCC. .if defined(X_COMPILER_TYPE) && \ ${X_COMPILER_TYPE} == gcc && ${X_COMPILER_VERSION} >= 40800 CD2CXXFLAGS+= -isystem ${XDDESTDIR}/usr/include/c++/v1 -std=c++11 \ -nostdinc++ .endif .endif CD2CFLAGS+= --sysroot=${XDDESTDIR}/ CD2ENV=${CDENV} CC="${CC} ${CD2CFLAGS}" CXX="${CXX} ${CD2CXXFLAGS} ${CD2CFLAGS}" \ CPP="${CPP} ${CD2CFLAGS}" \ MACHINE=${TARGET} MACHINE_ARCH=${TARGET_ARCH} CDTMP= ${OBJTOP}/${XDDIR}/tmp CDMAKE=${CDENV} PATH=${CDTMP}/usr/bin:${PATH} ${MAKE} ${CDMAKEARGS} ${NOFUN} CD2MAKE=${CD2ENV} PATH=${CDTMP}/usr/bin:${XDDESTDIR}/usr/bin:${PATH} \ ${MAKE} ${CD2MAKEARGS} ${NOFUN} .if ${MK_META_MODE} != "no" # Don't rebuild build-tools targets during normal build. 
CD2MAKE+= BUILD_TOOLS_META=.NOMETA .endif XDDESTDIR=${DESTDIR}${XDTP} .ORDER: xdev-build xdev-install xdev-links xdev: xdev-build xdev-install .PHONY .ORDER: _xb-worldtmp _xb-bootstrap-tools _xb-build-tools _xb-cross-tools xdev-build: _xb-worldtmp _xb-bootstrap-tools _xb-build-tools _xb-cross-tools .PHONY _xb-worldtmp: .PHONY mkdir -p ${CDTMP}/usr ${WORLDTMP_MTREE} -f ${.CURDIR}/etc/mtree/BSD.usr.dist \ -p ${CDTMP}/usr >/dev/null _xb-bootstrap-tools: .PHONY .for _tool in \ ${_clang_tblgen} \ ${_gperf} \ ${_yacc} ${_+_}@${ECHODIR} "===> ${_tool} (obj,all,install)"; \ cd ${.CURDIR}/${_tool}; \ if [ -z "${NO_OBJWALK}" ]; then ${CDMAKE} DIRPRFX=${_tool}/ obj; fi; \ ${CDMAKE} DIRPRFX=${_tool}/ all; \ ${CDMAKE} DIRPRFX=${_tool}/ DESTDIR=${CDTMP} install .endfor _xb-build-tools: .PHONY ${_+_}@cd ${.CURDIR}; \ ${CDBENV} ${MAKE} ${CDMAKEARGS} -f Makefile.inc1 ${NOFUN} build-tools XDEVDIRS= \ ${_clang_libs} \ ${_lld} \ ${_binutils} \ ${_elftctools} \ usr.bin/ar \ ${_clang} \ ${_gcc} _xb-cross-tools: .PHONY .for _tool in ${XDEVDIRS} ${_+_}@${ECHODIR} "===> xdev ${_tool} (obj,all)"; \ cd ${.CURDIR}/${_tool}; \ if [ -z "${NO_OBJWALK}" ]; then ${CDMAKE} DIRPRFX=${_tool}/ obj; fi; \ ${CDMAKE} DIRPRFX=${_tool}/ all .endfor _xi-mtree: .PHONY ${_+_}@${ECHODIR} "mtree populating ${XDDESTDIR}" mkdir -p ${XDDESTDIR} ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.root.dist \ -p ${XDDESTDIR} >/dev/null ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.usr.dist \ -p ${XDDESTDIR}/usr >/dev/null ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.include.dist \ -p ${XDDESTDIR}/usr/include >/dev/null .if defined(LIBCOMPAT) ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.lib${libcompat}.dist \ -p ${XDDESTDIR}/usr >/dev/null .endif .if ${MK_TESTS} != "no" mkdir -p ${XDDESTDIR}${TESTSBASE} ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.tests.dist \ -p ${XDDESTDIR}${TESTSBASE} >/dev/null .endif .ORDER: xdev-build _xi-mtree _xi-cross-tools _xi-includes _xi-libraries xdev-install: xdev-build _xi-mtree _xi-cross-tools _xi-includes _xi-libraries .PHONY _xi-cross-tools: .PHONY @echo "_xi-cross-tools" .for _tool in ${XDEVDIRS} ${_+_}@${ECHODIR} "===> xdev ${_tool} (install)"; \ cd ${.CURDIR}/${_tool}; \ ${CDMAKE} DIRPRFX=${_tool}/ install DESTDIR=${XDDESTDIR} .endfor _xi-includes: .PHONY .if !defined(NO_OBJWALK) ${_+_}cd ${.CURDIR}; ${CD2MAKE} -f Makefile.inc1 _obj \ DESTDIR=${XDDESTDIR} .endif ${_+_}cd ${.CURDIR}; ${CD2MAKE} -f Makefile.inc1 includes \ DESTDIR=${XDDESTDIR} _xi-libraries: .PHONY ${_+_}cd ${.CURDIR}; ${CD2MAKE} -f Makefile.inc1 libraries \ DESTDIR=${XDDESTDIR} xdev-links: .PHONY ${_+_}cd ${XDDESTDIR}/usr/bin; \ mkdir -p ../../../../usr/bin; \ for i in *; do \ ln -sf ../../${XDTP}/usr/bin/$$i \ ../../../../usr/bin/${XDDIR}-$$i; \ ln -sf ../../${XDTP}/usr/bin/$$i \ ../../../../usr/bin/${XDDIR}${_REVISION}-$$i; \ done Index: projects/clang900-import/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c =================================================================== --- projects/clang900-import/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c (revision 352586) +++ projects/clang900-import/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c (revision 352587) @@ -1,3855 +1,3853 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012 Pawel Jakub Dawidek. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Igor Kozhukhov */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "zfs_fletcher.h" #include "libzfs_impl.h" #include #include #include #include #ifdef __FreeBSD__ extern int zfs_ioctl_version; #endif /* in libzfs_dataset.c */ extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *); /* We need to use something for ENODATA. */ #define ENODATA EIDRM static int zfs_receive_impl(libzfs_handle_t *, const char *, const char *, recvflags_t *, int, const char *, nvlist_t *, avl_tree_t *, char **, int, uint64_t *, const char *); static int guid_to_name(libzfs_handle_t *, const char *, uint64_t, boolean_t, char *); static const zio_cksum_t zero_cksum = { 0 }; typedef struct dedup_arg { int inputfd; int outputfd; libzfs_handle_t *dedup_hdl; } dedup_arg_t; typedef struct progress_arg { zfs_handle_t *pa_zhp; int pa_fd; boolean_t pa_parsable; boolean_t pa_astitle; uint64_t pa_size; } progress_arg_t; typedef struct dataref { uint64_t ref_guid; uint64_t ref_object; uint64_t ref_offset; } dataref_t; typedef struct dedup_entry { struct dedup_entry *dde_next; zio_cksum_t dde_chksum; uint64_t dde_prop; dataref_t dde_ref; } dedup_entry_t; #define MAX_DDT_PHYSMEM_PERCENT 20 #define SMALLEST_POSSIBLE_MAX_DDT_MB 128 typedef struct dedup_table { dedup_entry_t **dedup_hash_array; umem_cache_t *ddecache; uint64_t max_ddt_size; /* max dedup table size in bytes */ uint64_t cur_ddt_size; /* current dedup table size in bytes */ uint64_t ddt_count; int numhashbits; boolean_t ddt_full; } dedup_table_t; static int high_order_bit(uint64_t n) { int count; for (count = 0; n != 0; count++) n >>= 1; return (count); } static size_t ssread(void *buf, size_t len, FILE *stream) { size_t outlen; if ((outlen = fread(buf, len, 1, stream)) == 0) return (0); return (outlen); } static void ddt_hash_append(libzfs_handle_t *hdl, dedup_table_t *ddt, dedup_entry_t **ddepp, zio_cksum_t *cs, uint64_t prop, dataref_t *dr) { dedup_entry_t *dde; if (ddt->cur_ddt_size >= ddt->max_ddt_size) { if (ddt->ddt_full == B_FALSE) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Dedup table full. 
Deduplication will continue " "with existing table entries")); ddt->ddt_full = B_TRUE; } return; } if ((dde = umem_cache_alloc(ddt->ddecache, UMEM_DEFAULT)) != NULL) { assert(*ddepp == NULL); dde->dde_next = NULL; dde->dde_chksum = *cs; dde->dde_prop = prop; dde->dde_ref = *dr; *ddepp = dde; ddt->cur_ddt_size += sizeof (dedup_entry_t); ddt->ddt_count++; } } /* * Using the specified dedup table, do a lookup for an entry with * the checksum cs. If found, return the block's reference info * in *dr. Otherwise, insert a new entry in the dedup table, using * the reference information specified by *dr. * * return value: true - entry was found * false - entry was not found */ static boolean_t ddt_update(libzfs_handle_t *hdl, dedup_table_t *ddt, zio_cksum_t *cs, uint64_t prop, dataref_t *dr) { uint32_t hashcode; dedup_entry_t **ddepp; hashcode = BF64_GET(cs->zc_word[0], 0, ddt->numhashbits); for (ddepp = &(ddt->dedup_hash_array[hashcode]); *ddepp != NULL; ddepp = &((*ddepp)->dde_next)) { if (ZIO_CHECKSUM_EQUAL(((*ddepp)->dde_chksum), *cs) && (*ddepp)->dde_prop == prop) { *dr = (*ddepp)->dde_ref; return (B_TRUE); } } ddt_hash_append(hdl, ddt, ddepp, cs, prop, dr); return (B_FALSE); } static int dump_record(dmu_replay_record_t *drr, void *payload, int payload_len, zio_cksum_t *zc, int outfd) { ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); (void) fletcher_4_incremental_native(drr, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc); if (drr->drr_type != DRR_BEGIN) { ASSERT(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u. drr_checksum.drr_checksum)); drr->drr_u.drr_checksum.drr_checksum = *zc; } (void) fletcher_4_incremental_native( &drr->drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), zc); if (write(outfd, drr, sizeof (*drr)) == -1) return (errno); if (payload_len != 0) { (void) fletcher_4_incremental_native(payload, payload_len, zc); if (write(outfd, payload, payload_len) == -1) return (errno); } return (0); } /* * This function is started in a separate thread when the dedup option * has been requested. The main send thread determines the list of * snapshots to be included in the send stream and makes the ioctl calls * for each one. But instead of having the ioctl send the output to the * the output fd specified by the caller of zfs_send()), the * ioctl is told to direct the output to a pipe, which is read by the * alternate thread running THIS function. This function does the * dedup'ing by: * 1. building a dedup table (the DDT) * 2. doing checksums on each data block and inserting a record in the DDT * 3. looking for matching checksums, and * 4. sending a DRR_WRITE_BYREF record instead of a write record whenever * a duplicate block is found. * The output of this function then goes to the output fd requested * by the caller of zfs_send(). */ static void * cksummer(void *arg) { dedup_arg_t *dda = arg; char *buf = zfs_alloc(dda->dedup_hdl, SPA_MAXBLOCKSIZE); dmu_replay_record_t thedrr; dmu_replay_record_t *drr = &thedrr; FILE *ofp; int outfd; dedup_table_t ddt; zio_cksum_t stream_cksum; uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE); uint64_t numbuckets; ddt.max_ddt_size = MAX((physmem * MAX_DDT_PHYSMEM_PERCENT) / 100, SMALLEST_POSSIBLE_MAX_DDT_MB << 20); numbuckets = ddt.max_ddt_size / (sizeof (dedup_entry_t)); /* * numbuckets must be a power of 2. Increase number to * a power of 2 if necessary. 
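 * For example, a computed count of 1000 buckets would be rounded up to 1 << high_order_bit(1000) == 1024.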
*/ if (!ISP2(numbuckets)) numbuckets = 1 << high_order_bit(numbuckets); ddt.dedup_hash_array = calloc(numbuckets, sizeof (dedup_entry_t *)); ddt.ddecache = umem_cache_create("dde", sizeof (dedup_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); ddt.cur_ddt_size = numbuckets * sizeof (dedup_entry_t *); ddt.numhashbits = high_order_bit(numbuckets) - 1; ddt.ddt_full = B_FALSE; outfd = dda->outputfd; ofp = fdopen(dda->inputfd, "r"); while (ssread(drr, sizeof (*drr), ofp) != 0) { /* * kernel filled in checksum, we are going to write same * record, but need to regenerate checksum. */ if (drr->drr_type != DRR_BEGIN) { bzero(&drr->drr_u.drr_checksum.drr_checksum, sizeof (drr->drr_u.drr_checksum.drr_checksum)); } switch (drr->drr_type) { case DRR_BEGIN: { struct drr_begin *drrb = &drr->drr_u.drr_begin; int fflags; int sz = 0; ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0); ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); /* set the DEDUP feature flag for this stream */ fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); fflags |= (DMU_BACKUP_FEATURE_DEDUP | DMU_BACKUP_FEATURE_DEDUPPROPS); DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags); if (drr->drr_payloadlen != 0) { sz = drr->drr_payloadlen; if (sz > SPA_MAXBLOCKSIZE) { buf = zfs_realloc(dda->dedup_hdl, buf, SPA_MAXBLOCKSIZE, sz); } (void) ssread(buf, sz, ofp); if (ferror(stdin)) perror("fread"); } if (dump_record(drr, buf, sz, &stream_cksum, outfd) != 0) goto out; break; } case DRR_END: { struct drr_end *drre = &drr->drr_u.drr_end; /* use the recalculated checksum */ drre->drr_checksum = stream_cksum; if (dump_record(drr, NULL, 0, &stream_cksum, outfd) != 0) goto out; break; } case DRR_OBJECT: { struct drr_object *drro = &drr->drr_u.drr_object; if (drro->drr_bonuslen > 0) { (void) ssread(buf, P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8), ofp); } if (dump_record(drr, buf, P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8), &stream_cksum, outfd) != 0) goto out; break; } case DRR_SPILL: { struct drr_spill *drrs = &drr->drr_u.drr_spill; (void) ssread(buf, drrs->drr_length, ofp); if (dump_record(drr, buf, drrs->drr_length, &stream_cksum, outfd) != 0) goto out; break; } case DRR_FREEOBJECTS: { if (dump_record(drr, NULL, 0, &stream_cksum, outfd) != 0) goto out; break; } case DRR_WRITE: { struct drr_write *drrw = &drr->drr_u.drr_write; dataref_t dataref; uint64_t payload_size; payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); (void) ssread(buf, payload_size, ofp); /* * Use the existing checksum if it's dedup-capable, * else calculate a SHA256 checksum for it. 
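 * (The existing checksum is reused only when it is non-zero and DRR_IS_DEDUP_CAPABLE() reports the record as dedup-capable; otherwise a SHA256 of the payload is computed here.)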
*/ if (ZIO_CHECKSUM_EQUAL(drrw->drr_key.ddk_cksum, zero_cksum) || !DRR_IS_DEDUP_CAPABLE(drrw->drr_checksumflags)) { SHA256_CTX ctx; zio_cksum_t tmpsha256; SHA256Init(&ctx); SHA256Update(&ctx, buf, payload_size); SHA256Final(&tmpsha256, &ctx); drrw->drr_key.ddk_cksum.zc_word[0] = BE_64(tmpsha256.zc_word[0]); drrw->drr_key.ddk_cksum.zc_word[1] = BE_64(tmpsha256.zc_word[1]); drrw->drr_key.ddk_cksum.zc_word[2] = BE_64(tmpsha256.zc_word[2]); drrw->drr_key.ddk_cksum.zc_word[3] = BE_64(tmpsha256.zc_word[3]); drrw->drr_checksumtype = ZIO_CHECKSUM_SHA256; drrw->drr_checksumflags = DRR_CHECKSUM_DEDUP; } dataref.ref_guid = drrw->drr_toguid; dataref.ref_object = drrw->drr_object; dataref.ref_offset = drrw->drr_offset; if (ddt_update(dda->dedup_hdl, &ddt, &drrw->drr_key.ddk_cksum, drrw->drr_key.ddk_prop, &dataref)) { dmu_replay_record_t wbr_drr = {0}; struct drr_write_byref *wbr_drrr = &wbr_drr.drr_u.drr_write_byref; /* block already present in stream */ wbr_drr.drr_type = DRR_WRITE_BYREF; wbr_drrr->drr_object = drrw->drr_object; wbr_drrr->drr_offset = drrw->drr_offset; wbr_drrr->drr_length = drrw->drr_logical_size; wbr_drrr->drr_toguid = drrw->drr_toguid; wbr_drrr->drr_refguid = dataref.ref_guid; wbr_drrr->drr_refobject = dataref.ref_object; wbr_drrr->drr_refoffset = dataref.ref_offset; wbr_drrr->drr_checksumtype = drrw->drr_checksumtype; wbr_drrr->drr_checksumflags = drrw->drr_checksumtype; wbr_drrr->drr_key.ddk_cksum = drrw->drr_key.ddk_cksum; wbr_drrr->drr_key.ddk_prop = drrw->drr_key.ddk_prop; if (dump_record(&wbr_drr, NULL, 0, &stream_cksum, outfd) != 0) goto out; } else { /* block not previously seen */ if (dump_record(drr, buf, payload_size, &stream_cksum, outfd) != 0) goto out; } break; } case DRR_WRITE_EMBEDDED: { struct drr_write_embedded *drrwe = &drr->drr_u.drr_write_embedded; (void) ssread(buf, P2ROUNDUP((uint64_t)drrwe->drr_psize, 8), ofp); if (dump_record(drr, buf, P2ROUNDUP((uint64_t)drrwe->drr_psize, 8), &stream_cksum, outfd) != 0) goto out; break; } case DRR_FREE: { if (dump_record(drr, NULL, 0, &stream_cksum, outfd) != 0) goto out; break; } default: (void) fprintf(stderr, "INVALID record type 0x%x\n", drr->drr_type); /* should never happen, so assert */ assert(B_FALSE); } } out: umem_cache_destroy(ddt.ddecache); free(ddt.dedup_hash_array); free(buf); (void) fclose(ofp); return (NULL); } /* * Routines for dealing with the AVL tree of fs-nvlists */ typedef struct fsavl_node { avl_node_t fn_node; nvlist_t *fn_nvfs; char *fn_snapname; uint64_t fn_guid; } fsavl_node_t; static int fsavl_compare(const void *arg1, const void *arg2) { const fsavl_node_t *fn1 = (const fsavl_node_t *)arg1; const fsavl_node_t *fn2 = (const fsavl_node_t *)arg2; return (AVL_CMP(fn1->fn_guid, fn2->fn_guid)); } /* * Given the GUID of a snapshot, find its containing filesystem and * (optionally) name. 
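 * The AVL tree is keyed on the snapshot guid (see fsavl_compare()), so only an exact guid match is returned.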
*/ static nvlist_t * fsavl_find(avl_tree_t *avl, uint64_t snapguid, char **snapname) { fsavl_node_t fn_find; fsavl_node_t *fn; fn_find.fn_guid = snapguid; fn = avl_find(avl, &fn_find, NULL); if (fn) { if (snapname) *snapname = fn->fn_snapname; return (fn->fn_nvfs); } return (NULL); } static void fsavl_destroy(avl_tree_t *avl) { fsavl_node_t *fn; void *cookie; if (avl == NULL) return; cookie = NULL; while ((fn = avl_destroy_nodes(avl, &cookie)) != NULL) free(fn); avl_destroy(avl); free(avl); } /* * Given an nvlist, produce an avl tree of snapshots, ordered by guid */ static avl_tree_t * fsavl_create(nvlist_t *fss) { avl_tree_t *fsavl; nvpair_t *fselem = NULL; if ((fsavl = malloc(sizeof (avl_tree_t))) == NULL) return (NULL); avl_create(fsavl, fsavl_compare, sizeof (fsavl_node_t), offsetof(fsavl_node_t, fn_node)); while ((fselem = nvlist_next_nvpair(fss, fselem)) != NULL) { nvlist_t *nvfs, *snaps; nvpair_t *snapelem = NULL; VERIFY(0 == nvpair_value_nvlist(fselem, &nvfs)); VERIFY(0 == nvlist_lookup_nvlist(nvfs, "snaps", &snaps)); while ((snapelem = nvlist_next_nvpair(snaps, snapelem)) != NULL) { fsavl_node_t *fn; uint64_t guid; VERIFY(0 == nvpair_value_uint64(snapelem, &guid)); if ((fn = malloc(sizeof (fsavl_node_t))) == NULL) { fsavl_destroy(fsavl); return (NULL); } fn->fn_nvfs = nvfs; fn->fn_snapname = nvpair_name(snapelem); fn->fn_guid = guid; /* * Note: if there are multiple snaps with the * same GUID, we ignore all but one. */ if (avl_find(fsavl, fn, NULL) == NULL) avl_add(fsavl, fn); else free(fn); } } return (fsavl); } /* * Routines for dealing with the giant nvlist of fs-nvlists, etc. */ typedef struct send_data { /* * assigned inside every recursive call, * restored from *_save on return: * * guid of fromsnap snapshot in parent dataset * txg of fromsnap snapshot in current dataset * txg of tosnap snapshot in current dataset */ uint64_t parent_fromsnap_guid; uint64_t fromsnap_txg; uint64_t tosnap_txg; /* the nvlists get accumulated during depth-first traversal */ nvlist_t *parent_snaps; nvlist_t *fss; nvlist_t *snapprops; /* send-receive configuration, does not change during traversal */ const char *fsname; const char *fromsnap; const char *tosnap; boolean_t recursive; boolean_t verbose; /* * The header nvlist is of the following format: * { * "tosnap" -> string * "fromsnap" -> string (if incremental) * "fss" -> { * id -> { * * "name" -> string (full name; for debugging) * "parentfromsnap" -> number (guid of fromsnap in parent) * * "props" -> { name -> value (only if set here) } * "snaps" -> { name (lastname) -> number (guid) } * "snapprops" -> { name (lastname) -> { name -> value } } * * "origin" -> number (guid) (if clone) * "sent" -> boolean (not on-disk) * } * } * } * */ } send_data_t; static void send_iterate_prop(zfs_handle_t *zhp, nvlist_t *nv); static int send_iterate_snap(zfs_handle_t *zhp, void *arg) { send_data_t *sd = arg; uint64_t guid = zhp->zfs_dmustats.dds_guid; uint64_t txg = zhp->zfs_dmustats.dds_creation_txg; char *snapname; nvlist_t *nv; snapname = strrchr(zhp->zfs_name, '@')+1; if (sd->tosnap_txg != 0 && txg > sd->tosnap_txg) { if (sd->verbose) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "skipping snapshot %s because it was created " "after the destination snapshot (%s)\n"), zhp->zfs_name, sd->tosnap); } zfs_close(zhp); return (0); } VERIFY(0 == nvlist_add_uint64(sd->parent_snaps, snapname, guid)); /* * NB: if there is no fromsnap here (it's a newly created fs in * an incremental replication), we will substitute the tosnap. 
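 * (Either way, the matching snapshot's guid becomes parent_fromsnap_guid for this dataset's children.)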
*/ if ((sd->fromsnap && strcmp(snapname, sd->fromsnap) == 0) || (sd->parent_fromsnap_guid == 0 && sd->tosnap && strcmp(snapname, sd->tosnap) == 0)) { sd->parent_fromsnap_guid = guid; } VERIFY(0 == nvlist_alloc(&nv, NV_UNIQUE_NAME, 0)); send_iterate_prop(zhp, nv); VERIFY(0 == nvlist_add_nvlist(sd->snapprops, snapname, nv)); nvlist_free(nv); zfs_close(zhp); return (0); } static void send_iterate_prop(zfs_handle_t *zhp, nvlist_t *nv) { nvpair_t *elem = NULL; while ((elem = nvlist_next_nvpair(zhp->zfs_props, elem)) != NULL) { char *propname = nvpair_name(elem); zfs_prop_t prop = zfs_name_to_prop(propname); nvlist_t *propnv; if (!zfs_prop_user(propname)) { /* * Realistically, this should never happen. However, * we want the ability to add DSL properties without * needing to make incompatible version changes. We * need to ignore unknown properties to allow older * software to still send datasets containing these * properties, with the unknown properties elided. */ if (prop == ZPROP_INVAL) continue; if (zfs_prop_readonly(prop)) continue; } verify(nvpair_value_nvlist(elem, &propnv) == 0); if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION || prop == ZFS_PROP_REFQUOTA || prop == ZFS_PROP_REFRESERVATION) { char *source; uint64_t value; verify(nvlist_lookup_uint64(propnv, ZPROP_VALUE, &value) == 0); if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) continue; /* * May have no source before SPA_VERSION_RECVD_PROPS, * but is still modifiable. */ if (nvlist_lookup_string(propnv, ZPROP_SOURCE, &source) == 0) { if ((strcmp(source, zhp->zfs_name) != 0) && (strcmp(source, ZPROP_SOURCE_VAL_RECVD) != 0)) continue; } } else { char *source; if (nvlist_lookup_string(propnv, ZPROP_SOURCE, &source) != 0) continue; if ((strcmp(source, zhp->zfs_name) != 0) && (strcmp(source, ZPROP_SOURCE_VAL_RECVD) != 0)) continue; } if (zfs_prop_user(propname) || zfs_prop_get_type(prop) == PROP_TYPE_STRING) { char *value; verify(nvlist_lookup_string(propnv, ZPROP_VALUE, &value) == 0); VERIFY(0 == nvlist_add_string(nv, propname, value)); } else { uint64_t value; verify(nvlist_lookup_uint64(propnv, ZPROP_VALUE, &value) == 0); VERIFY(0 == nvlist_add_uint64(nv, propname, value)); } } } /* * returns snapshot creation txg * and returns 0 if the snapshot does not exist */ static uint64_t get_snap_txg(libzfs_handle_t *hdl, const char *fs, const char *snap) { char name[ZFS_MAX_DATASET_NAME_LEN]; uint64_t txg = 0; if (fs == NULL || fs[0] == '\0' || snap == NULL || snap[0] == '\0') return (txg); (void) snprintf(name, sizeof (name), "%s@%s", fs, snap); if (zfs_dataset_exists(hdl, name, ZFS_TYPE_SNAPSHOT)) { zfs_handle_t *zhp = zfs_open(hdl, name, ZFS_TYPE_SNAPSHOT); if (zhp != NULL) { txg = zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG); zfs_close(zhp); } } return (txg); } /* * recursively generate nvlists describing datasets. See comment * for the data structure send_data_t above for description of contents * of the nvlist. 
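 * Per-dataset state (parent_fromsnap_guid and the from/to txg bounds) is saved on entry and restored on exit, so recursing into children does not affect siblings.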
*/ static int send_iterate_fs(zfs_handle_t *zhp, void *arg) { send_data_t *sd = arg; nvlist_t *nvfs, *nv; int rv = 0; uint64_t parent_fromsnap_guid_save = sd->parent_fromsnap_guid; uint64_t fromsnap_txg_save = sd->fromsnap_txg; uint64_t tosnap_txg_save = sd->tosnap_txg; uint64_t txg = zhp->zfs_dmustats.dds_creation_txg; uint64_t guid = zhp->zfs_dmustats.dds_guid; uint64_t fromsnap_txg, tosnap_txg; char guidstring[64]; fromsnap_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sd->fromsnap); if (fromsnap_txg != 0) sd->fromsnap_txg = fromsnap_txg; tosnap_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sd->tosnap); if (tosnap_txg != 0) sd->tosnap_txg = tosnap_txg; /* * on the send side, if the current dataset does not have tosnap, * perform two additional checks: * * - skip sending the current dataset if it was created later than * the parent tosnap * - return error if the current dataset was created earlier than * the parent tosnap */ if (sd->tosnap != NULL && tosnap_txg == 0) { if (sd->tosnap_txg != 0 && txg > sd->tosnap_txg) { if (sd->verbose) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "skipping dataset %s: snapshot %s does " "not exist\n"), zhp->zfs_name, sd->tosnap); } } else { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "cannot send %s@%s%s: snapshot %s@%s does not " "exist\n"), sd->fsname, sd->tosnap, sd->recursive ? dgettext(TEXT_DOMAIN, " recursively") : "", zhp->zfs_name, sd->tosnap); rv = -1; } goto out; } VERIFY(0 == nvlist_alloc(&nvfs, NV_UNIQUE_NAME, 0)); VERIFY(0 == nvlist_add_string(nvfs, "name", zhp->zfs_name)); VERIFY(0 == nvlist_add_uint64(nvfs, "parentfromsnap", sd->parent_fromsnap_guid)); if (zhp->zfs_dmustats.dds_origin[0]) { zfs_handle_t *origin = zfs_open(zhp->zfs_hdl, zhp->zfs_dmustats.dds_origin, ZFS_TYPE_SNAPSHOT); if (origin == NULL) { rv = -1; goto out; } VERIFY(0 == nvlist_add_uint64(nvfs, "origin", origin->zfs_dmustats.dds_guid)); } /* iterate over props */ VERIFY(0 == nvlist_alloc(&nv, NV_UNIQUE_NAME, 0)); send_iterate_prop(zhp, nv); VERIFY(0 == nvlist_add_nvlist(nvfs, "props", nv)); nvlist_free(nv); /* iterate over snaps, and set sd->parent_fromsnap_guid */ sd->parent_fromsnap_guid = 0; VERIFY(0 == nvlist_alloc(&sd->parent_snaps, NV_UNIQUE_NAME, 0)); VERIFY(0 == nvlist_alloc(&sd->snapprops, NV_UNIQUE_NAME, 0)); (void) zfs_iter_snapshots_sorted(zhp, send_iterate_snap, sd); VERIFY(0 == nvlist_add_nvlist(nvfs, "snaps", sd->parent_snaps)); VERIFY(0 == nvlist_add_nvlist(nvfs, "snapprops", sd->snapprops)); nvlist_free(sd->parent_snaps); nvlist_free(sd->snapprops); /* add this fs to nvlist */ (void) snprintf(guidstring, sizeof (guidstring), "0x%llx", (longlong_t)guid); VERIFY(0 == nvlist_add_nvlist(sd->fss, guidstring, nvfs)); nvlist_free(nvfs); /* iterate over children */ if (sd->recursive) rv = zfs_iter_filesystems(zhp, send_iterate_fs, sd); out: sd->parent_fromsnap_guid = parent_fromsnap_guid_save; sd->fromsnap_txg = fromsnap_txg_save; sd->tosnap_txg = tosnap_txg_save; zfs_close(zhp); return (rv); } static int gather_nvlist(libzfs_handle_t *hdl, const char *fsname, const char *fromsnap, const char *tosnap, boolean_t recursive, boolean_t verbose, nvlist_t **nvlp, avl_tree_t **avlp) { zfs_handle_t *zhp; send_data_t sd = { 0 }; int error; zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) return (EZFS_BADTYPE); VERIFY(0 == nvlist_alloc(&sd.fss, NV_UNIQUE_NAME, 0)); sd.fsname = fsname; sd.fromsnap = fromsnap; sd.tosnap = tosnap; sd.recursive = recursive; sd.verbose = verbose; if ((error = send_iterate_fs(zhp, &sd)) != 0) { 
nvlist_free(sd.fss); if (avlp != NULL) *avlp = NULL; *nvlp = NULL; return (error); } if (avlp != NULL && (*avlp = fsavl_create(sd.fss)) == NULL) { nvlist_free(sd.fss); *nvlp = NULL; return (EZFS_NOMEM); } *nvlp = sd.fss; return (0); } /* * Routines specific to "zfs send" */ typedef struct send_dump_data { /* these are all just the short snapname (the part after the @) */ const char *fromsnap; const char *tosnap; char prevsnap[ZFS_MAX_DATASET_NAME_LEN]; uint64_t prevsnap_obj; boolean_t seenfrom, seento, replicate, doall, fromorigin; boolean_t verbose, dryrun, parsable, progress, embed_data, std_out; boolean_t progressastitle; boolean_t large_block, compress; int outfd; boolean_t err; nvlist_t *fss; nvlist_t *snapholds; avl_tree_t *fsavl; snapfilter_cb_t *filter_cb; void *filter_cb_arg; nvlist_t *debugnv; char holdtag[ZFS_MAX_DATASET_NAME_LEN]; int cleanup_fd; uint64_t size; } send_dump_data_t; static int estimate_ioctl(zfs_handle_t *zhp, uint64_t fromsnap_obj, boolean_t fromorigin, enum lzc_send_flags flags, uint64_t *sizep) { zfs_cmd_t zc = { 0 }; libzfs_handle_t *hdl = zhp->zfs_hdl; assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); assert(fromsnap_obj == 0 || !fromorigin); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); zc.zc_obj = fromorigin; zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zc.zc_fromobj = fromsnap_obj; zc.zc_guid = 1; /* estimate flag */ zc.zc_flags = flags; if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND, &zc) != 0) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot estimate space for '%s'"), zhp->zfs_name); switch (errno) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); case ENOENT: if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_SNAPSHOT)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source (@%s) does not exist"), zc.zc_value); } return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EDQUOT: case EFBIG: case EIO: case ENOLINK: case ENOSPC: case ENXIO: case EPIPE: case ERANGE: case EFAULT: case EROFS: zfs_error_aux(hdl, strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); } } *sizep = zc.zc_objset_type; return (0); } /* * Dumps a backup of the given snapshot (incremental from fromsnap if it's not * NULL) to the file descriptor specified by outfd. 
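 * The heavy lifting is done by the ZFS_IOC_SEND ioctl; this wrapper fills in the zfs_cmd_t, records debug info in debugnv, and maps errno values to libzfs errors.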
*/ static int dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj, boolean_t fromorigin, int outfd, enum lzc_send_flags flags, nvlist_t *debugnv) { zfs_cmd_t zc = { 0 }; libzfs_handle_t *hdl = zhp->zfs_hdl; nvlist_t *thisdbg; assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); assert(fromsnap_obj == 0 || !fromorigin); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); zc.zc_cookie = outfd; zc.zc_obj = fromorigin; zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zc.zc_fromobj = fromsnap_obj; zc.zc_flags = flags; VERIFY(0 == nvlist_alloc(&thisdbg, NV_UNIQUE_NAME, 0)); if (fromsnap && fromsnap[0] != '\0') { VERIFY(0 == nvlist_add_string(thisdbg, "fromsnap", fromsnap)); } if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND, &zc) != 0) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), zhp->zfs_name); VERIFY(0 == nvlist_add_uint64(thisdbg, "error", errno)); if (debugnv) { VERIFY(0 == nvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg)); } nvlist_free(thisdbg); switch (errno) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); case ENOENT: if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_SNAPSHOT)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source (@%s) does not exist"), zc.zc_value); } return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EDQUOT: case EFBIG: case EIO: case ENOLINK: case ENOSPC: #ifdef illumos case ENOSTR: #endif case ENXIO: case EPIPE: case ERANGE: case EFAULT: case EROFS: zfs_error_aux(hdl, strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); } } if (debugnv) VERIFY(0 == nvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg)); nvlist_free(thisdbg); return (0); } static void gather_holds(zfs_handle_t *zhp, send_dump_data_t *sdd) { assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); /* * zfs_send() only sets snapholds for sends that need them, * e.g. replication and doall. */ if (sdd->snapholds == NULL) return; fnvlist_add_string(sdd->snapholds, zhp->zfs_name, sdd->holdtag); } static void * send_progress_thread(void *arg) { progress_arg_t *pa = arg; zfs_cmd_t zc = { 0 }; zfs_handle_t *zhp = pa->pa_zhp; libzfs_handle_t *hdl = zhp->zfs_hdl; unsigned long long bytes, total; char buf[16]; time_t t; struct tm *tm; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if (!pa->pa_parsable && !pa->pa_astitle) (void) fprintf(stderr, "TIME SENT SNAPSHOT\n"); /* * Print the progress from ZFS_IOC_SEND_PROGRESS every second. 
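 * zc_cookie carries the send file descriptor into the ioctl and the number of bytes written so far on the way back out.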
*/ for (;;) { (void) sleep(1); zc.zc_cookie = pa->pa_fd; if (zfs_ioctl(hdl, ZFS_IOC_SEND_PROGRESS, &zc) != 0) return ((void *)-1); (void) time(&t); tm = localtime(&t); bytes = zc.zc_cookie; if (pa->pa_astitle) { int pct; if (pa->pa_size > bytes) pct = 100 * bytes / pa->pa_size; else pct = 100; setproctitle("sending %s (%d%%: %llu/%llu)", zhp->zfs_name, pct, bytes, pa->pa_size); } else if (pa->pa_parsable) { (void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%s\n", tm->tm_hour, tm->tm_min, tm->tm_sec, bytes, zhp->zfs_name); } else { zfs_nicenum(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "%02d:%02d:%02d %5s %s\n", tm->tm_hour, tm->tm_min, tm->tm_sec, buf, zhp->zfs_name); } } } static void send_print_verbose(FILE *fout, const char *tosnap, const char *fromsnap, uint64_t size, boolean_t parsable) { if (parsable) { if (fromsnap != NULL) { (void) fprintf(fout, "incremental\t%s\t%s", fromsnap, tosnap); } else { (void) fprintf(fout, "full\t%s", tosnap); } } else { if (fromsnap != NULL) { if (strchr(fromsnap, '@') == NULL && strchr(fromsnap, '#') == NULL) { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "send from @%s to %s"), fromsnap, tosnap); } else { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "send from %s to %s"), fromsnap, tosnap); } } else { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "full send of %s"), tosnap); } } if (parsable) { (void) fprintf(fout, "\t%llu", (longlong_t)size); } else if (size != 0) { char buf[16]; zfs_nicenum(size, buf, sizeof (buf)); (void) fprintf(fout, dgettext(TEXT_DOMAIN, " estimated size is %s"), buf); } (void) fprintf(fout, "\n"); } static int dump_snapshot(zfs_handle_t *zhp, void *arg) { send_dump_data_t *sdd = arg; progress_arg_t pa = { 0 }; pthread_t tid; char *thissnap; enum lzc_send_flags flags = 0; int err; boolean_t isfromsnap, istosnap, fromorigin; boolean_t exclude = B_FALSE; FILE *fout = sdd->std_out ? stdout : stderr; uint64_t size = 0; err = 0; thissnap = strchr(zhp->zfs_name, '@') + 1; isfromsnap = (sdd->fromsnap != NULL && strcmp(sdd->fromsnap, thissnap) == 0); if (!sdd->seenfrom && isfromsnap) { gather_holds(zhp, sdd); sdd->seenfrom = B_TRUE; (void) strcpy(sdd->prevsnap, thissnap); sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zfs_close(zhp); return (0); } if (sdd->seento || !sdd->seenfrom) { zfs_close(zhp); return (0); } istosnap = (strcmp(sdd->tosnap, thissnap) == 0); if (istosnap) sdd->seento = B_TRUE; if (sdd->large_block) flags |= LZC_SEND_FLAG_LARGE_BLOCK; if (sdd->embed_data) flags |= LZC_SEND_FLAG_EMBED_DATA; if (sdd->compress) flags |= LZC_SEND_FLAG_COMPRESS; if (!sdd->doall && !isfromsnap && !istosnap) { if (sdd->replicate) { char *snapname; nvlist_t *snapprops; /* * Filter out all intermediate snapshots except origin * snapshots needed to replicate clones. */ nvlist_t *nvfs = fsavl_find(sdd->fsavl, zhp->zfs_dmustats.dds_guid, &snapname); VERIFY(0 == nvlist_lookup_nvlist(nvfs, "snapprops", &snapprops)); VERIFY(0 == nvlist_lookup_nvlist(snapprops, thissnap, &snapprops)); exclude = !nvlist_exists(snapprops, "is_clone_origin"); } else { exclude = B_TRUE; } } /* * If a filter function exists, call it to determine whether * this snapshot will be sent. */ if (exclude || (sdd->filter_cb != NULL && sdd->filter_cb(zhp, sdd->filter_cb_arg) == B_FALSE)) { /* * This snapshot is filtered out. 
Don't send it, and don't * set prevsnap_obj, so it will be as if this snapshot didn't * exist, and the next accepted snapshot will be sent as * an incremental from the last accepted one, or as the * first (and full) snapshot in the case of a replication, * non-incremental send. */ zfs_close(zhp); return (0); } gather_holds(zhp, sdd); fromorigin = sdd->prevsnap[0] == '\0' && (sdd->fromorigin || sdd->replicate); - if (sdd->progress && sdd->dryrun) { + if (sdd->verbose || sdd->progress) { (void) estimate_ioctl(zhp, sdd->prevsnap_obj, fromorigin, flags, &size); sdd->size += size; - } - if (sdd->verbose) { send_print_verbose(fout, zhp->zfs_name, sdd->prevsnap[0] ? sdd->prevsnap : NULL, size, sdd->parsable); } if (!sdd->dryrun) { /* * If progress reporting is requested, spawn a new thread to * poll ZFS_IOC_SEND_PROGRESS at a regular interval. */ if (sdd->progress) { pa.pa_zhp = zhp; pa.pa_fd = sdd->outfd; pa.pa_parsable = sdd->parsable; pa.pa_size = sdd->size; pa.pa_astitle = sdd->progressastitle; if ((err = pthread_create(&tid, NULL, send_progress_thread, &pa)) != 0) { zfs_close(zhp); return (err); } } err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj, fromorigin, sdd->outfd, flags, sdd->debugnv); if (sdd->progress) { (void) pthread_cancel(tid); (void) pthread_join(tid, NULL); } } (void) strcpy(sdd->prevsnap, thissnap); sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zfs_close(zhp); return (err); } static int dump_filesystem(zfs_handle_t *zhp, void *arg) { int rv = 0; send_dump_data_t *sdd = arg; boolean_t missingfrom = B_FALSE; zfs_cmd_t zc = { 0 }; (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", zhp->zfs_name, sdd->tosnap); if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: could not send %s@%s: does not exist\n"), zhp->zfs_name, sdd->tosnap); sdd->err = B_TRUE; return (0); } if (sdd->replicate && sdd->fromsnap) { /* * If this fs does not have fromsnap, and we're doing * recursive, we need to send a full stream from the * beginning (or an incremental from the origin if this * is a clone). If we're doing non-recursive, then let * them get the error. */ (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", zhp->zfs_name, sdd->fromsnap); if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) { missingfrom = B_TRUE; } } sdd->seenfrom = sdd->seento = sdd->prevsnap[0] = 0; sdd->prevsnap_obj = 0; if (sdd->fromsnap == NULL || missingfrom) sdd->seenfrom = B_TRUE; rv = zfs_iter_snapshots_sorted(zhp, dump_snapshot, arg); if (!sdd->seenfrom) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: could not send %s@%s:\n" "incremental source (%s@%s) does not exist\n"), zhp->zfs_name, sdd->tosnap, zhp->zfs_name, sdd->fromsnap); sdd->err = B_TRUE; } else if (!sdd->seento) { if (sdd->fromsnap) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: could not send %s@%s:\n" "incremental source (%s@%s) " "is not earlier than it\n"), zhp->zfs_name, sdd->tosnap, zhp->zfs_name, sdd->fromsnap); } else { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: " "could not send %s@%s: does not exist\n"), zhp->zfs_name, sdd->tosnap); } sdd->err = B_TRUE; } return (rv); } static int dump_filesystems(zfs_handle_t *rzhp, void *arg) { send_dump_data_t *sdd = arg; nvpair_t *fspair; boolean_t needagain, progress; if (!sdd->replicate) return (dump_filesystem(rzhp, sdd)); /* Mark the clone origin snapshots. 
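 * An "is_clone_origin" boolean is added to each origin's snapprops so dump_snapshot() will not filter those snapshots out.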
*/ for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; fspair = nvlist_next_nvpair(sdd->fss, fspair)) { nvlist_t *nvfs; uint64_t origin_guid = 0; VERIFY(0 == nvpair_value_nvlist(fspair, &nvfs)); (void) nvlist_lookup_uint64(nvfs, "origin", &origin_guid); if (origin_guid != 0) { char *snapname; nvlist_t *origin_nv = fsavl_find(sdd->fsavl, origin_guid, &snapname); if (origin_nv != NULL) { nvlist_t *snapprops; VERIFY(0 == nvlist_lookup_nvlist(origin_nv, "snapprops", &snapprops)); VERIFY(0 == nvlist_lookup_nvlist(snapprops, snapname, &snapprops)); VERIFY(0 == nvlist_add_boolean( snapprops, "is_clone_origin")); } } } again: needagain = progress = B_FALSE; for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; fspair = nvlist_next_nvpair(sdd->fss, fspair)) { nvlist_t *fslist, *parent_nv; char *fsname; zfs_handle_t *zhp; int err; uint64_t origin_guid = 0; uint64_t parent_guid = 0; VERIFY(nvpair_value_nvlist(fspair, &fslist) == 0); if (nvlist_lookup_boolean(fslist, "sent") == 0) continue; VERIFY(nvlist_lookup_string(fslist, "name", &fsname) == 0); (void) nvlist_lookup_uint64(fslist, "origin", &origin_guid); (void) nvlist_lookup_uint64(fslist, "parentfromsnap", &parent_guid); if (parent_guid != 0) { parent_nv = fsavl_find(sdd->fsavl, parent_guid, NULL); if (!nvlist_exists(parent_nv, "sent")) { /* parent has not been sent; skip this one */ needagain = B_TRUE; continue; } } if (origin_guid != 0) { nvlist_t *origin_nv = fsavl_find(sdd->fsavl, origin_guid, NULL); if (origin_nv != NULL && !nvlist_exists(origin_nv, "sent")) { /* * origin has not been sent yet; * skip this clone. */ needagain = B_TRUE; continue; } } zhp = zfs_open(rzhp->zfs_hdl, fsname, ZFS_TYPE_DATASET); if (zhp == NULL) return (-1); err = dump_filesystem(zhp, sdd); VERIFY(nvlist_add_boolean(fslist, "sent") == 0); progress = B_TRUE; zfs_close(zhp); if (err) return (err); } if (needagain) { assert(progress); goto again; } /* clean out the sent flags in case we reuse this fss */ for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; fspair = nvlist_next_nvpair(sdd->fss, fspair)) { nvlist_t *fslist; VERIFY(nvpair_value_nvlist(fspair, &fslist) == 0); (void) nvlist_remove_all(fslist, "sent"); } return (0); } nvlist_t * zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, const char *token) { unsigned int version; int nread; unsigned long long checksum, packed_len; /* * Decode token header, which is: * -- * Note that the only supported token version is 1. 
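 * (Per the sscanf() below the header is "<version>-<checksum>-<length>-", with the checksum and length in hex, followed by the hex-encoded payload.) */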
*/ nread = sscanf(token, "%u-%llx-%llx-", &version, &checksum, &packed_len); if (nread != 3) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (invalid format)")); return (NULL); } if (version != ZFS_SEND_RESUME_TOKEN_VERSION) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (invalid version %u)"), version); return (NULL); } /* convert hexadecimal representation to binary */ token = strrchr(token, '-') + 1; int len = strlen(token) / 2; unsigned char *compressed = zfs_alloc(hdl, len); for (int i = 0; i < len; i++) { nread = sscanf(token + i * 2, "%2hhx", compressed + i); if (nread != 1) { free(compressed); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt " "(payload is not hex-encoded)")); return (NULL); } } /* verify checksum */ zio_cksum_t cksum; fletcher_4_native(compressed, len, NULL, &cksum); if (cksum.zc_word[0] != checksum) { free(compressed); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (incorrect checksum)")); return (NULL); } /* uncompress */ void *packed = zfs_alloc(hdl, packed_len); uLongf packed_len_long = packed_len; if (uncompress(packed, &packed_len_long, compressed, len) != Z_OK || packed_len_long != packed_len) { free(packed); free(compressed); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (decompression failed)")); return (NULL); } /* unpack nvlist */ nvlist_t *nv; int error = nvlist_unpack(packed, packed_len, &nv, KM_SLEEP); free(packed); free(compressed); if (error != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (nvlist_unpack failed)")); return (NULL); } return (nv); } int zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, const char *resume_token) { char errbuf[1024]; char *toname; char *fromname = NULL; uint64_t resumeobj, resumeoff, toguid, fromguid, bytes; zfs_handle_t *zhp; int error = 0; char name[ZFS_MAX_DATASET_NAME_LEN]; enum lzc_send_flags lzc_flags = 0; uint64_t size = 0; FILE *fout = (flags->verbose && flags->dryrun) ? 
stdout : stderr; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot resume send")); nvlist_t *resume_nvl = zfs_send_resume_token_to_nvlist(hdl, resume_token); if (resume_nvl == NULL) { /* * zfs_error_aux has already been set by * zfs_send_resume_token_to_nvlist */ return (zfs_error(hdl, EZFS_FAULT, errbuf)); } if (flags->verbose) { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "resume token contents:\n")); nvlist_print(fout, resume_nvl); } if (nvlist_lookup_string(resume_nvl, "toname", &toname) != 0 || nvlist_lookup_uint64(resume_nvl, "object", &resumeobj) != 0 || nvlist_lookup_uint64(resume_nvl, "offset", &resumeoff) != 0 || nvlist_lookup_uint64(resume_nvl, "bytes", &bytes) != 0 || nvlist_lookup_uint64(resume_nvl, "toguid", &toguid) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt")); return (zfs_error(hdl, EZFS_FAULT, errbuf)); } fromguid = 0; (void) nvlist_lookup_uint64(resume_nvl, "fromguid", &fromguid); if (flags->largeblock || nvlist_exists(resume_nvl, "largeblockok")) lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; if (flags->embed_data || nvlist_exists(resume_nvl, "embedok")) lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; if (flags->compress || nvlist_exists(resume_nvl, "compressok")) lzc_flags |= LZC_SEND_FLAG_COMPRESS; if (guid_to_name(hdl, toname, toguid, B_FALSE, name) != 0) { if (zfs_dataset_exists(hdl, toname, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is no longer the same snapshot used in " "the initial send"), toname); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' used in the initial send no longer exists"), toname); } return (zfs_error(hdl, EZFS_BADPATH, errbuf)); } zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "unable to access '%s'"), name); return (zfs_error(hdl, EZFS_BADPATH, errbuf)); } if (fromguid != 0) { if (guid_to_name(hdl, toname, fromguid, B_TRUE, name) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source %#llx no longer exists"), (longlong_t)fromguid); return (zfs_error(hdl, EZFS_BADPATH, errbuf)); } fromname = name; } - if (flags->progress) { + if (flags->progress || flags->verbose) { error = lzc_send_space(zhp->zfs_name, fromname, lzc_flags, &size); if (error == 0) size = MAX(0, (int64_t)(size - bytes)); } if (flags->verbose) { send_print_verbose(fout, zhp->zfs_name, fromname, size, flags->parsable); } if (!flags->dryrun) { progress_arg_t pa = { 0 }; pthread_t tid; /* * If progress reporting is requested, spawn a new thread to * poll ZFS_IOC_SEND_PROGRESS at a regular interval. 
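 * The thread runs send_progress_thread() and is cancelled and joined once lzc_send_resume() returns.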
*/ if (flags->progress) { pa.pa_zhp = zhp; pa.pa_fd = outfd; pa.pa_parsable = flags->parsable; pa.pa_size = size; pa.pa_astitle = flags->progressastitle; error = pthread_create(&tid, NULL, send_progress_thread, &pa); if (error != 0) { zfs_close(zhp); return (error); } } error = lzc_send_resume(zhp->zfs_name, fromname, outfd, lzc_flags, resumeobj, resumeoff); if (flags->progress) { (void) pthread_cancel(tid); (void) pthread_join(tid, NULL); } char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), zhp->zfs_name); zfs_close(zhp); switch (error) { case 0: return (0); case EXDEV: case ENOENT: case EDQUOT: case EFBIG: case EIO: case ENOLINK: case ENOSPC: #ifdef illumos case ENOSTR: #endif case ENXIO: case EPIPE: case ERANGE: case EFAULT: case EROFS: zfs_error_aux(hdl, strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); } } zfs_close(zhp); return (error); } /* * Generate a send stream for the dataset identified by the argument zhp. * * The content of the send stream is the snapshot identified by * 'tosnap'. Incremental streams are requested in two ways: * - from the snapshot identified by "fromsnap" (if non-null) or * - from the origin of the dataset identified by zhp, which must * be a clone. In this case, "fromsnap" is null and "fromorigin" * is TRUE. * * The send stream is recursive (i.e. dumps a hierarchy of snapshots) and * uses a special header (with a hdrtype field of DMU_COMPOUNDSTREAM) * if "replicate" is set. If "doall" is set, dump all the intermediate * snapshots. The DMU_COMPOUNDSTREAM header is used in the "doall" * case too. If "props" is set, send properties. */ int zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, sendflags_t *flags, int outfd, snapfilter_cb_t filter_func, void *cb_arg, nvlist_t **debugnvp) { char errbuf[1024]; send_dump_data_t sdd = { 0 }; int err = 0; nvlist_t *fss = NULL; avl_tree_t *fsavl = NULL; static uint64_t holdseq; int spa_version; pthread_t tid = 0; int pipefd[2]; dedup_arg_t dda = { 0 }; int featureflags = 0; FILE *fout; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot send '%s'"), zhp->zfs_name); if (fromsnap && fromsnap[0] == '\0') { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "zero-length incremental source")); return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf)); } if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM) { uint64_t version; version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); if (version >= ZPL_VERSION_SA) { featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; } } if (flags->dedup && !flags->dryrun) { featureflags |= (DMU_BACKUP_FEATURE_DEDUP | DMU_BACKUP_FEATURE_DEDUPPROPS); if ((err = pipe(pipefd)) != 0) { zfs_error_aux(zhp->zfs_hdl, strerror(errno)); return (zfs_error(zhp->zfs_hdl, EZFS_PIPEFAILED, errbuf)); } dda.outputfd = outfd; dda.inputfd = pipefd[1]; dda.dedup_hdl = zhp->zfs_hdl; if ((err = pthread_create(&tid, NULL, cksummer, &dda)) != 0) { (void) close(pipefd[0]); (void) close(pipefd[1]); zfs_error_aux(zhp->zfs_hdl, strerror(errno)); return (zfs_error(zhp->zfs_hdl, EZFS_THREADCREATEFAILED, errbuf)); } } if (flags->replicate || flags->doall || flags->props) { dmu_replay_record_t drr = { 0 }; char *packbuf = NULL; size_t buflen = 0; zio_cksum_t zc = { 0 }; if (flags->replicate || flags->props) { nvlist_t *hdrnv; VERIFY(0 == nvlist_alloc(&hdrnv, NV_UNIQUE_NAME, 0)); if (fromsnap) { VERIFY(0 == nvlist_add_string(hdrnv, "fromsnap", fromsnap)); } VERIFY(0 == 
nvlist_add_string(hdrnv, "tosnap", tosnap)); if (!flags->replicate) { VERIFY(0 == nvlist_add_boolean(hdrnv, "not_recursive")); } err = gather_nvlist(zhp->zfs_hdl, zhp->zfs_name, fromsnap, tosnap, flags->replicate, flags->verbose, &fss, &fsavl); if (err) goto err_out; VERIFY(0 == nvlist_add_nvlist(hdrnv, "fss", fss)); err = nvlist_pack(hdrnv, &packbuf, &buflen, NV_ENCODE_XDR, 0); if (debugnvp) *debugnvp = hdrnv; else nvlist_free(hdrnv); if (err) goto stderr_out; } if (!flags->dryrun) { /* write first begin record */ drr.drr_type = DRR_BEGIN; drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; DMU_SET_STREAM_HDRTYPE(drr.drr_u.drr_begin. drr_versioninfo, DMU_COMPOUNDSTREAM); DMU_SET_FEATUREFLAGS(drr.drr_u.drr_begin. drr_versioninfo, featureflags); (void) snprintf(drr.drr_u.drr_begin.drr_toname, sizeof (drr.drr_u.drr_begin.drr_toname), "%s@%s", zhp->zfs_name, tosnap); drr.drr_payloadlen = buflen; err = dump_record(&drr, packbuf, buflen, &zc, outfd); free(packbuf); if (err != 0) goto stderr_out; /* write end record */ bzero(&drr, sizeof (drr)); drr.drr_type = DRR_END; drr.drr_u.drr_end.drr_checksum = zc; err = write(outfd, &drr, sizeof (drr)); if (err == -1) { err = errno; goto stderr_out; } err = 0; } } /* dump each stream */ sdd.fromsnap = fromsnap; sdd.tosnap = tosnap; if (tid != 0) sdd.outfd = pipefd[0]; else sdd.outfd = outfd; sdd.replicate = flags->replicate; sdd.doall = flags->doall; sdd.fromorigin = flags->fromorigin; sdd.fss = fss; sdd.fsavl = fsavl; sdd.verbose = flags->verbose; sdd.parsable = flags->parsable; sdd.progress = flags->progress; sdd.progressastitle = flags->progressastitle; sdd.dryrun = flags->dryrun; sdd.large_block = flags->largeblock; sdd.embed_data = flags->embed_data; sdd.compress = flags->compress; sdd.filter_cb = filter_func; sdd.filter_cb_arg = cb_arg; if (debugnvp) sdd.debugnv = *debugnvp; if (sdd.verbose && sdd.dryrun) sdd.std_out = B_TRUE; fout = sdd.std_out ? stdout : stderr; /* * Some flags require that we place user holds on the datasets that are * being sent so they don't get destroyed during the send. We can skip * this step if the pool is imported read-only since the datasets cannot * be destroyed. */ if (!flags->dryrun && !zpool_get_prop_int(zfs_get_pool_handle(zhp), ZPOOL_PROP_READONLY, NULL) && zfs_spa_version(zhp, &spa_version) == 0 && spa_version >= SPA_VERSION_USERREFS && (flags->doall || flags->replicate)) { ++holdseq; (void) snprintf(sdd.holdtag, sizeof (sdd.holdtag), ".send-%d-%llu", getpid(), (u_longlong_t)holdseq); sdd.cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL); if (sdd.cleanup_fd < 0) { err = errno; goto stderr_out; } sdd.snapholds = fnvlist_alloc(); } else { sdd.cleanup_fd = -1; sdd.snapholds = NULL; } - if (flags->progress || sdd.snapholds != NULL) { + if (flags->progress || flags->verbose || sdd.snapholds != NULL) { /* * Do a verbose no-op dry run to get all the verbose output * or to gather snapshot hold's before generating any data, * then do a non-verbose real run to generate the streams. */ sdd.dryrun = B_TRUE; err = dump_filesystems(zhp, &sdd); if (err != 0) goto stderr_out; if (flags->verbose) { if (flags->parsable) { (void) fprintf(fout, "size\t%llu\n", (longlong_t)sdd.size); } else { char buf[16]; zfs_nicenum(sdd.size, buf, sizeof (buf)); (void) fprintf(fout, dgettext(TEXT_DOMAIN, "total estimated size is %s\n"), buf); } } /* Ensure no snaps found is treated as an error. */ if (!sdd.seento) { err = ENOENT; goto err_out; } /* Skip the second run if dryrun was requested. 
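 * (Jumping to err_out still releases fss, fsavl and any gathered snapholds.)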
*/ if (flags->dryrun) goto err_out; if (sdd.snapholds != NULL) { err = zfs_hold_nvl(zhp, sdd.cleanup_fd, sdd.snapholds); if (err != 0) goto stderr_out; fnvlist_free(sdd.snapholds); sdd.snapholds = NULL; } sdd.dryrun = B_FALSE; sdd.verbose = B_FALSE; } err = dump_filesystems(zhp, &sdd); fsavl_destroy(fsavl); nvlist_free(fss); /* Ensure no snaps found is treated as an error. */ if (err == 0 && !sdd.seento) err = ENOENT; if (tid != 0) { if (err != 0) (void) pthread_cancel(tid); (void) close(pipefd[0]); (void) pthread_join(tid, NULL); } if (sdd.cleanup_fd != -1) { VERIFY(0 == close(sdd.cleanup_fd)); sdd.cleanup_fd = -1; } if (!flags->dryrun && (flags->replicate || flags->doall || flags->props)) { /* * write final end record. NB: want to do this even if * there was some error, because it might not be totally * failed. */ dmu_replay_record_t drr = { 0 }; drr.drr_type = DRR_END; if (write(outfd, &drr, sizeof (drr)) == -1) { return (zfs_standard_error(zhp->zfs_hdl, errno, errbuf)); } } return (err || sdd.err); stderr_out: err = zfs_standard_error(zhp->zfs_hdl, err, errbuf); err_out: fsavl_destroy(fsavl); nvlist_free(fss); fnvlist_free(sdd.snapholds); if (sdd.cleanup_fd != -1) VERIFY(0 == close(sdd.cleanup_fd)); if (tid != 0) { (void) pthread_cancel(tid); (void) close(pipefd[0]); (void) pthread_join(tid, NULL); } return (err); } int zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t flags) { int err = 0; libzfs_handle_t *hdl = zhp->zfs_hdl; enum lzc_send_flags lzc_flags = 0; FILE *fout = (flags.verbose && flags.dryrun) ? stdout : stderr; char errbuf[1024]; if (flags.largeblock) lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; if (flags.embed_data) lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; if (flags.compress) lzc_flags |= LZC_SEND_FLAG_COMPRESS; if (flags.verbose) { uint64_t size = 0; err = lzc_send_space(zhp->zfs_name, from, lzc_flags, &size); if (err == 0) { send_print_verbose(fout, zhp->zfs_name, from, size, flags.parsable); } else { (void) fprintf(stderr, "Cannot estimate send size: " "%s\n", strerror(errno)); } } if (flags.dryrun) return (err); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), zhp->zfs_name); err = lzc_send(zhp->zfs_name, from, fd, lzc_flags); if (err != 0) { switch (errno) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); case ENOENT: case ESRCH: if (lzc_exists(zhp->zfs_name)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source (%s) does not exist"), from); } return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EBUSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "target is busy; if a filesystem, " "it must not be mounted")); return (zfs_error(hdl, EZFS_BUSY, errbuf)); case EDQUOT: case EFBIG: case EIO: case ENOLINK: case ENOSPC: #ifdef illumos case ENOSTR: #endif case ENXIO: case EPIPE: case ERANGE: case EFAULT: case EROFS: zfs_error_aux(hdl, strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); } } return (err != 0); } /* * Routines specific to "zfs recv" */ static int recv_read(libzfs_handle_t *hdl, int fd, void *buf, int ilen, boolean_t byteswap, zio_cksum_t *zc) { char *cp = buf; int rv; int len = ilen; assert(ilen <= SPA_MAXBLOCKSIZE); do { rv = read(fd, cp, len); cp += rv; len -= rv; } while (rv > 0); if (rv < 0 || len != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to read from stream")); return (zfs_error(hdl, EZFS_BADSTREAM, 
dgettext(TEXT_DOMAIN, "cannot receive"))); } if (zc) { if (byteswap) (void) fletcher_4_incremental_byteswap(buf, ilen, zc); else (void) fletcher_4_incremental_native(buf, ilen, zc); } return (0); } static int recv_read_nvlist(libzfs_handle_t *hdl, int fd, int len, nvlist_t **nvp, boolean_t byteswap, zio_cksum_t *zc) { char *buf; int err; buf = zfs_alloc(hdl, len); if (buf == NULL) return (ENOMEM); err = recv_read(hdl, fd, buf, len, byteswap, zc); if (err != 0) { free(buf); return (err); } err = nvlist_unpack(buf, len, nvp, 0); free(buf); if (err != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "stream (malformed nvlist)")); return (EINVAL); } return (0); } static int recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname, int baselen, char *newname, recvflags_t *flags) { static int seq; int err; prop_changelist_t *clp; zfs_handle_t *zhp; zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) return (-1); clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, flags->force ? MS_FORCE : 0); zfs_close(zhp); if (clp == NULL) return (-1); err = changelist_prefix(clp); if (err) return (err); if (tryname) { (void) strcpy(newname, tryname); if (flags->verbose) { (void) printf("attempting rename %s to %s\n", name, newname); } err = lzc_rename(name, newname); if (err == 0) changelist_rename(clp, name, tryname); } else { err = ENOENT; } if (err != 0 && strncmp(name + baselen, "recv-", 5) != 0) { seq++; (void) snprintf(newname, ZFS_MAX_DATASET_NAME_LEN, "%.*srecv-%u-%u", baselen, name, getpid(), seq); if (flags->verbose) { (void) printf("failed - trying rename %s to %s\n", name, newname); } err = lzc_rename(name, newname); if (err == 0) changelist_rename(clp, name, newname); if (err && flags->verbose) { (void) printf("failed (%u) - " "will try again on next pass\n", errno); } err = EAGAIN; } else if (flags->verbose) { if (err == 0) (void) printf("success\n"); else (void) printf("failed (%u)\n", errno); } (void) changelist_postfix(clp); changelist_free(clp); return (err); } static int recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen, char *newname, recvflags_t *flags) { int err = 0; prop_changelist_t *clp; zfs_handle_t *zhp; boolean_t defer = B_FALSE; int spa_version; zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) return (-1); clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, flags->force ? MS_FORCE : 0); if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT && zfs_spa_version(zhp, &spa_version) == 0 && spa_version >= SPA_VERSION_USERREFS) defer = B_TRUE; zfs_close(zhp); if (clp == NULL) return (-1); err = changelist_prefix(clp); if (err) return (err); if (flags->verbose) (void) printf("attempting destroy %s\n", name); if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { nvlist_t *nv = fnvlist_alloc(); fnvlist_add_boolean(nv, name); err = lzc_destroy_snaps(nv, defer, NULL); fnvlist_free(nv); } else { err = lzc_destroy(name); } if (err == 0) { if (flags->verbose) (void) printf("success\n"); changelist_remove(clp, name); } (void) changelist_postfix(clp); changelist_free(clp); /* * Deferred destroy might destroy the snapshot or only mark it to be * destroyed later, and it returns success in either case. 
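 * If the snapshot still exists after a deferred destroy, fall back to renaming it out of the way instead.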
*/ if (err != 0 || (defer && zfs_dataset_exists(hdl, name, ZFS_TYPE_SNAPSHOT))) { err = recv_rename(hdl, name, NULL, baselen, newname, flags); } return (err); } typedef struct guid_to_name_data { uint64_t guid; boolean_t bookmark_ok; char *name; char *skip; } guid_to_name_data_t; static int guid_to_name_cb(zfs_handle_t *zhp, void *arg) { guid_to_name_data_t *gtnd = arg; const char *slash; int err; if (gtnd->skip != NULL && (slash = strrchr(zhp->zfs_name, '/')) != NULL && strcmp(slash + 1, gtnd->skip) == 0) { zfs_close(zhp); return (0); } if (zfs_prop_get_int(zhp, ZFS_PROP_GUID) == gtnd->guid) { (void) strcpy(gtnd->name, zhp->zfs_name); zfs_close(zhp); return (EEXIST); } err = zfs_iter_children(zhp, guid_to_name_cb, gtnd); if (err != EEXIST && gtnd->bookmark_ok) err = zfs_iter_bookmarks(zhp, guid_to_name_cb, gtnd); zfs_close(zhp); return (err); } /* * Attempt to find the local dataset associated with this guid. In the case of * multiple matches, we attempt to find the "best" match by searching * progressively larger portions of the hierarchy. This allows one to send a * tree of datasets individually and guarantee that we will find the source * guid within that hierarchy, even if there are multiple matches elsewhere. */ static int guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid, boolean_t bookmark_ok, char *name) { char pname[ZFS_MAX_DATASET_NAME_LEN]; guid_to_name_data_t gtnd; gtnd.guid = guid; gtnd.bookmark_ok = bookmark_ok; gtnd.name = name; gtnd.skip = NULL; /* * Search progressively larger portions of the hierarchy, starting * with the filesystem specified by 'parent'. This will * select the "most local" version of the origin snapshot in the case * that there are multiple matching snapshots in the system. */ (void) strlcpy(pname, parent, sizeof (pname)); char *cp = strrchr(pname, '@'); if (cp == NULL) cp = strchr(pname, '\0'); for (; cp != NULL; cp = strrchr(pname, '/')) { /* Chop off the last component and open the parent */ *cp = '\0'; zfs_handle_t *zhp = make_dataset_handle(hdl, pname); if (zhp == NULL) continue; int err = guid_to_name_cb(zfs_handle_dup(zhp), &gtnd); if (err != EEXIST) err = zfs_iter_children(zhp, guid_to_name_cb, &gtnd); if (err != EEXIST && bookmark_ok) err = zfs_iter_bookmarks(zhp, guid_to_name_cb, &gtnd); zfs_close(zhp); if (err == EEXIST) return (0); /* * Remember the last portion of the dataset so we skip it next * time through (as we've already searched that portion of the * hierarchy). */ gtnd.skip = strrchr(pname, '/') + 1; } return (ENOENT); } /* * Return +1 if guid1 is before guid2, 0 if they are the same, and -1 if * guid1 is after guid2. 
*/ static int created_before(libzfs_handle_t *hdl, avl_tree_t *avl, uint64_t guid1, uint64_t guid2) { nvlist_t *nvfs; char *fsname, *snapname; char buf[ZFS_MAX_DATASET_NAME_LEN]; int rv; zfs_handle_t *guid1hdl, *guid2hdl; uint64_t create1, create2; if (guid2 == 0) return (0); if (guid1 == 0) return (1); nvfs = fsavl_find(avl, guid1, &snapname); VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname)); (void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname); guid1hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); if (guid1hdl == NULL) return (-1); nvfs = fsavl_find(avl, guid2, &snapname); VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname)); (void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname); guid2hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); if (guid2hdl == NULL) { zfs_close(guid1hdl); return (-1); } create1 = zfs_prop_get_int(guid1hdl, ZFS_PROP_CREATETXG); create2 = zfs_prop_get_int(guid2hdl, ZFS_PROP_CREATETXG); if (create1 < create2) rv = -1; else if (create1 > create2) rv = +1; else rv = 0; zfs_close(guid1hdl); zfs_close(guid2hdl); return (rv); } static int recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs, recvflags_t *flags, nvlist_t *stream_nv, avl_tree_t *stream_avl, nvlist_t *renamed) { nvlist_t *local_nv, *deleted = NULL; avl_tree_t *local_avl; nvpair_t *fselem, *nextfselem; char *fromsnap; char newname[ZFS_MAX_DATASET_NAME_LEN]; char guidname[32]; int error; boolean_t needagain, progress, recursive; char *s1, *s2; VERIFY(0 == nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap)); recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == ENOENT); if (flags->dryrun) return (0); again: needagain = progress = B_FALSE; VERIFY(0 == nvlist_alloc(&deleted, NV_UNIQUE_NAME, 0)); if ((error = gather_nvlist(hdl, tofs, fromsnap, NULL, recursive, B_FALSE, &local_nv, &local_avl)) != 0) return (error); /* * Process deletes and renames */ for (fselem = nvlist_next_nvpair(local_nv, NULL); fselem; fselem = nextfselem) { nvlist_t *nvfs, *snaps; nvlist_t *stream_nvfs = NULL; nvpair_t *snapelem, *nextsnapelem; uint64_t fromguid = 0; uint64_t originguid = 0; uint64_t stream_originguid = 0; uint64_t parent_fromsnap_guid, stream_parent_fromsnap_guid; char *fsname, *stream_fsname; nextfselem = nvlist_next_nvpair(local_nv, fselem); VERIFY(0 == nvpair_value_nvlist(fselem, &nvfs)); VERIFY(0 == nvlist_lookup_nvlist(nvfs, "snaps", &snaps)); VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname)); VERIFY(0 == nvlist_lookup_uint64(nvfs, "parentfromsnap", &parent_fromsnap_guid)); (void) nvlist_lookup_uint64(nvfs, "origin", &originguid); /* * First find the stream's fs, so we can check for * a different origin (due to "zfs promote") */ for (snapelem = nvlist_next_nvpair(snaps, NULL); snapelem; snapelem = nvlist_next_nvpair(snaps, snapelem)) { uint64_t thisguid; VERIFY(0 == nvpair_value_uint64(snapelem, &thisguid)); stream_nvfs = fsavl_find(stream_avl, thisguid, NULL); if (stream_nvfs != NULL) break; } /* check for promote */ (void) nvlist_lookup_uint64(stream_nvfs, "origin", &stream_originguid); if (stream_nvfs && originguid != stream_originguid) { switch (created_before(hdl, local_avl, stream_originguid, originguid)) { case 1: { /* promote it! 
*/ zfs_cmd_t zc = { 0 }; nvlist_t *origin_nvfs; char *origin_fsname; if (flags->verbose) (void) printf("promoting %s\n", fsname); origin_nvfs = fsavl_find(local_avl, originguid, NULL); VERIFY(0 == nvlist_lookup_string(origin_nvfs, "name", &origin_fsname)); (void) strlcpy(zc.zc_value, origin_fsname, sizeof (zc.zc_value)); (void) strlcpy(zc.zc_name, fsname, sizeof (zc.zc_name)); error = zfs_ioctl(hdl, ZFS_IOC_PROMOTE, &zc); if (error == 0) progress = B_TRUE; break; } default: break; case -1: fsavl_destroy(local_avl); nvlist_free(local_nv); return (-1); } /* * We had/have the wrong origin, therefore our * list of snapshots is wrong. Need to handle * them on the next pass. */ needagain = B_TRUE; continue; } for (snapelem = nvlist_next_nvpair(snaps, NULL); snapelem; snapelem = nextsnapelem) { uint64_t thisguid; char *stream_snapname; nvlist_t *found, *props; nextsnapelem = nvlist_next_nvpair(snaps, snapelem); VERIFY(0 == nvpair_value_uint64(snapelem, &thisguid)); found = fsavl_find(stream_avl, thisguid, &stream_snapname); /* check for delete */ if (found == NULL) { char name[ZFS_MAX_DATASET_NAME_LEN]; if (!flags->force) continue; (void) snprintf(name, sizeof (name), "%s@%s", fsname, nvpair_name(snapelem)); error = recv_destroy(hdl, name, strlen(fsname)+1, newname, flags); if (error) needagain = B_TRUE; else progress = B_TRUE; sprintf(guidname, "%" PRIu64, thisguid); nvlist_add_boolean(deleted, guidname); continue; } stream_nvfs = found; if (0 == nvlist_lookup_nvlist(stream_nvfs, "snapprops", &props) && 0 == nvlist_lookup_nvlist(props, stream_snapname, &props)) { zfs_cmd_t zc = { 0 }; zc.zc_cookie = B_TRUE; /* received */ (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", fsname, nvpair_name(snapelem)); if (zcmd_write_src_nvlist(hdl, &zc, props) == 0) { (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); zcmd_free_nvlists(&zc); } } /* check for different snapname */ if (strcmp(nvpair_name(snapelem), stream_snapname) != 0) { char name[ZFS_MAX_DATASET_NAME_LEN]; char tryname[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(name, sizeof (name), "%s@%s", fsname, nvpair_name(snapelem)); (void) snprintf(tryname, sizeof (name), "%s@%s", fsname, stream_snapname); error = recv_rename(hdl, name, tryname, strlen(fsname)+1, newname, flags); if (error) needagain = B_TRUE; else progress = B_TRUE; } if (strcmp(stream_snapname, fromsnap) == 0) fromguid = thisguid; } /* check for delete */ if (stream_nvfs == NULL) { if (!flags->force) continue; error = recv_destroy(hdl, fsname, strlen(tofs)+1, newname, flags); if (error) needagain = B_TRUE; else progress = B_TRUE; sprintf(guidname, "%" PRIu64, parent_fromsnap_guid); nvlist_add_boolean(deleted, guidname); continue; } if (fromguid == 0) { if (flags->verbose) { (void) printf("local fs %s does not have " "fromsnap (%s in stream); must have " "been deleted locally; ignoring\n", fsname, fromsnap); } continue; } VERIFY(0 == nvlist_lookup_string(stream_nvfs, "name", &stream_fsname)); VERIFY(0 == nvlist_lookup_uint64(stream_nvfs, "parentfromsnap", &stream_parent_fromsnap_guid)); s1 = strrchr(fsname, '/'); s2 = strrchr(stream_fsname, '/'); /* * Check if we're going to rename based on parent guid change * and the current parent guid was also deleted. If it was then * rename will fail and is likely unneeded, so avoid this and * force an early retry to determine the new * parent_fromsnap_guid. 
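* (The 'deleted' nvlist records, keyed by guid, the snapshots and filesystems destroyed earlier in this pass.)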
*/ if (stream_parent_fromsnap_guid != 0 && parent_fromsnap_guid != 0 && stream_parent_fromsnap_guid != parent_fromsnap_guid) { sprintf(guidname, "%" PRIu64, parent_fromsnap_guid); if (nvlist_exists(deleted, guidname)) { progress = B_TRUE; needagain = B_TRUE; goto doagain; } } /* * Check for rename. If the exact receive path is specified, it * does not count as a rename, but we still need to check the * datasets beneath it. */ if ((stream_parent_fromsnap_guid != 0 && parent_fromsnap_guid != 0 && stream_parent_fromsnap_guid != parent_fromsnap_guid) || ((flags->isprefix || strcmp(tofs, fsname) != 0) && (s1 != NULL) && (s2 != NULL) && strcmp(s1, s2) != 0)) { nvlist_t *parent; char tryname[ZFS_MAX_DATASET_NAME_LEN]; parent = fsavl_find(local_avl, stream_parent_fromsnap_guid, NULL); /* * NB: parent might not be found if we used the * tosnap for stream_parent_fromsnap_guid, * because the parent is a newly-created fs; * we'll be able to rename it after we recv the * new fs. */ if (parent != NULL) { char *pname; VERIFY(0 == nvlist_lookup_string(parent, "name", &pname)); (void) snprintf(tryname, sizeof (tryname), "%s%s", pname, strrchr(stream_fsname, '/')); } else { tryname[0] = '\0'; if (flags->verbose) { (void) printf("local fs %s new parent " "not found\n", fsname); } } newname[0] = '\0'; error = recv_rename(hdl, fsname, tryname, strlen(tofs)+1, newname, flags); if (renamed != NULL && newname[0] != '\0') { VERIFY(0 == nvlist_add_boolean(renamed, newname)); } if (error) needagain = B_TRUE; else progress = B_TRUE; } } doagain: fsavl_destroy(local_avl); nvlist_free(local_nv); nvlist_free(deleted); if (needagain && progress) { /* do another pass to fix up temporary names */ if (flags->verbose) (void) printf("another pass:\n"); goto again; } return (needagain); } static int zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, recvflags_t *flags, dmu_replay_record_t *drr, zio_cksum_t *zc, char **top_zfs, int cleanup_fd, uint64_t *action_handlep) { nvlist_t *stream_nv = NULL; avl_tree_t *stream_avl = NULL; char *fromsnap = NULL; char *sendsnap = NULL; char *cp; char tofs[ZFS_MAX_DATASET_NAME_LEN]; char sendfs[ZFS_MAX_DATASET_NAME_LEN]; char errbuf[1024]; dmu_replay_record_t drre; int error; boolean_t anyerr = B_FALSE; boolean_t softerr = B_FALSE; boolean_t recursive; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); assert(drr->drr_type == DRR_BEGIN); assert(drr->drr_u.drr_begin.drr_magic == DMU_BACKUP_MAGIC); assert(DMU_GET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo) == DMU_COMPOUNDSTREAM); /* * Read in the nvlist from the stream. */ if (drr->drr_payloadlen != 0) { error = recv_read_nvlist(hdl, fd, drr->drr_payloadlen, &stream_nv, flags->byteswap, zc); if (error) { error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } } recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == ENOENT); if (recursive && strchr(destname, '@')) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot specify snapshot name for multi-snapshot stream")); error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } /* * Read in the end record and verify checksum. 
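* The END record's checksum must match the running checksum accumulated while reading the BEGIN record and its nvlist payload.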
*/ if (0 != (error = recv_read(hdl, fd, &drre, sizeof (drre), flags->byteswap, NULL))) goto out; if (flags->byteswap) { drre.drr_type = BSWAP_32(drre.drr_type); drre.drr_u.drr_end.drr_checksum.zc_word[0] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[0]); drre.drr_u.drr_end.drr_checksum.zc_word[1] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[1]); drre.drr_u.drr_end.drr_checksum.zc_word[2] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[2]); drre.drr_u.drr_end.drr_checksum.zc_word[3] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[3]); } if (drre.drr_type != DRR_END) { error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } if (!ZIO_CHECKSUM_EQUAL(drre.drr_u.drr_end.drr_checksum, *zc)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incorrect header checksum")); error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } (void) nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap); if (drr->drr_payloadlen != 0) { nvlist_t *stream_fss; VERIFY(0 == nvlist_lookup_nvlist(stream_nv, "fss", &stream_fss)); if ((stream_avl = fsavl_create(stream_fss)) == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "couldn't allocate avl tree")); error = zfs_error(hdl, EZFS_NOMEM, errbuf); goto out; } if (fromsnap != NULL && recursive) { nvlist_t *renamed = NULL; nvpair_t *pair = NULL; (void) strlcpy(tofs, destname, sizeof (tofs)); if (flags->isprefix) { struct drr_begin *drrb = &drr->drr_u.drr_begin; int i; if (flags->istail) { cp = strrchr(drrb->drr_toname, '/'); if (cp == NULL) { (void) strlcat(tofs, "/", sizeof (tofs)); i = 0; } else { i = (cp - drrb->drr_toname); } } else { i = strcspn(drrb->drr_toname, "/@"); } /* zfs_receive_one() will create_parents() */ (void) strlcat(tofs, &drrb->drr_toname[i], sizeof (tofs)); *strchr(tofs, '@') = '\0'; } if (!flags->dryrun && !flags->nomount) { VERIFY(0 == nvlist_alloc(&renamed, NV_UNIQUE_NAME, 0)); } softerr = recv_incremental_replication(hdl, tofs, flags, stream_nv, stream_avl, renamed); /* Unmount renamed filesystems before receiving. */ while ((pair = nvlist_next_nvpair(renamed, pair)) != NULL) { zfs_handle_t *zhp; prop_changelist_t *clp = NULL; zhp = zfs_open(hdl, nvpair_name(pair), ZFS_TYPE_FILESYSTEM); if (zhp != NULL) { clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT, 0, 0); zfs_close(zhp); if (clp != NULL) { softerr |= changelist_prefix(clp); changelist_free(clp); } } } nvlist_free(renamed); } } /* * Get the fs specified by the first path in the stream (the top level * specified by 'zfs send') and pass it to each invocation of * zfs_receive_one(). */ (void) strlcpy(sendfs, drr->drr_u.drr_begin.drr_toname, sizeof (sendfs)); if ((cp = strchr(sendfs, '@')) != NULL) { *cp = '\0'; /* * Find the "sendsnap", the final snapshot in a replication * stream. zfs_receive_one() handles certain errors * differently, depending on if the contained stream is the * last one or not. */ sendsnap = (cp + 1); } /* Finally, receive each contained stream */ do { /* * we should figure out if it has a recoverable * error, in which case do a recv_skip() and drive on. * Note, if we fail due to already having this guid, * zfs_receive_one() will take care of it (ie, * recv_skip() and return 0). */ error = zfs_receive_impl(hdl, destname, NULL, flags, fd, sendfs, stream_nv, stream_avl, top_zfs, cleanup_fd, action_handlep, sendsnap); if (error == ENODATA) { error = 0; break; } anyerr |= error; } while (error == 0); if (drr->drr_payloadlen != 0 && recursive && fromsnap != NULL) { /* * Now that we have the fs's they sent us, try the * renames again. 
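* (The first pass may have left datasets under temporary names because their new parents had not been received yet.)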
*/ softerr = recv_incremental_replication(hdl, tofs, flags, stream_nv, stream_avl, NULL); } out: fsavl_destroy(stream_avl); nvlist_free(stream_nv); if (softerr) error = -2; if (anyerr) error = -1; return (error); } static void trunc_prop_errs(int truncated) { ASSERT(truncated != 0); if (truncated == 1) (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "1 more property could not be set\n")); else (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "%d more properties could not be set\n"), truncated); } static int recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) { dmu_replay_record_t *drr; void *buf = zfs_alloc(hdl, SPA_MAXBLOCKSIZE); char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive:")); /* XXX would be great to use lseek if possible... */ drr = buf; while (recv_read(hdl, fd, drr, sizeof (dmu_replay_record_t), byteswap, NULL) == 0) { if (byteswap) drr->drr_type = BSWAP_32(drr->drr_type); switch (drr->drr_type) { case DRR_BEGIN: if (drr->drr_payloadlen != 0) { (void) recv_read(hdl, fd, buf, drr->drr_payloadlen, B_FALSE, NULL); } break; case DRR_END: free(buf); return (0); case DRR_OBJECT: if (byteswap) { drr->drr_u.drr_object.drr_bonuslen = BSWAP_32(drr->drr_u.drr_object. drr_bonuslen); } (void) recv_read(hdl, fd, buf, P2ROUNDUP(drr->drr_u.drr_object.drr_bonuslen, 8), B_FALSE, NULL); break; case DRR_WRITE: if (byteswap) { drr->drr_u.drr_write.drr_logical_size = BSWAP_64( drr->drr_u.drr_write.drr_logical_size); drr->drr_u.drr_write.drr_compressed_size = BSWAP_64( drr->drr_u.drr_write.drr_compressed_size); } uint64_t payload_size = DRR_WRITE_PAYLOAD_SIZE(&drr->drr_u.drr_write); (void) recv_read(hdl, fd, buf, payload_size, B_FALSE, NULL); break; case DRR_SPILL: if (byteswap) { drr->drr_u.drr_spill.drr_length = BSWAP_64(drr->drr_u.drr_spill.drr_length); } (void) recv_read(hdl, fd, buf, drr->drr_u.drr_spill.drr_length, B_FALSE, NULL); break; case DRR_WRITE_EMBEDDED: if (byteswap) { drr->drr_u.drr_write_embedded.drr_psize = BSWAP_32(drr->drr_u.drr_write_embedded. drr_psize); } (void) recv_read(hdl, fd, buf, P2ROUNDUP(drr->drr_u.drr_write_embedded.drr_psize, 8), B_FALSE, NULL); break; case DRR_WRITE_BYREF: case DRR_FREEOBJECTS: case DRR_FREE: break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid record type")); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } } free(buf); return (-1); } static void recv_ecksum_set_aux(libzfs_handle_t *hdl, const char *target_snap, boolean_t resumable) { char target_fs[ZFS_MAX_DATASET_NAME_LEN]; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "checksum mismatch or incomplete stream")); if (!resumable) return; (void) strlcpy(target_fs, target_snap, sizeof (target_fs)); *strchr(target_fs, '@') = '\0'; zfs_handle_t *zhp = zfs_open(hdl, target_fs, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) return; char token_buf[ZFS_MAXPROPLEN]; int error = zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, token_buf, sizeof (token_buf), NULL, NULL, 0, B_TRUE); if (error == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "checksum mismatch or incomplete stream.\n" "Partially received snapshot is saved.\n" "A resuming stream can be generated on the sending " "system by running:\n" " zfs send -t %s"), token_buf); } zfs_close(zhp); } /* * Restores a backup of tosnap from the file descriptor specified by infd. 
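* This handles a single substream; compound (replication) streams are broken up by zfs_receive_package() and reach this function through zfs_receive_impl().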
*/ static int zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, const char *originsnap, recvflags_t *flags, dmu_replay_record_t *drr, dmu_replay_record_t *drr_noswap, const char *sendfs, nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd, uint64_t *action_handlep, const char *finalsnap) { zfs_cmd_t zc = { 0 }; time_t begin_time; int ioctl_err, ioctl_errno, err; char *cp; struct drr_begin *drrb = &drr->drr_u.drr_begin; char errbuf[1024]; char prop_errbuf[1024]; const char *chopprefix; boolean_t newfs = B_FALSE; boolean_t stream_wantsnewfs; uint64_t parent_snapguid = 0; prop_changelist_t *clp = NULL; nvlist_t *snapprops_nvlist = NULL; zprop_errflags_t prop_errflags; boolean_t recursive; char *snapname = NULL; begin_time = time(NULL); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == ENOENT); if (stream_avl != NULL) { nvlist_t *fs = fsavl_find(stream_avl, drrb->drr_toguid, &snapname); nvlist_t *props; int ret; (void) nvlist_lookup_uint64(fs, "parentfromsnap", &parent_snapguid); err = nvlist_lookup_nvlist(fs, "props", &props); if (err) VERIFY(0 == nvlist_alloc(&props, NV_UNIQUE_NAME, 0)); if (flags->canmountoff) { VERIFY(0 == nvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_CANMOUNT), 0)); } ret = zcmd_write_src_nvlist(hdl, &zc, props); if (err) nvlist_free(props); if (0 == nvlist_lookup_nvlist(fs, "snapprops", &props)) { VERIFY(0 == nvlist_lookup_nvlist(props, snapname, &snapprops_nvlist)); } if (ret != 0) return (-1); } cp = NULL; /* * Determine how much of the snapshot name stored in the stream * we are going to tack on to the name they specified on the * command line, and how much we are going to chop off. * * If they specified a snapshot, chop the entire name stored in * the stream. */ if (flags->istail) { /* * A filesystem was specified with -e. We want to tack on only * the tail of the sent snapshot path. */ if (strchr(tosnap, '@')) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "argument - snapshot not allowed with -e")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } chopprefix = strrchr(sendfs, '/'); if (chopprefix == NULL) { /* * The tail is the poolname, so we need to * prepend a path separator. */ int len = strlen(drrb->drr_toname); cp = malloc(len + 2); cp[0] = '/'; (void) strcpy(&cp[1], drrb->drr_toname); chopprefix = cp; } else { chopprefix = drrb->drr_toname + (chopprefix - sendfs); } } else if (flags->isprefix) { /* * A filesystem was specified with -d. We want to tack on * everything but the first element of the sent snapshot path * (all but the pool name). */ if (strchr(tosnap, '@')) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "argument - snapshot not allowed with -d")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } chopprefix = strchr(drrb->drr_toname, '/'); if (chopprefix == NULL) chopprefix = strchr(drrb->drr_toname, '@'); } else if (strchr(tosnap, '@') == NULL) { /* * If a filesystem was specified without -d or -e, we want to * tack on everything after the fs specified by 'zfs send'. */ chopprefix = drrb->drr_toname + strlen(sendfs); } else { /* A snapshot was specified as an exact path (no -d or -e). 
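* In that case nothing from the stream's snapshot name is appended; the destination is used exactly as given on the command line.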
*/ if (recursive) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot specify snapshot name for multi-snapshot " "stream")); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } chopprefix = drrb->drr_toname + strlen(drrb->drr_toname); } ASSERT(strstr(drrb->drr_toname, sendfs) == drrb->drr_toname); ASSERT(chopprefix > drrb->drr_toname); ASSERT(chopprefix <= drrb->drr_toname + strlen(drrb->drr_toname)); ASSERT(chopprefix[0] == '/' || chopprefix[0] == '@' || chopprefix[0] == '\0'); /* * Determine name of destination snapshot, store in zc_value. */ (void) strcpy(zc.zc_value, tosnap); (void) strncat(zc.zc_value, chopprefix, sizeof (zc.zc_value)); #ifdef __FreeBSD__ if (zfs_ioctl_version == ZFS_IOCVER_UNDEF) zfs_ioctl_version = get_zfs_ioctl_version(); /* * For forward compatibility hide tosnap in zc_value */ if (zfs_ioctl_version < ZFS_IOCVER_LZC) (void) strcpy(zc.zc_value + strlen(zc.zc_value) + 1, tosnap); #endif free(cp); if (!zfs_name_valid(zc.zc_value, ZFS_TYPE_SNAPSHOT)) { zcmd_free_nvlists(&zc); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } /* * Determine the name of the origin snapshot, store in zc_string. */ if (originsnap) { (void) strncpy(zc.zc_string, originsnap, sizeof (zc.zc_string)); if (flags->verbose) (void) printf("using provided clone origin %s\n", zc.zc_string); } else if (drrb->drr_flags & DRR_FLAG_CLONE) { if (guid_to_name(hdl, zc.zc_value, drrb->drr_fromguid, B_FALSE, zc.zc_string) != 0) { zcmd_free_nvlists(&zc); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "local origin for clone %s does not exist"), zc.zc_value); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } if (flags->verbose) (void) printf("found clone origin %s\n", zc.zc_string); } boolean_t resuming = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_RESUMING; stream_wantsnewfs = (drrb->drr_fromguid == 0 || (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap) && !resuming; if (stream_wantsnewfs) { /* * if the parent fs does not exist, look for it based on * the parent snap GUID */ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive new filesystem stream")); (void) strcpy(zc.zc_name, zc.zc_value); cp = strrchr(zc.zc_name, '/'); if (cp) *cp = '\0'; if (cp && !zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) { char suffix[ZFS_MAX_DATASET_NAME_LEN]; (void) strcpy(suffix, strrchr(zc.zc_value, '/')); if (guid_to_name(hdl, zc.zc_name, parent_snapguid, B_FALSE, zc.zc_value) == 0) { *strchr(zc.zc_value, '@') = '\0'; (void) strcat(zc.zc_value, suffix); } } } else { /* * If the fs does not exist, look for it based on the * fromsnap GUID. */ if (resuming) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive resume stream")); } else { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive incremental stream")); } (void) strcpy(zc.zc_name, zc.zc_value); *strchr(zc.zc_name, '@') = '\0'; /* * If the exact receive path was specified and this is the * topmost path in the stream, then if the fs does not exist we * should look no further. 
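* Otherwise we also try to locate the destination by the stream's fromsnap guid using guid_to_name().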
*/ if ((flags->isprefix || (*(chopprefix = drrb->drr_toname + strlen(sendfs)) != '\0' && *chopprefix != '@')) && !zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) { char snap[ZFS_MAX_DATASET_NAME_LEN]; (void) strcpy(snap, strchr(zc.zc_value, '@')); if (guid_to_name(hdl, zc.zc_name, drrb->drr_fromguid, B_FALSE, zc.zc_value) == 0) { *strchr(zc.zc_value, '@') = '\0'; (void) strcat(zc.zc_value, snap); } } } (void) strcpy(zc.zc_name, zc.zc_value); *strchr(zc.zc_name, '@') = '\0'; if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) { zfs_handle_t *zhp; /* * Destination fs exists. It must be one of these cases: * - an incremental send stream * - the stream specifies a new fs (full stream or clone) * and they want us to blow away the existing fs (and * have therefore specified -F and removed any snapshots) * - we are resuming a failed receive. */ if (stream_wantsnewfs) { if (!flags->force) { zcmd_free_nvlists(&zc); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination '%s' exists\n" "must specify -F to overwrite it"), zc.zc_name); return (zfs_error(hdl, EZFS_EXISTS, errbuf)); } if (ioctl(hdl->libzfs_fd, ZFS_IOC_SNAPSHOT_LIST_NEXT, &zc) == 0) { zcmd_free_nvlists(&zc); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination has snapshots (eg. %s)\n" "must destroy them to overwrite it"), zc.zc_name); return (zfs_error(hdl, EZFS_EXISTS, errbuf)); } } if ((zhp = zfs_open(hdl, zc.zc_name, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) == NULL) { zcmd_free_nvlists(&zc); return (-1); } if (stream_wantsnewfs && zhp->zfs_dmustats.dds_origin[0]) { zcmd_free_nvlists(&zc); zfs_close(zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination '%s' is a clone\n" "must destroy it to overwrite it"), zc.zc_name); return (zfs_error(hdl, EZFS_EXISTS, errbuf)); } if (!flags->dryrun && zhp->zfs_type == ZFS_TYPE_FILESYSTEM && stream_wantsnewfs) { /* We can't do online recv in this case */ clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, 0); if (clp == NULL) { zfs_close(zhp); zcmd_free_nvlists(&zc); return (-1); } if (changelist_prefix(clp) != 0) { changelist_free(clp); zfs_close(zhp); zcmd_free_nvlists(&zc); return (-1); } } /* * If we are resuming a newfs, set newfs here so that we will * mount it if the recv succeeds this time. We can tell * that it was a newfs on the first recv because the fs * itself will be inconsistent (if the fs existed when we * did the first recv, we would have received it into * .../%recv). */ if (resuming && zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT)) newfs = B_TRUE; zfs_close(zhp); } else { /* * Destination filesystem does not exist. Therefore we better * be creating a new filesystem (either from a full backup, or * a clone). It would therefore be invalid if the user * specified only the pool name (i.e. if the destination name * contained no slash character). */ if (!stream_wantsnewfs || (cp = strrchr(zc.zc_name, '/')) == NULL) { zcmd_free_nvlists(&zc); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination '%s' does not exist"), zc.zc_name); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } /* * Trim off the final dataset component so we perform the * recvbackup ioctl to the filesystems's parent. 
*/ *cp = '\0'; if (flags->isprefix && !flags->istail && !flags->dryrun && create_parents(hdl, zc.zc_value, strlen(tosnap)) != 0) { zcmd_free_nvlists(&zc); return (zfs_error(hdl, EZFS_BADRESTORE, errbuf)); } newfs = B_TRUE; } zc.zc_begin_record = *drr_noswap; zc.zc_cookie = infd; zc.zc_guid = flags->force; zc.zc_resumable = flags->resumable; if (flags->verbose) { (void) printf("%s %s stream of %s into %s\n", flags->dryrun ? "would receive" : "receiving", drrb->drr_fromguid ? "incremental" : "full", drrb->drr_toname, zc.zc_value); (void) fflush(stdout); } if (flags->dryrun) { zcmd_free_nvlists(&zc); return (recv_skip(hdl, infd, flags->byteswap)); } zc.zc_nvlist_dst = (uint64_t)(uintptr_t)prop_errbuf; zc.zc_nvlist_dst_size = sizeof (prop_errbuf); zc.zc_cleanup_fd = cleanup_fd; zc.zc_action_handle = *action_handlep; err = ioctl_err = zfs_ioctl(hdl, ZFS_IOC_RECV, &zc); ioctl_errno = errno; prop_errflags = (zprop_errflags_t)zc.zc_obj; if (err == 0) { nvlist_t *prop_errors; VERIFY(0 == nvlist_unpack((void *)(uintptr_t)zc.zc_nvlist_dst, zc.zc_nvlist_dst_size, &prop_errors, 0)); nvpair_t *prop_err = NULL; while ((prop_err = nvlist_next_nvpair(prop_errors, prop_err)) != NULL) { char tbuf[1024]; zfs_prop_t prop; int intval; prop = zfs_name_to_prop(nvpair_name(prop_err)); (void) nvpair_value_int32(prop_err, &intval); if (strcmp(nvpair_name(prop_err), ZPROP_N_MORE_ERRORS) == 0) { trunc_prop_errs(intval); break; } else if (snapname == NULL || finalsnap == NULL || strcmp(finalsnap, snapname) == 0 || strcmp(nvpair_name(prop_err), zfs_prop_to_name(ZFS_PROP_REFQUOTA)) != 0) { /* * Skip the special case of, for example, * "refquota", errors on intermediate * snapshots leading up to a final one. * That's why we have all of the checks above. * * See zfs_ioctl.c's extract_delay_props() for * a list of props which can fail on * intermediate snapshots, but shouldn't * affect the overall receive. */ (void) snprintf(tbuf, sizeof (tbuf), dgettext(TEXT_DOMAIN, "cannot receive %s property on %s"), nvpair_name(prop_err), zc.zc_name); zfs_setprop_error(hdl, prop, intval, tbuf); } } nvlist_free(prop_errors); } zc.zc_nvlist_dst = 0; zc.zc_nvlist_dst_size = 0; zcmd_free_nvlists(&zc); if (err == 0 && snapprops_nvlist) { zfs_cmd_t zc2 = { 0 }; (void) strcpy(zc2.zc_name, zc.zc_value); zc2.zc_cookie = B_TRUE; /* received */ if (zcmd_write_src_nvlist(hdl, &zc2, snapprops_nvlist) == 0) { (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc2); zcmd_free_nvlists(&zc2); } } if (err && (ioctl_errno == ENOENT || ioctl_errno == EEXIST)) { /* * It may be that this snapshot already exists, * in which case we want to consume & ignore it * rather than failing. */ avl_tree_t *local_avl; nvlist_t *local_nv, *fs; cp = strchr(zc.zc_value, '@'); /* * XXX Do this faster by just iterating over snaps in * this fs. Also if zc_value does not exist, we will * get a strange "does not exist" error message. 
*/ *cp = '\0'; if (gather_nvlist(hdl, zc.zc_value, NULL, NULL, B_FALSE, B_FALSE, &local_nv, &local_avl) == 0) { *cp = '@'; fs = fsavl_find(local_avl, drrb->drr_toguid, NULL); fsavl_destroy(local_avl); nvlist_free(local_nv); if (fs != NULL) { if (flags->verbose) { (void) printf("snap %s already exists; " "ignoring\n", zc.zc_value); } err = ioctl_err = recv_skip(hdl, infd, flags->byteswap); } } *cp = '@'; } if (ioctl_err != 0) { switch (ioctl_errno) { case ENODEV: cp = strchr(zc.zc_value, '@'); *cp = '\0'; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "most recent snapshot of %s does not\n" "match incremental source"), zc.zc_value); (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf); *cp = '@'; break; case ETXTBSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination %s has been modified\n" "since most recent snapshot"), zc.zc_name); (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf); break; case EEXIST: cp = strchr(zc.zc_value, '@'); if (newfs) { /* it's the containing fs that exists */ *cp = '\0'; } zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination already exists")); (void) zfs_error_fmt(hdl, EZFS_EXISTS, dgettext(TEXT_DOMAIN, "cannot restore to %s"), zc.zc_value); *cp = '@'; break; case EINVAL: (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ECKSUM: recv_ecksum_set_aux(hdl, zc.zc_value, flags->resumable); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to receive this stream.")); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case EDQUOT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination %s space quota exceeded"), zc.zc_name); (void) zfs_error(hdl, EZFS_NOSPC, errbuf); break; default: (void) zfs_standard_error(hdl, ioctl_errno, errbuf); } } /* * Mount the target filesystem (if created). Also mount any * children of the target filesystem if we did a replication * receive (indicated by stream_avl being non-NULL). */ cp = strchr(zc.zc_value, '@'); if (cp && (ioctl_err == 0 || !newfs)) { zfs_handle_t *h; *cp = '\0'; h = zfs_open(hdl, zc.zc_value, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (h != NULL) { if (h->zfs_type == ZFS_TYPE_VOLUME) { *cp = '@'; } else if (newfs || stream_avl) { /* * Track the first/top of hierarchy fs, * for mounting and sharing later. 
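* (zfs_receive() mounts and shares *top_zfs once the entire stream has been processed.)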
*/ if (top_zfs && *top_zfs == NULL) *top_zfs = zfs_strdup(hdl, zc.zc_value); } zfs_close(h); } *cp = '@'; } if (clp) { if (!flags->nomount) err |= changelist_postfix(clp); changelist_free(clp); } if (prop_errflags & ZPROP_ERR_NOCLEAR) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: " "failed to clear unreceived properties on %s"), zc.zc_name); (void) fprintf(stderr, "\n"); } if (prop_errflags & ZPROP_ERR_NORESTORE) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: " "failed to restore original properties on %s"), zc.zc_name); (void) fprintf(stderr, "\n"); } if (err || ioctl_err) return (-1); *action_handlep = zc.zc_action_handle; if (flags->verbose) { char buf1[64]; char buf2[64]; uint64_t bytes = zc.zc_cookie; time_t delta = time(NULL) - begin_time; if (delta == 0) delta = 1; zfs_nicenum(bytes, buf1, sizeof (buf1)); zfs_nicenum(bytes/delta, buf2, sizeof (buf1)); (void) printf("received %sB stream in %lu seconds (%sB/sec)\n", buf1, delta, buf2); } return (0); } static int zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, const char *originsnap, recvflags_t *flags, int infd, const char *sendfs, nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd, uint64_t *action_handlep, const char *finalsnap) { int err; dmu_replay_record_t drr, drr_noswap; struct drr_begin *drrb = &drr.drr_u.drr_begin; char errbuf[1024]; zio_cksum_t zcksum = { 0 }; uint64_t featureflags; int hdrtype; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); if (flags->isprefix && !zfs_dataset_exists(hdl, tosnap, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified fs " "(%s) does not exist"), tosnap); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } if (originsnap && !zfs_dataset_exists(hdl, originsnap, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified origin fs " "(%s) does not exist"), originsnap); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } /* read in the BEGIN record */ if (0 != (err = recv_read(hdl, infd, &drr, sizeof (drr), B_FALSE, &zcksum))) return (err); if (drr.drr_type == DRR_END || drr.drr_type == BSWAP_32(DRR_END)) { /* It's the double end record at the end of a package */ return (ENODATA); } /* the kernel needs the non-byteswapped begin record */ drr_noswap = drr; flags->byteswap = B_FALSE; if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { /* * We computed the checksum in the wrong byteorder in * recv_read() above; do it again correctly. 
*/ bzero(&zcksum, sizeof (zio_cksum_t)); (void) fletcher_4_incremental_byteswap(&drr, sizeof (drr), &zcksum); flags->byteswap = B_TRUE; drr.drr_type = BSWAP_32(drr.drr_type); drr.drr_payloadlen = BSWAP_32(drr.drr_payloadlen); drrb->drr_magic = BSWAP_64(drrb->drr_magic); drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); drrb->drr_type = BSWAP_32(drrb->drr_type); drrb->drr_flags = BSWAP_32(drrb->drr_flags); drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); } if (drrb->drr_magic != DMU_BACKUP_MAGIC || drr.drr_type != DRR_BEGIN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "stream (bad magic number)")); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); hdrtype = DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo); if (!DMU_STREAM_SUPPORTED(featureflags) || (hdrtype != DMU_SUBSTREAM && hdrtype != DMU_COMPOUNDSTREAM)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "stream has unsupported feature, feature flags = %lx"), featureflags); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } if (strchr(drrb->drr_toname, '@') == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "stream (bad snapshot name)")); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_SUBSTREAM) { char nonpackage_sendfs[ZFS_MAX_DATASET_NAME_LEN]; if (sendfs == NULL) { /* * We were not called from zfs_receive_package(). Get * the fs specified by 'zfs send'. */ char *cp; (void) strlcpy(nonpackage_sendfs, drr.drr_u.drr_begin.drr_toname, sizeof (nonpackage_sendfs)); if ((cp = strchr(nonpackage_sendfs, '@')) != NULL) *cp = '\0'; sendfs = nonpackage_sendfs; VERIFY(finalsnap == NULL); } return (zfs_receive_one(hdl, infd, tosnap, originsnap, flags, &drr, &drr_noswap, sendfs, stream_nv, stream_avl, top_zfs, cleanup_fd, action_handlep, finalsnap)); } else { assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_COMPOUNDSTREAM); return (zfs_receive_package(hdl, infd, tosnap, flags, &drr, &zcksum, top_zfs, cleanup_fd, action_handlep)); } } /* * Restores a backup of tosnap from the file descriptor specified by infd. * Return 0 on total success, -2 if some things couldn't be * destroyed/renamed/promoted, -1 if some things couldn't be received. * (-1 will override -2, if -1 and the resumable flag was specified the * transfer can be resumed if the sending side supports it). 
*/ int zfs_receive(libzfs_handle_t *hdl, const char *tosnap, nvlist_t *props, recvflags_t *flags, int infd, avl_tree_t *stream_avl) { char *top_zfs = NULL; int err; int cleanup_fd; uint64_t action_handle = 0; char *originsnap = NULL; if (props) { err = nvlist_lookup_string(props, "origin", &originsnap); if (err && err != ENOENT) return (err); } cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL); VERIFY(cleanup_fd >= 0); err = zfs_receive_impl(hdl, tosnap, originsnap, flags, infd, NULL, NULL, stream_avl, &top_zfs, cleanup_fd, &action_handle, NULL); VERIFY(0 == close(cleanup_fd)); if (err == 0 && !flags->nomount && top_zfs) { zfs_handle_t *zhp; prop_changelist_t *clp; zhp = zfs_open(hdl, top_zfs, ZFS_TYPE_FILESYSTEM); if (zhp != NULL) { clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT, CL_GATHER_MOUNT_ALWAYS, 0); zfs_close(zhp); if (clp != NULL) { /* mount and share received datasets */ err = changelist_postfix(clp); changelist_free(clp); } } if (zhp == NULL || clp == NULL || err) err = -1; } if (top_zfs) free(top_zfs); return (err); } Index: projects/clang900-import/cddl/contrib/opensolaris/lib/libzfs =================================================================== --- projects/clang900-import/cddl/contrib/opensolaris/lib/libzfs (revision 352586) +++ projects/clang900-import/cddl/contrib/opensolaris/lib/libzfs (revision 352587) Property changes on: projects/clang900-import/cddl/contrib/opensolaris/lib/libzfs ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/cddl/contrib/opensolaris/lib/libzfs:r352537-352586 Index: projects/clang900-import/cddl/contrib/opensolaris =================================================================== --- projects/clang900-import/cddl/contrib/opensolaris (revision 352586) +++ projects/clang900-import/cddl/contrib/opensolaris (revision 352587) Property changes on: projects/clang900-import/cddl/contrib/opensolaris ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/cddl/contrib/opensolaris:r352537-352586 Index: projects/clang900-import/cddl =================================================================== --- projects/clang900-import/cddl (revision 352586) +++ projects/clang900-import/cddl (revision 352587) Property changes on: projects/clang900-import/cddl ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/cddl:r352537-352586 Index: projects/clang900-import/contrib/ntp/ntpd/ntpd.c =================================================================== --- projects/clang900-import/contrib/ntp/ntpd/ntpd.c (revision 352586) +++ projects/clang900-import/contrib/ntp/ntpd/ntpd.c (revision 352587) @@ -1,1760 +1,1760 @@ /* * ntpd.c - main program for the fixed point NTP daemon */ #ifdef HAVE_CONFIG_H # include #endif #include "ntp_machine.h" #include "ntpd.h" #include "ntp_io.h" #include "ntp_stdlib.h" #include #include "ntp_config.h" #include "ntp_syslog.h" #include "ntp_assert.h" #include "isc/error.h" #include "isc/strerror.h" #include "isc/formatcheck.h" #include "iosignal.h" #ifdef SIM # include "ntpsim.h" #endif #include "ntp_libopts.h" #include "ntpd-opts.h" /* there's a short treatise below what the thread stuff is for. * [Bug 2954] enable the threading warm-up only for Linux. 
*/ #if defined(HAVE_PTHREADS) && HAVE_PTHREADS && !defined(NO_THREADS) # ifdef HAVE_PTHREAD_H # include # endif # if defined(linux) # define NEED_PTHREAD_WARMUP # endif #endif #ifdef HAVE_UNISTD_H # include #endif #ifdef HAVE_SYS_STAT_H # include #endif #include #ifdef HAVE_SYS_PARAM_H # include #endif #ifdef HAVE_SYS_SIGNAL_H # include #else # include #endif #ifdef HAVE_SYS_IOCTL_H # include #endif /* HAVE_SYS_IOCTL_H */ #if defined(HAVE_RTPRIO) # ifdef HAVE_SYS_LOCK_H # include # endif # include #else # ifdef HAVE_PLOCK # ifdef HAVE_SYS_LOCK_H # include # endif # endif #endif #if defined(HAVE_SCHED_SETSCHEDULER) # ifdef HAVE_SCHED_H # include # else # ifdef HAVE_SYS_SCHED_H # include # endif # endif #endif #if defined(HAVE_SYS_MMAN_H) # include #endif #ifdef HAVE_TERMIOS_H # include #endif #ifdef SYS_DOMAINOS # include #endif /* SYS_DOMAINOS */ #include "recvbuff.h" #include "ntp_cmdargs.h" #if 0 /* HMS: I don't think we need this. 961223 */ #ifdef LOCK_PROCESS # ifdef SYS_SOLARIS # include # else # include # endif #endif #endif #ifdef SYS_WINNT # include "ntservice.h" #endif #ifdef _AIX # include #endif /* _AIX */ #ifdef SCO5_CLOCK # include #endif #ifdef HAVE_DROPROOT # include # include # include #ifdef HAVE_LINUX_CAPABILITIES # include # include #endif /* HAVE_LINUX_CAPABILITIES */ #if defined(HAVE_PRIV_H) && defined(HAVE_SOLARIS_PRIVS) # include #endif /* HAVE_PRIV_H */ #if defined(HAVE_TRUSTEDBSD_MAC) # include #endif /* HAVE_TRUSTEDBSD_MAC */ #endif /* HAVE_DROPROOT */ #if defined (LIBSECCOMP) && (KERN_SECCOMP) /* # include */ # include # include #endif /* LIBSECCOMP and KERN_SECCOMP */ #ifdef HAVE_DNSREGISTRATION # include DNSServiceRef mdns; #endif #ifdef HAVE_SETPGRP_0 # define ntp_setpgrp(x, y) setpgrp() #else # define ntp_setpgrp(x, y) setpgrp(x, y) #endif #ifdef HAVE_SOLARIS_PRIVS # define LOWPRIVS "basic,sys_time,net_privaddr,proc_setid,!proc_info,!proc_session,!proc_exec" static priv_set_t *lowprivs = NULL; static priv_set_t *highprivs = NULL; #endif /* HAVE_SOLARIS_PRIVS */ /* * Scheduling priority we run at */ #define NTPD_PRIO (-12) int priority_done = 2; /* 0 - Set priority */ /* 1 - priority is OK where it is */ /* 2 - Don't set priority */ /* 1 and 2 are pretty much the same */ int listen_to_virtual_ips = TRUE; /* * No-fork flag. If set, we do not become a background daemon. */ int nofork; /* Fork by default */ #ifdef HAVE_DNSREGISTRATION /* * mDNS registration flag. If set, we attempt to register with the mDNS system, but only * after we have synched the first time. If the attempt fails, then try again once per * minute for up to 5 times. After all, we may be starting before mDNS. */ int mdnsreg = FALSE; int mdnstries = 5; #endif /* HAVE_DNSREGISTRATION */ #ifdef HAVE_DROPROOT int droproot; int root_dropped; char *user; /* User to switch to */ char *group; /* group to switch to */ const char *chrootdir; /* directory to chroot to */ uid_t sw_uid; gid_t sw_gid; struct group *gr; struct passwd *pw; #endif /* HAVE_DROPROOT */ #ifdef HAVE_WORKING_FORK int waitsync_fd_to_close = -1; /* -w/--wait-sync */ #endif /* * Version declaration */ extern const char *Version; char const *progname; int was_alarmed; #ifdef DECL_SYSCALL /* * We put this here, since the argument profile is syscall-specific */ extern int syscall (int, ...); #endif /* DECL_SYSCALL */ #if !defined(SIM) && defined(SIGDIE1) static volatile int signalled = 0; static volatile int signo = 0; /* In an ideal world, 'finish_safe()' would declared as noreturn... 
*/ static void finish_safe (int); static RETSIGTYPE finish (int); #endif #if !defined(SIM) && defined(HAVE_WORKING_FORK) static int wait_child_sync_if (int, long); #endif #if !defined(SIM) && !defined(SYS_WINNT) # ifdef DEBUG static RETSIGTYPE moredebug (int); static RETSIGTYPE lessdebug (int); # else /* !DEBUG follows */ static RETSIGTYPE no_debug (int); # endif /* !DEBUG */ #endif /* !SIM && !SYS_WINNT */ #ifndef WORK_FORK int saved_argc; char ** saved_argv; #endif #ifndef SIM int ntpdmain (int, char **); static void set_process_priority (void); static void assertion_failed (const char *, int, isc_assertiontype_t, const char *) __attribute__ ((__noreturn__)); static void library_fatal_error (const char *, int, const char *, va_list) ISC_FORMAT_PRINTF(3, 0); static void library_unexpected_error(const char *, int, const char *, va_list) ISC_FORMAT_PRINTF(3, 0); #endif /* !SIM */ /* Bug2332 unearthed a problem in the interaction of reduced user * privileges, the limits on memory usage and some versions of the * pthread library on Linux systems. The 'pthread_cancel()' function and * likely some others need to track the stack of the thread involved, * and uses a function that comes from GCC (--> libgcc_s.so) to do * this. Unfortunately the developers of glibc decided to load the * library on demand, which speeds up program start but can cause * trouble here: Due to all the things NTPD does to limit its resource * usage, this deferred load of libgcc_s does not always work once the * restrictions are in effect. * * One way out of this was attempting a forced link against libgcc_s * when possible because it makes the library available immediately * without deferred load. (The symbol resolution would still be dynamic * and on demand, but the code would already be in the process image.) * * This is a tricky thing to do, since it's not necessary everywhere, * not possible everywhere, has shown to break the build of other * programs in the NTP suite and is now generally frowned upon. * * So we take a different approach here: We creat a worker thread that does * actually nothing except waiting for cancellation and cancel it. If * this is done before all the limitations are put in place, the * machinery is pre-heated and all the runtime stuff should be in place * and useable when needed. * * This uses only the standard pthread API and should work with all * implementations of pthreads. It is not necessary everywhere, but it's * cheap enough to go on nearly unnoticed. * * Addendum: Bug 2954 showed that the assumption that this should work * with all OS is wrong -- at least FreeBSD bombs heavily. */ #ifdef NEED_PTHREAD_WARMUP /* simple thread function: sleep until cancelled, just to exercise * thread cancellation. */ static void* my_pthread_warmup_worker( void *thread_args) { (void)thread_args; for (;;) sleep(10); return NULL; } /* pre-heat threading: create a thread and cancel it, just to exercise * thread cancellation. 
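* (As explained in the longer comment above, this pulls libgcc_s into the process image before resource limits and privilege drops can make its on-demand load fail.)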
*/ static void my_pthread_warmup(void) { pthread_t thread; pthread_attr_t thr_attr; int rc; pthread_attr_init(&thr_attr); #if defined(HAVE_PTHREAD_ATTR_GETSTACKSIZE) && \ defined(HAVE_PTHREAD_ATTR_SETSTACKSIZE) && \ defined(PTHREAD_STACK_MIN) { size_t ssmin = 32*1024; /* 32kB should be minimum */ if (ssmin < PTHREAD_STACK_MIN) ssmin = PTHREAD_STACK_MIN; rc = pthread_attr_setstacksize(&thr_attr, ssmin); if (0 != rc) msyslog(LOG_ERR, "my_pthread_warmup: pthread_attr_setstacksize() -> %s", strerror(rc)); } #endif rc = pthread_create( &thread, &thr_attr, my_pthread_warmup_worker, NULL); pthread_attr_destroy(&thr_attr); if (0 != rc) { msyslog(LOG_ERR, "my_pthread_warmup: pthread_create() -> %s", strerror(rc)); } else { pthread_cancel(thread); pthread_join(thread, NULL); } } #endif /*defined(NEED_PTHREAD_WARMUP)*/ #ifdef NEED_EARLY_FORK static void dummy_callback(void) { return; } static void fork_nonchroot_worker(void) { getaddrinfo_sometime("localhost", "ntp", NULL, INITIAL_DNS_RETRY, (gai_sometime_callback)&dummy_callback, NULL); } #endif /* NEED_EARLY_FORK */ void parse_cmdline_opts( int * pargc, char ***pargv ) { static int parsed; static int optct; if (!parsed) optct = ntpOptionProcess(&ntpdOptions, *pargc, *pargv); parsed = 1; *pargc -= optct; *pargv += optct; } #ifdef SIM int main( int argc, char *argv[] ) { progname = argv[0]; parse_cmdline_opts(&argc, &argv); #ifdef DEBUG debug = OPT_VALUE_SET_DEBUG_LEVEL; DPRINTF(1, ("%s\n", Version)); #endif return ntpsim(argc, argv); } #else /* !SIM follows */ #ifdef NO_MAIN_ALLOWED CALL(ntpd,"ntpd",ntpdmain); #else /* !NO_MAIN_ALLOWED follows */ #ifndef SYS_WINNT int main( int argc, char *argv[] ) { return ntpdmain(argc, argv); } #endif /* !SYS_WINNT */ #endif /* !NO_MAIN_ALLOWED */ #endif /* !SIM */ #ifdef _AIX /* * OK. AIX is different than solaris in how it implements plock(). * If you do NOT adjust the stack limit, you will get the MAXIMUM * stack size allocated and PINNED with you program. To check the * value, use ulimit -a. * * To fix this, we create an automatic variable and set our stack limit * to that PLUS 32KB of extra space (we need some headroom). * * This subroutine gets the stack address. * * Grover Davidson and Matt Ladendorf * */ static char * get_aix_stack(void) { char ch; return (&ch); } /* * Signal handler for SIGDANGER. */ static void catch_danger(int signo) { msyslog(LOG_INFO, "ntpd: setpgid(): %m"); /* Make the system believe we'll free something, but don't do it! */ return; } #endif /* _AIX */ /* * Set the process priority */ #ifndef SIM static void set_process_priority(void) { # ifdef DEBUG if (debug > 1) msyslog(LOG_DEBUG, "set_process_priority: %s: priority_done is <%d>", ((priority_done) ? 
"Leave priority alone" : "Attempt to set priority" ), priority_done); # endif /* DEBUG */ # if defined(HAVE_SCHED_SETSCHEDULER) if (!priority_done) { extern int config_priority_override, config_priority; int pmax, pmin; struct sched_param sched; pmax = sched_get_priority_max(SCHED_FIFO); sched.sched_priority = pmax; if ( config_priority_override ) { pmin = sched_get_priority_min(SCHED_FIFO); if ( config_priority > pmax ) sched.sched_priority = pmax; else if ( config_priority < pmin ) sched.sched_priority = pmin; else sched.sched_priority = config_priority; } if ( sched_setscheduler(0, SCHED_FIFO, &sched) == -1 ) msyslog(LOG_ERR, "sched_setscheduler(): %m"); else ++priority_done; } # endif /* HAVE_SCHED_SETSCHEDULER */ # ifdef HAVE_RTPRIO # ifdef RTP_SET if (!priority_done) { struct rtprio srtp; srtp.type = RTP_PRIO_REALTIME; /* was: RTP_PRIO_NORMAL */ srtp.prio = 0; /* 0 (hi) -> RTP_PRIO_MAX (31,lo) */ if (rtprio(RTP_SET, getpid(), &srtp) < 0) msyslog(LOG_ERR, "rtprio() error: %m"); else ++priority_done; } # else /* !RTP_SET follows */ if (!priority_done) { if (rtprio(0, 120) < 0) msyslog(LOG_ERR, "rtprio() error: %m"); else ++priority_done; } # endif /* !RTP_SET */ # endif /* HAVE_RTPRIO */ # if defined(NTPD_PRIO) && NTPD_PRIO != 0 # ifdef HAVE_ATT_NICE if (!priority_done) { errno = 0; if (-1 == nice (NTPD_PRIO) && errno != 0) msyslog(LOG_ERR, "nice() error: %m"); else ++priority_done; } # endif /* HAVE_ATT_NICE */ # ifdef HAVE_BSD_NICE if (!priority_done) { if (-1 == setpriority(PRIO_PROCESS, 0, NTPD_PRIO)) msyslog(LOG_ERR, "setpriority() error: %m"); else ++priority_done; } # endif /* HAVE_BSD_NICE */ # endif /* NTPD_PRIO && NTPD_PRIO != 0 */ if (!priority_done) msyslog(LOG_ERR, "set_process_priority: No way found to improve our priority"); } #endif /* !SIM */ #if !defined(SIM) && !defined(SYS_WINNT) /* * Detach from terminal (much like daemon()) * Nothe that this function calls exit() */ # ifdef HAVE_WORKING_FORK static void detach_from_terminal( int pipe_fds[2], long wait_sync, const char *logfilename ) { int rc; int exit_code; # if !defined(HAVE_SETSID) && !defined (HAVE_SETPGID) && defined(TIOCNOTTY) int fid; # endif # ifdef _AIX struct sigaction sa; # endif rc = fork(); if (-1 == rc) { exit_code = (errno) ? errno : -1; msyslog(LOG_ERR, "fork: %m"); exit(exit_code); } if (rc > 0) { /* parent */ exit_code = wait_child_sync_if(pipe_fds[0], wait_sync); exit(exit_code); } /* * child/daemon * close all open files excepting waitsync_fd_to_close. * msyslog() unreliable until after init_logging(). */ closelog(); if (syslog_file != NULL) { fclose(syslog_file); syslog_file = NULL; syslogit = TRUE; } close_all_except(waitsync_fd_to_close); INSIST(0 == open("/dev/null", 0) && 1 == dup2(0, 1) \ && 2 == dup2(0, 2)); init_logging(progname, 0, TRUE); /* we lost our logfile (if any) daemonizing */ setup_logfile(logfilename); # ifdef SYS_DOMAINOS { uid_$t puid; status_$t st; proc2_$who_am_i(&puid); proc2_$make_server(&puid, &st); } # endif /* SYS_DOMAINOS */ # ifdef HAVE_SETSID if (setsid() == (pid_t)-1) msyslog(LOG_ERR, "setsid(): %m"); # elif defined(HAVE_SETPGID) if (setpgid(0, 0) == -1) msyslog(LOG_ERR, "setpgid(): %m"); # else /* !HAVE_SETSID && !HAVE_SETPGID follows */ # ifdef TIOCNOTTY fid = open("/dev/tty", 2); if (fid >= 0) { ioctl(fid, (u_long)TIOCNOTTY, NULL); close(fid); } # endif /* TIOCNOTTY */ ntp_setpgrp(0, getpid()); # endif /* !HAVE_SETSID && !HAVE_SETPGID */ # ifdef _AIX /* Don't get killed by low-on-memory signal. 
*/ sa.sa_handler = catch_danger; sigemptyset(&sa.sa_mask); sa.sa_flags = SA_RESTART; sigaction(SIGDANGER, &sa, NULL); # endif /* _AIX */ return; } # endif /* HAVE_WORKING_FORK */ #ifdef HAVE_DROPROOT /* * Map user name/number to user ID */ static int map_user( ) { char *endp; if (isdigit((unsigned char)*user)) { sw_uid = (uid_t)strtoul(user, &endp, 0); if (*endp != '\0') goto getuser; if ((pw = getpwuid(sw_uid)) != NULL) { free(user); user = estrdup(pw->pw_name); sw_gid = pw->pw_gid; } else { errno = 0; msyslog(LOG_ERR, "Cannot find user ID %s", user); return 0; } } else { getuser: errno = 0; if ((pw = getpwnam(user)) != NULL) { sw_uid = pw->pw_uid; sw_gid = pw->pw_gid; } else { if (errno) msyslog(LOG_ERR, "getpwnam(%s) failed: %m", user); else msyslog(LOG_ERR, "Cannot find user `%s'", user); return 0; } } return 1; } /* * Map group name/number to group ID */ static int map_group(void) { char *endp; if (isdigit((unsigned char)*group)) { sw_gid = (gid_t)strtoul(group, &endp, 0); if (*endp != '\0') goto getgroup; } else { getgroup: if ((gr = getgrnam(group)) != NULL) { sw_gid = gr->gr_gid; } else { errno = 0; msyslog(LOG_ERR, "Cannot find group `%s'", group); return 0; } } return 1; } static int set_group_ids(void) { if (user && initgroups(user, sw_gid)) { msyslog(LOG_ERR, "Cannot initgroups() to user `%s': %m", user); return 0; } if (group && setgid(sw_gid)) { msyslog(LOG_ERR, "Cannot setgid() to group `%s': %m", group); return 0; } if (group && setegid(sw_gid)) { msyslog(LOG_ERR, "Cannot setegid() to group `%s': %m", group); return 0; } if (group) { if (0 != setgroups(1, &sw_gid)) { msyslog(LOG_ERR, "setgroups(1, %d) failed: %m", sw_gid); return 0; } } else if (pw) if (0 != initgroups(pw->pw_name, pw->pw_gid)) { msyslog(LOG_ERR, "initgroups(<%s>, %d) filed: %m", pw->pw_name, pw->pw_gid); return 0; } return 1; } static int set_user_ids(void) { if (user && setuid(sw_uid)) { msyslog(LOG_ERR, "Cannot setuid() to user `%s': %m", user); return 0; } if (user && seteuid(sw_uid)) { msyslog(LOG_ERR, "Cannot seteuid() to user `%s': %m", user); return 0; } return 1; } /* * Change (effective) user and group IDs, also initialize the supplementary group access list */ int set_user_group_ids(void); int set_user_group_ids(void) { /* If the the user was already mapped, no need to map it again */ if ((NULL != user) && (0 == sw_uid)) { if (0 == map_user()) exit (-1); } /* same applies for the group */ if ((NULL != group) && (0 == sw_gid)) { if (0 == map_group()) exit (-1); } if (getegid() != sw_gid && 0 == set_group_ids()) return 0; if (geteuid() != sw_uid && 0 == set_user_ids()) return 0; return 1; } #endif /* HAVE_DROPROOT */ #endif /* !SIM */ /* * Main program. Initialize us, disconnect us from the tty if necessary, * and loop waiting for I/O and/or timer expiries. 
*/ #ifndef SIM int ntpdmain( int argc, char *argv[] ) { l_fp now; struct recvbuf *rbuf; const char * logfilename; # ifdef HAVE_UMASK mode_t uv; # endif # if defined(HAVE_GETUID) && !defined(MPE) /* MPE lacks the concept of root */ uid_t uid; # endif # if defined(HAVE_WORKING_FORK) long wait_sync = 0; int pipe_fds[2]; int rc; int exit_code; # endif /* HAVE_WORKING_FORK*/ # ifdef SCO5_CLOCK int fd; int zero; # endif # ifdef NEED_PTHREAD_WARMUP my_pthread_warmup(); # endif # ifdef HAVE_UMASK uv = umask(0); if (uv) umask(uv); else umask(022); # endif saved_argc = argc; saved_argv = argv; progname = argv[0]; initializing = TRUE; /* mark that we are initializing */ parse_cmdline_opts(&argc, &argv); # ifdef DEBUG debug = OPT_VALUE_SET_DEBUG_LEVEL; # ifdef HAVE_SETLINEBUF setlinebuf(stdout); # endif # endif if (HAVE_OPT(NOFORK) || HAVE_OPT(QUIT) # ifdef DEBUG || debug # endif || HAVE_OPT(SAVECONFIGQUIT)) nofork = TRUE; init_logging(progname, NLOG_SYNCMASK, TRUE); /* honor -l/--logfile option to log to a file */ if (HAVE_OPT(LOGFILE)) { logfilename = OPT_ARG(LOGFILE); syslogit = FALSE; change_logfile(logfilename, FALSE); } else { logfilename = NULL; if (nofork) msyslog_term = TRUE; if (HAVE_OPT(SAVECONFIGQUIT)) syslogit = FALSE; } msyslog(LOG_NOTICE, "%s: Starting", Version); { int i; char buf[1024]; /* Secret knowledge of msyslog buf length */ char *cp = buf; /* Note that every arg has an initial space character */ snprintf(cp, sizeof(buf), "Command line:"); cp += strlen(cp); for (i = 0; i < saved_argc ; ++i) { snprintf(cp, sizeof(buf) - (cp - buf), " %s", saved_argv[i]); cp += strlen(cp); } msyslog(LOG_INFO, "%s", buf); } /* * Install trap handlers to log errors and assertion failures. * Default handlers print to stderr which doesn't work if detached. */ isc_assertion_setcallback(assertion_failed); isc_error_setfatal(library_fatal_error); isc_error_setunexpected(library_unexpected_error); /* MPE lacks the concept of root */ # if defined(HAVE_GETUID) && !defined(MPE) uid = getuid(); if (uid && !HAVE_OPT( SAVECONFIGQUIT ) # if defined(HAVE_TRUSTEDBSD_MAC) /* We can run as non-root if the mac_ntpd policy is enabled. */ && mac_is_present("ntpd") != 1 # endif ) { msyslog_term = TRUE; msyslog(LOG_ERR, "must be run as root, not uid %ld", (long)uid); exit(1); } # endif /* * Enable the Multi-Media Timer for Windows? */ # ifdef SYS_WINNT if (HAVE_OPT( MODIFYMMTIMER )) set_mm_timer(MM_TIMER_HIRES); # endif #ifdef HAVE_DNSREGISTRATION /* * Enable mDNS registrations? */ if (HAVE_OPT( MDNS )) { mdnsreg = TRUE; } #endif /* HAVE_DNSREGISTRATION */ if (HAVE_OPT( NOVIRTUALIPS )) listen_to_virtual_ips = 0; /* * --interface, listen on specified interfaces */ if (HAVE_OPT( INTERFACE )) { int ifacect = STACKCT_OPT( INTERFACE ); const char** ifaces = STACKLST_OPT( INTERFACE ); sockaddr_u addr; while (ifacect-- > 0) { add_nic_rule( is_ip_address(*ifaces, AF_UNSPEC, &addr) ? MATCH_IFADDR : MATCH_IFNAME, *ifaces, -1, ACTION_LISTEN); ifaces++; } } if (HAVE_OPT( NICE )) priority_done = 0; # ifdef HAVE_SCHED_SETSCHEDULER if (HAVE_OPT( PRIORITY )) { config_priority = OPT_VALUE_PRIORITY; config_priority_override = 1; priority_done = 0; } # endif # ifdef HAVE_WORKING_FORK /* make sure the FDs are initialised */ pipe_fds[0] = -1; pipe_fds[1] = -1; do { /* 'loop' once */ if (!HAVE_OPT( WAIT_SYNC )) break; wait_sync = OPT_VALUE_WAIT_SYNC; if (wait_sync <= 0) { wait_sync = 0; break; } /* -w requires a fork() even with debug > 0 */ nofork = FALSE; if (pipe(pipe_fds)) { exit_code = (errno) ? 
errno : -1; msyslog(LOG_ERR, "Pipe creation failed for --wait-sync: %m"); exit(exit_code); } waitsync_fd_to_close = pipe_fds[1]; } while (0); /* 'loop' once */ # endif /* HAVE_WORKING_FORK */ init_lib(); # ifdef SYS_WINNT /* * Make sure the service is initialized before we do anything else */ ntservice_init(); /* * Start interpolation thread, must occur before first * get_systime() */ init_winnt_time(); # endif /* * Initialize random generator and public key pair */ get_systime(&now); ntp_srandom((int)(now.l_i * now.l_uf)); /* * Detach us from the terminal. May need an #ifndef GIZMO. */ if (!nofork) { # ifdef HAVE_WORKING_FORK detach_from_terminal(pipe_fds, wait_sync, logfilename); # endif /* HAVE_WORKING_FORK */ } # ifdef SCO5_CLOCK /* * SCO OpenServer's system clock offers much more precise timekeeping * on the base CPU than the other CPUs (for multiprocessor systems), * so we must lock to the base CPU. */ fd = open("/dev/at1", O_RDONLY); if (fd >= 0) { zero = 0; if (ioctl(fd, ACPU_LOCK, &zero) < 0) msyslog(LOG_ERR, "cannot lock to base CPU: %m"); close(fd); } # endif /* Setup stack size in preparation for locking pages in memory. */ # if defined(HAVE_MLOCKALL) # ifdef HAVE_SETRLIMIT ntp_rlimit(RLIMIT_STACK, DFLT_RLIMIT_STACK * 4096, 4096, "4k"); -# ifdef RLIMIT_MEMLOCK +# if defined(RLIMIT_MEMLOCK) && defined(DFLT_RLIMIT_MEMLOCK) && DFLT_RLIMIT_MEMLOCK != -1 /* * The default RLIMIT_MEMLOCK is very low on Linux systems. * Unless we increase this limit malloc calls are likely to * fail if we drop root privilege. To be useful the value * has to be larger than the largest ntpd resident set size. */ ntp_rlimit(RLIMIT_MEMLOCK, DFLT_RLIMIT_MEMLOCK * 1024 * 1024, 1024 * 1024, "MB"); # endif /* RLIMIT_MEMLOCK */ # endif /* HAVE_SETRLIMIT */ # else /* !HAVE_MLOCKALL follows */ # ifdef HAVE_PLOCK # ifdef PROCLOCK # ifdef _AIX /* * set the stack limit for AIX for plock(). * see get_aix_stack() for more info. */ if (ulimit(SET_STACKLIM, (get_aix_stack() - 8 * 4096)) < 0) msyslog(LOG_ERR, "Cannot adjust stack limit for plock: %m"); # endif /* _AIX */ # endif /* PROCLOCK */ # endif /* HAVE_PLOCK */ # endif /* !HAVE_MLOCKALL */ /* * Set up signals we pay attention to locally. */ # ifdef SIGDIE1 signal_no_reset(SIGDIE1, finish); signal_no_reset(SIGDIE2, finish); signal_no_reset(SIGDIE3, finish); signal_no_reset(SIGDIE4, finish); # endif # ifdef SIGBUS signal_no_reset(SIGBUS, finish); # endif # if !defined(SYS_WINNT) && !defined(VMS) # ifdef DEBUG (void) signal_no_reset(MOREDEBUGSIG, moredebug); (void) signal_no_reset(LESSDEBUGSIG, lessdebug); # else (void) signal_no_reset(MOREDEBUGSIG, no_debug); (void) signal_no_reset(LESSDEBUGSIG, no_debug); # endif /* DEBUG */ # endif /* !SYS_WINNT && !VMS */ /* * Set up signals we should never pay attention to. */ # ifdef SIGPIPE signal_no_reset(SIGPIPE, SIG_IGN); # endif /* * Call the init_ routines to initialize the data structures. * * Exactly what command-line options are we expecting here? */ INIT_SSL(); init_auth(); init_util(); init_restrict(); init_mon(); init_timer(); init_request(); init_control(); init_peer(); # ifdef REFCLOCK init_refclock(); # endif set_process_priority(); init_proto(); /* Call at high priority */ init_io(); init_loopfilter(); mon_start(MON_ON); /* monitor on by default now */ /* turn off in config if unwanted */ /* * Get the configuration. This is done in a separate module * since this will definitely be different for the gizmo board. 
*/ getconfig(argc, argv); if (-1 == cur_memlock) { # if defined(HAVE_MLOCKALL) /* * lock the process into memory */ if ( !HAVE_OPT(SAVECONFIGQUIT) # ifdef RLIMIT_MEMLOCK && -1 != DFLT_RLIMIT_MEMLOCK # endif && 0 != mlockall(MCL_CURRENT|MCL_FUTURE)) msyslog(LOG_ERR, "mlockall(): %m"); # else /* !HAVE_MLOCKALL follows */ # ifdef HAVE_PLOCK # ifdef PROCLOCK /* * lock the process into memory */ if (!HAVE_OPT(SAVECONFIGQUIT) && 0 != plock(PROCLOCK)) msyslog(LOG_ERR, "plock(PROCLOCK): %m"); # else /* !PROCLOCK follows */ # ifdef TXTLOCK /* * Lock text into ram */ if (!HAVE_OPT(SAVECONFIGQUIT) && 0 != plock(TXTLOCK)) msyslog(LOG_ERR, "plock(TXTLOCK) error: %m"); # else /* !TXTLOCK follows */ msyslog(LOG_ERR, "plock() - don't know what to lock!"); # endif /* !TXTLOCK */ # endif /* !PROCLOCK */ # endif /* HAVE_PLOCK */ # endif /* !HAVE_MLOCKALL */ } loop_config(LOOP_DRIFTINIT, 0); report_event(EVNT_SYSRESTART, NULL, NULL); initializing = FALSE; # ifdef HAVE_DROPROOT if (droproot) { #ifdef NEED_EARLY_FORK fork_nonchroot_worker(); #endif /* Drop super-user privileges and chroot now if the OS supports this */ # ifdef HAVE_LINUX_CAPABILITIES /* set flag: keep privileges across setuid() call (we only really need cap_sys_time): */ if (prctl( PR_SET_KEEPCAPS, 1L, 0L, 0L, 0L ) == -1) { msyslog( LOG_ERR, "prctl( PR_SET_KEEPCAPS, 1L ) failed: %m" ); exit(-1); } # elif HAVE_SOLARIS_PRIVS /* Nothing to do here */ # else /* we need a user to switch to */ if (user == NULL) { msyslog(LOG_ERR, "Need user name to drop root privileges (see -u flag!)" ); exit(-1); } # endif /* HAVE_LINUX_CAPABILITIES || HAVE_SOLARIS_PRIVS */ if (user != NULL) { if (0 == map_user()) exit (-1); } if (group != NULL) { if (0 == map_group()) exit (-1); } if (chrootdir ) { /* make sure cwd is inside the jail: */ if (chdir(chrootdir)) { msyslog(LOG_ERR, "Cannot chdir() to `%s': %m", chrootdir); exit (-1); } if (chroot(chrootdir)) { msyslog(LOG_ERR, "Cannot chroot() to `%s': %m", chrootdir); exit (-1); } if (chdir("/")) { msyslog(LOG_ERR, "Cannot chdir() to root after chroot(): %m"); exit (-1); } } # ifdef HAVE_SOLARIS_PRIVS if ((lowprivs = priv_str_to_set(LOWPRIVS, ",", NULL)) == NULL) { msyslog(LOG_ERR, "priv_str_to_set() failed:%m"); exit(-1); } if ((highprivs = priv_allocset()) == NULL) { msyslog(LOG_ERR, "priv_allocset() failed:%m"); exit(-1); } (void) getppriv(PRIV_PERMITTED, highprivs); (void) priv_intersect(highprivs, lowprivs); if (setppriv(PRIV_SET, PRIV_PERMITTED, lowprivs) == -1) { msyslog(LOG_ERR, "setppriv() failed:%m"); exit(-1); } # endif /* HAVE_SOLARIS_PRIVS */ if (0 == set_user_group_ids()) exit(-1); # if defined(HAVE_TRUSTEDBSD_MAC) /* * To manipulate system time and (re-)bind to NTP_PORT as needed * following interface changes, we must either run as uid 0 or * the mac_ntpd policy module must be enabled.
*/ if (sw_uid != 0 && mac_is_present("ntpd") != 1) { msyslog(LOG_ERR, "Need MAC 'ntpd' policy enabled to drop root privileges"); exit (-1); } # elif !defined(HAVE_LINUX_CAPABILITIES) && !defined(HAVE_SOLARIS_PRIVS) /* * for now assume that the privilege to bind to privileged ports * is associated with running with uid 0 - should be refined on * ports that allow binding to NTP_PORT with uid != 0 */ disable_dynamic_updates |= (sw_uid != 0); /* also notifies routing message listener */ # endif /* !HAVE_LINUX_CAPABILITIES && !HAVE_SOLARIS_PRIVS */ if (disable_dynamic_updates && interface_interval) { interface_interval = 0; msyslog(LOG_INFO, "running as non-root disables dynamic interface tracking"); } # ifdef HAVE_LINUX_CAPABILITIES { /* * We may be running under non-root uid now, but we still hold full root privileges! * We drop all of them, except for the crucial one or two: cap_sys_time and * cap_net_bind_service if doing dynamic interface tracking. */ cap_t caps; char *captext; captext = (0 != interface_interval) ? "cap_sys_time,cap_net_bind_service=pe" : "cap_sys_time=pe"; caps = cap_from_text(captext); if (!caps) { msyslog(LOG_ERR, "cap_from_text(%s) failed: %m", captext); exit(-1); } if (-1 == cap_set_proc(caps)) { msyslog(LOG_ERR, "cap_set_proc() failed to drop root privs: %m"); exit(-1); } cap_free(caps); } # endif /* HAVE_LINUX_CAPABILITIES */ # ifdef HAVE_SOLARIS_PRIVS if (priv_delset(lowprivs, "proc_setid") == -1) { msyslog(LOG_ERR, "priv_delset() failed:%m"); exit(-1); } if (setppriv(PRIV_SET, PRIV_PERMITTED, lowprivs) == -1) { msyslog(LOG_ERR, "setppriv() failed:%m"); exit(-1); } priv_freeset(lowprivs); priv_freeset(highprivs); # endif /* HAVE_SOLARIS_PRIVS */ root_dropped = TRUE; fork_deferred_worker(); } /* if (droproot) */ # endif /* HAVE_DROPROOT */ /* libseccomp sandboxing */ #if defined (LIBSECCOMP) && (KERN_SECCOMP) scmp_filter_ctx ctx; if ((ctx = seccomp_init(SCMP_ACT_KILL)) < 0) msyslog(LOG_ERR, "%s: seccomp_init(SCMP_ACT_KILL) failed: %m", __func__); else { msyslog(LOG_DEBUG, "%s: seccomp_init(SCMP_ACT_KILL) succeeded", __func__); } #ifdef __x86_64__ int scmp_sc[] = { SCMP_SYS(adjtimex), SCMP_SYS(bind), SCMP_SYS(brk), SCMP_SYS(chdir), SCMP_SYS(clock_gettime), SCMP_SYS(clock_settime), SCMP_SYS(close), SCMP_SYS(connect), SCMP_SYS(exit_group), SCMP_SYS(fstat), SCMP_SYS(fsync), SCMP_SYS(futex), SCMP_SYS(getitimer), SCMP_SYS(getsockname), SCMP_SYS(ioctl), SCMP_SYS(lseek), SCMP_SYS(madvise), SCMP_SYS(mmap), SCMP_SYS(munmap), SCMP_SYS(open), SCMP_SYS(poll), SCMP_SYS(read), SCMP_SYS(recvmsg), SCMP_SYS(rename), SCMP_SYS(rt_sigaction), SCMP_SYS(rt_sigprocmask), SCMP_SYS(rt_sigreturn), SCMP_SYS(select), SCMP_SYS(sendto), SCMP_SYS(setitimer), SCMP_SYS(setsid), SCMP_SYS(socket), SCMP_SYS(stat), SCMP_SYS(time), SCMP_SYS(write), }; #endif #ifdef __i386__ int scmp_sc[] = { SCMP_SYS(_newselect), SCMP_SYS(adjtimex), SCMP_SYS(brk), SCMP_SYS(chdir), SCMP_SYS(clock_gettime), SCMP_SYS(clock_settime), SCMP_SYS(close), SCMP_SYS(exit_group), SCMP_SYS(fsync), SCMP_SYS(futex), SCMP_SYS(getitimer), SCMP_SYS(madvise), SCMP_SYS(mmap), SCMP_SYS(mmap2), SCMP_SYS(munmap), SCMP_SYS(open), SCMP_SYS(poll), SCMP_SYS(read), SCMP_SYS(rename), SCMP_SYS(rt_sigaction), SCMP_SYS(rt_sigprocmask), SCMP_SYS(select), SCMP_SYS(setitimer), SCMP_SYS(setsid), SCMP_SYS(sigprocmask), SCMP_SYS(sigreturn), SCMP_SYS(socketcall), SCMP_SYS(stat64), SCMP_SYS(time), SCMP_SYS(write), }; #endif { int i; for (i = 0; i < COUNTOF(scmp_sc); i++) { if (seccomp_rule_add(ctx, SCMP_ACT_ALLOW, scmp_sc[i], 0) < 0) { msyslog(LOG_ERR, "%s:
seccomp_rule_add() failed: %m", __func__); } } } if (seccomp_load(ctx) < 0) msyslog(LOG_ERR, "%s: seccomp_load() failed: %m", __func__); else { msyslog(LOG_DEBUG, "%s: seccomp_load() succeeded", __func__); } #endif /* LIBSECCOMP and KERN_SECCOMP */ #ifdef SYS_WINNT ntservice_isup(); #endif # ifdef HAVE_IO_COMPLETION_PORT for (;;) { #if !defined(SIM) && defined(SIGDIE1) if (signalled) finish_safe(signo); #endif GetReceivedBuffers(); # else /* normal I/O */ BLOCK_IO_AND_ALARM(); was_alarmed = FALSE; for (;;) { #if !defined(SIM) && defined(SIGDIE1) if (signalled) finish_safe(signo); #endif if (alarm_flag) { /* alarmed? */ was_alarmed = TRUE; alarm_flag = FALSE; } /* collect async name/addr results */ if (!was_alarmed) harvest_blocking_responses(); if (!was_alarmed && !has_full_recv_buffer()) { /* * Nothing to do. Wait for something. */ io_handler(); } if (alarm_flag) { /* alarmed? */ was_alarmed = TRUE; alarm_flag = FALSE; } if (was_alarmed) { UNBLOCK_IO_AND_ALARM(); /* * Out here, signals are unblocked. Call timer routine * to process expiry. */ timer(); was_alarmed = FALSE; BLOCK_IO_AND_ALARM(); } # endif /* !HAVE_IO_COMPLETION_PORT */ # ifdef DEBUG_TIMING { l_fp pts; l_fp tsa, tsb; int bufcount = 0; get_systime(&pts); tsa = pts; # endif rbuf = get_full_recv_buffer(); while (rbuf != NULL) { if (alarm_flag) { was_alarmed = TRUE; alarm_flag = FALSE; } UNBLOCK_IO_AND_ALARM(); if (was_alarmed) { /* avoid timer starvation during lengthy I/O handling */ timer(); was_alarmed = FALSE; } /* * Call the data procedure to handle each received * packet. */ if (rbuf->receiver != NULL) { # ifdef DEBUG_TIMING l_fp dts = pts; L_SUB(&dts, &rbuf->recv_time); DPRINTF(2, ("processing timestamp delta %s (with prec. fuzz)\n", lfptoa(&dts, 9))); collect_timing(rbuf, "buffer processing delay", 1, &dts); bufcount++; # endif (*rbuf->receiver)(rbuf); } else { msyslog(LOG_ERR, "fatal: receive buffer callback NULL"); abort(); } BLOCK_IO_AND_ALARM(); freerecvbuf(rbuf); rbuf = get_full_recv_buffer(); } # ifdef DEBUG_TIMING get_systime(&tsb); L_SUB(&tsb, &tsa); if (bufcount) { collect_timing(NULL, "processing", bufcount, &tsb); DPRINTF(2, ("processing time for %d buffers %s\n", bufcount, lfptoa(&tsb, 9))); } } # endif /* * Go around again */ # ifdef HAVE_DNSREGISTRATION if (mdnsreg && (current_time - mdnsreg ) > 60 && mdnstries && sys_leap != LEAP_NOTINSYNC) { mdnsreg = current_time; msyslog(LOG_INFO, "Attempting to register mDNS"); if ( DNSServiceRegister (&mdns, 0, 0, NULL, "_ntp._udp", NULL, NULL, htons(NTP_PORT), 0, NULL, NULL, NULL) != kDNSServiceErr_NoError ) { if (!--mdnstries) { msyslog(LOG_ERR, "Unable to register mDNS, giving up."); } else { msyslog(LOG_INFO, "Unable to register mDNS, will try later."); } } else { msyslog(LOG_INFO, "mDNS service registered."); mdnsreg = FALSE; } } # endif /* HAVE_DNSREGISTRATION */ } UNBLOCK_IO_AND_ALARM(); return 1; } #endif /* !SIM */ #if !defined(SIM) && defined(SIGDIE1) /* * finish - exit gracefully */ static void finish_safe( int sig ) { const char *sig_desc; sig_desc = NULL; #ifdef HAVE_STRSIGNAL sig_desc = strsignal(sig); #endif if (sig_desc == NULL) sig_desc = ""; msyslog(LOG_NOTICE, "%s exiting on signal %d (%s)", progname, sig, sig_desc); /* See Bug 2513 and Bug 2522 re the unlink of PIDFILE */ # ifdef HAVE_DNSREGISTRATION if (mdns != NULL) DNSServiceRefDeallocate(mdns); # endif peer_cleanup(); exit(0); } static RETSIGTYPE finish( int sig ) { signalled = 1; signo = sig; } #endif /* !SIM && SIGDIE1 */ #ifndef SIM /* * wait_child_sync_if - implements parent side of 
-w/--wait-sync */ # ifdef HAVE_WORKING_FORK static int wait_child_sync_if( int pipe_read_fd, long wait_sync ) { int rc; int exit_code; time_t wait_end_time; time_t cur_time; time_t wait_rem; fd_set readset; struct timeval wtimeout; if (0 == wait_sync) return 0; /* waitsync_fd_to_close used solely by child */ close(waitsync_fd_to_close); wait_end_time = time(NULL) + wait_sync; do { cur_time = time(NULL); wait_rem = (wait_end_time > cur_time) ? (wait_end_time - cur_time) : 0; wtimeout.tv_sec = wait_rem; wtimeout.tv_usec = 0; FD_ZERO(&readset); FD_SET(pipe_read_fd, &readset); rc = select(pipe_read_fd + 1, &readset, NULL, NULL, &wtimeout); if (-1 == rc) { if (EINTR == errno) continue; exit_code = (errno) ? errno : -1; msyslog(LOG_ERR, "--wait-sync select failed: %m"); return exit_code; } if (0 == rc) { /* * select() indicated a timeout, but in case * its timeouts are affected by a step of the * system clock, select() again with a zero * timeout to confirm. */ FD_ZERO(&readset); FD_SET(pipe_read_fd, &readset); wtimeout.tv_sec = 0; wtimeout.tv_usec = 0; rc = select(pipe_read_fd + 1, &readset, NULL, NULL, &wtimeout); if (0 == rc) /* select() timeout */ break; else /* readable */ return 0; } else /* readable */ return 0; } while (wait_rem > 0); fprintf(stderr, "%s: -w/--wait-sync %ld timed out.\n", progname, wait_sync); return ETIMEDOUT; } # endif /* HAVE_WORKING_FORK */ /* * assertion_failed - Redirect assertion failures to msyslog(). */ static void assertion_failed( const char *file, int line, isc_assertiontype_t type, const char *cond ) { isc_assertion_setcallback(NULL); /* Avoid recursion */ msyslog(LOG_ERR, "%s:%d: %s(%s) failed", file, line, isc_assertion_typetotext(type), cond); msyslog(LOG_ERR, "exiting (due to assertion failure)"); #if defined(DEBUG) && defined(SYS_WINNT) if (debug) DebugBreak(); #endif abort(); } /* * library_fatal_error - Handle fatal errors from our libraries. */ static void library_fatal_error( const char *file, int line, const char *format, va_list args ) { char errbuf[256]; isc_error_setfatal(NULL); /* Avoid recursion */ msyslog(LOG_ERR, "%s:%d: fatal error:", file, line); vsnprintf(errbuf, sizeof(errbuf), format, args); msyslog(LOG_ERR, "%s", errbuf); msyslog(LOG_ERR, "exiting (due to fatal error in library)"); #if defined(DEBUG) && defined(SYS_WINNT) if (debug) DebugBreak(); #endif abort(); } /* * library_unexpected_error - Handle non fatal errors from our libraries. */ # define MAX_UNEXPECTED_ERRORS 100 int unexpected_error_cnt = 0; static void library_unexpected_error( const char *file, int line, const char *format, va_list args ) { char errbuf[256]; if (unexpected_error_cnt >= MAX_UNEXPECTED_ERRORS) return; /* avoid clutter in log */ msyslog(LOG_ERR, "%s:%d: unexpected error:", file, line); vsnprintf(errbuf, sizeof(errbuf), format, args); msyslog(LOG_ERR, "%s", errbuf); if (++unexpected_error_cnt == MAX_UNEXPECTED_ERRORS) msyslog(LOG_ERR, "Too many errors. Shutting up."); } #endif /* !SIM */ #if !defined(SIM) && !defined(SYS_WINNT) # ifdef DEBUG /* * moredebug - increase debugging verbosity */ static RETSIGTYPE moredebug( int sig ) { int saved_errno = errno; if (debug < 255) { debug++; msyslog(LOG_DEBUG, "debug raised to %d", debug); } errno = saved_errno; } /* * lessdebug - decrease debugging verbosity */ static RETSIGTYPE lessdebug( int sig ) { int saved_errno = errno; if (debug > 0) { debug--; msyslog(LOG_DEBUG, "debug lowered to %d", debug); } errno = saved_errno; } # else /* !DEBUG follows */ /* * no_debug - We don't do the debug here. 
*/ static RETSIGTYPE no_debug( int sig ) { int saved_errno = errno; msyslog(LOG_DEBUG, "ntpd not compiled for debugging (signal %d)", sig); errno = saved_errno; } # endif /* !DEBUG */ #endif /* !SIM && !SYS_WINNT */ Index: projects/clang900-import/contrib/ntp =================================================================== --- projects/clang900-import/contrib/ntp (revision 352586) +++ projects/clang900-import/contrib/ntp (revision 352587) Property changes on: projects/clang900-import/contrib/ntp ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/contrib/ntp:r351317-352586 Index: projects/clang900-import/share/man/man7/ascii.7 =================================================================== --- projects/clang900-import/share/man/man7/ascii.7 (revision 352586) +++ projects/clang900-import/share/man/man7/ascii.7 (revision 352587) @@ -1,153 +1,160 @@ .\" Copyright (c) 1989, 1990, 1993 .\" The Regents of the University of California. All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 3. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" @(#)ascii.7 8.1 (Berkeley) 6/5/93 .\" $FreeBSD$ .\" -.Dd October 30, 2017 +.Dd September 21, 2019 .Dt ASCII 7 .Os .Sh NAME .Nm ascii .Nd octal, hexadecimal, decimal and binary .Tn ASCII character sets .Sh DESCRIPTION The .Nm octal set: .Bd -literal -offset left 000 NUL 001 SOH 002 STX 003 ETX 004 EOT 005 ENQ 006 ACK 007 BEL 010 BS 011 HT 012 LF 013 VT 014 FF 015 CR 016 SO 017 SI 020 DLE 021 DC1 022 DC2 023 DC3 024 DC4 025 NAK 026 SYN 027 ETB 030 CAN 031 EM 032 SUB 033 ESC 034 FS 035 GS 036 RS 037 US 040 SP 041 ! 042 " 043 # 044 $ 045 % 046 & 047 ' 050 ( 051 ) 052 * 053 + 054 , 055 - 056 . 057 / 060 0 061 1 062 2 063 3 064 4 065 5 066 6 067 7 070 8 071 9 072 : 073 ; 074 < 075 = 076 > 077 ? 
100 @ 101 A 102 B 103 C 104 D 105 E 106 F 107 G 110 H 111 I 112 J 113 K 114 L 115 M 116 N 117 O 120 P 121 Q 122 R 123 S 124 T 125 U 126 V 127 W 130 X 131 Y 132 Z 133 [ 134 \e\ 135 ] 136 ^ 137 _ 140 ` 141 a 142 b 143 c 144 d 145 e 146 f 147 g 150 h 151 i 152 j 153 k 154 l 155 m 156 n 157 o 160 p 161 q 162 r 163 s 164 t 165 u 166 v 167 w 170 x 171 y 172 z 173 { 174 | 175 } 176 ~ 177 DEL .Ed .Pp The .Nm hexadecimal set: .Bd -literal -offset left 00 NUL 01 SOH 02 STX 03 ETX 04 EOT 05 ENQ 06 ACK 07 BEL 08 BS 09 HT 0a LF 0b VT 0c FF 0d CR 0e SO 0f SI 10 DLE 11 DC1 12 DC2 13 DC3 14 DC4 15 NAK 16 SYN 17 ETB 18 CAN 19 EM 1a SUB 1b ESC 1c FS 1d GS 1e RS 1f US 20 SP 21 ! 22 " 23 # 24 $ 25 % 26 & 27 ' 28 ( 29 ) 2a * 2b + 2c , 2d - 2e . 2f / 30 0 31 1 32 2 33 3 34 4 35 5 36 6 37 7 38 8 39 9 3a : 3b ; 3c < 3d = 3e > 3f ? 40 @ 41 A 42 B 43 C 44 D 45 E 46 F 47 G 48 H 49 I 4a J 4b K 4c L 4d M 4e N 4f O 50 P 51 Q 52 R 53 S 54 T 55 U 56 V 57 W 58 X 59 Y 5a Z 5b [ 5c \e\ 5d ] 5e ^ 5f _ 60 \` 61 a 62 b 63 c 64 d 65 e 66 f 67 g 68 h 69 i 6a j 6b k 6c l 6d m 6e n 6f o 70 p 71 q 72 r 73 s 74 t 75 u 76 v 77 w 78 x 79 y 7a z 7b { 7c | 7d } 7e ~ 7f DEL .Ed .Pp The .Nm decimal set: .Bd -literal -offset left 0 NUL 1 SOH 2 STX 3 ETX 4 EOT 5 ENQ 6 ACK 7 BEL 8 BS 9 HT 10 LF 11 VT 12 FF 13 CR 14 SO 15 SI 16 DLE 17 DC1 18 DC2 19 DC3 20 DC4 21 NAK 22 SYN 23 ETB 24 CAN 25 EM 26 SUB 27 ESC 28 FS 29 GS 30 RS 31 US 32 SP 33 ! 34 " 35 # 36 $ 37 % 38 & 39 ' 40 ( 41 ) 42 * 43 + 44 , 45 - 46 . 47 / 48 0 49 1 50 2 51 3 52 4 53 5 54 6 55 7 56 8 57 9 58 : 59 ; 60 < 61 = 62 > 63 ? 64 @ 65 A 66 B 67 C 68 D 69 E 70 F 71 G 72 H 73 I 74 J 75 K 76 L 77 M 78 N 79 O 80 P 81 Q 82 R 83 S 84 T 85 U 86 V 87 W 88 X 89 Y 90 Z 91 [ 92 \e\ 93 ] 94 ^ 95 _ 96 ` 97 a 98 b 99 c 100 d 101 e 102 f 103 g 104 h 105 i 106 j 107 k 108 l 109 m 110 n 111 o 112 p 113 q 114 r 115 s 116 t 117 u 118 v 119 w 120 x 121 y 122 z 123 { 124 | 125 } 126 ~ 127 DEL .Ed .Pp The .Nm binary set: .Bd -literal -offset left 00 01 10 11 NUL SP @ ` 00000 SOH ! A a 00001 STX " B b 00010 ETX # C c 00011 EOT $ D d 00100 ENQ % E e 00101 ACK & F f 00110 BEL ' G g 00111 BS ( H h 01000 HT ) I i 01001 LF * J j 01010 VT + K k 01011 FF , L l 01100 CR - M m 01101 SO . N n 01110 SI / O o 01111 DLE 0 P p 10000 DC1 1 Q q 10001 DC2 2 R r 10010 DC3 3 S s 10011 DC4 4 T t 10100 NAK 5 U u 10101 SYN 6 V v 10110 ETB 7 W w 10111 CAN 8 X x 11000 EM 9 Y y 11001 SUB : Z z 11010 ESC ; [ { 11011 FS < \e\ | 11100 GS = ] } 11101 RS > ^ - 11110 US ? _ DEL 11111 .Ed .Sh FILES .Bl -tag -width /usr/share/misc/ascii -compact .It Pa /usr/share/misc/ascii .El +.Sh STANDARDS +.Rs +.%T Information Systems - Coded Character Sets - 7-Bit American National\ + Standard Code for Information Interchange (7-Bit ASCII) +.%R INCITS 4-1986[R2017] +.%Q InterNational Committee for Information Technology Standards +.Re .Sh HISTORY An .Nm manual page appeared in -.At v7 . +.At v2 . Index: projects/clang900-import/stand/forth/color.4th =================================================================== --- projects/clang900-import/stand/forth/color.4th (revision 352586) +++ projects/clang900-import/stand/forth/color.4th (revision 352587) @@ -1,49 +1,55 @@ \ Copyright (c) 2011-2013 Devin Teske \ All rights reserved. \ \ Redistribution and use in source and binary forms, with or without \ modification, are permitted provided that the following conditions \ are met: \ 1. Redistributions of source code must retain the above copyright \ notice, this list of conditions and the following disclaimer. \ 2. 
Redistributions in binary form must reproduce the above copyright \ notice, this list of conditions and the following disclaimer in the \ documentation and/or other materials provided with the distribution. \ \ THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND \ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE \ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE \ ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE \ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL \ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS \ OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) \ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT \ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY \ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF \ SUCH DAMAGE. \ \ $FreeBSD$ marker task-color.4th \ This function returns FALSE if the `loader_color' environment variable is set -\ to NO, no, or 0. Otherwise, TRUE is returned (unless booting serial). +\ to NO, no, or 0. It returns TRUE if `loader_color' is set to any other value. +\ If `loader_color' is unset, TRUE is returned (unless booting serial). \ -: loader_color? ( -- N ) +: loader_color? ( -- t ) s" loader_color" getenv dup -1 <> if - + \ `loader_color' is set. + \ Check if it is explicitly disabled. 2dup s" NO" compare-insensitive 0= if 2drop FALSE exit then 2dup s" 0" compare 0= if 2drop FALSE exit then drop + \ It is enabled. + TRUE + else + \ `loader_color' is unset. + \ Default to using color unless serial boot is active. + drop + boot_serial? 0= then - drop - - boot_serial? if FALSE else TRUE then ; Index: projects/clang900-import/stand/libsa/zalloc.c =================================================================== --- projects/clang900-import/stand/libsa/zalloc.c (revision 352586) +++ projects/clang900-import/stand/libsa/zalloc.c (revision 352587) @@ -1,338 +1,339 @@ /* * This module derived from code donated to the FreeBSD Project by * Matthew Dillon * * Copyright (c) 1998 The FreeBSD Project * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include /* * LIB/MEMORY/ZALLOC.C - self contained low-overhead memory pool/allocation * subsystem * * This subsystem implements memory pools and memory allocation * routines. * * Pools are managed via a linked list of 'free' areas. Allocating * memory creates holes in the freelist, freeing memory fills them. * Since the freelist consists only of free memory areas, it is possible * to allocate the entire pool without incurring any structural overhead. * * The system works best when allocating similarly-sized chunks of * memory. Care must be taken to avoid fragmentation when * allocating/deallocating dissimilar chunks. * * When a memory pool is first allocated, the entire pool is marked as * allocated. This is done mainly because we do not want to modify any * portion of a pool's data area until we are given permission. The * caller must explicitly deallocate portions of the pool to make them * available. * * z[n]xalloc() works like z[n]alloc() but the allocation is made from * within the specified address range. If the segment could not be * allocated, NULL is returned. WARNING! The address range will be * aligned to an 8 or 16 byte boundary depending on the cpu so if you * give an unaligned address range, unexpected results may occur. * * If a standard allocation fails, the reclaim function will be called * to recover some space. This usually causes other portions of the * same pool to be released. Memory allocations at this low level * should not block but you can do that too in your reclaim function * if you want. Reclaim does not function when z[n]xalloc() is used, * only for z[n]alloc(). * * Allocation and frees of 0 bytes are valid operations. */ #include "zalloc_defs.h" /* * Objects in the pool must be aligned to at least the size of struct MemNode. * They must also be aligned to MALLOCALIGN, which should normally be larger * than the struct, so assert that to be so at compile time. */ typedef char assert_align[(sizeof(struct MemNode) <= MALLOCALIGN) ? 1 : -1]; #define MEMNODE_SIZE_MASK MALLOCALIGN_MASK /* * znalloc() - allocate memory (without zeroing) from pool. Call reclaim * and retry if appropriate, return NULL if unable to allocate * memory. */ void * znalloc(MemPool *mp, uintptr_t bytes, size_t align) { MemNode **pmn; MemNode *mn; /* * align according to pool object size (can be 0). This is * inclusive of the MEMNODE_SIZE_MASK minimum alignment. * */ bytes = (bytes + MEMNODE_SIZE_MASK) & ~MEMNODE_SIZE_MASK; if (bytes == 0) return ((void *)-1); /* * locate freelist entry big enough to hold the object. If all objects * are the same size, this is a constant-time function. */ if (bytes > mp->mp_Size - mp->mp_Used) return (NULL); for (pmn = &mp->mp_First; (mn = *pmn) != NULL; pmn = &mn->mr_Next) { char *ptr = (char *)mn; uintptr_t dptr; char *aligned; size_t extra; dptr = (uintptr_t)(ptr + MALLOCALIGN); /* pointer to data */ aligned = (char *)(roundup2(dptr, align) - MALLOCALIGN); extra = aligned - ptr; if (bytes + extra > mn->mr_Bytes) continue; /* - * Cut extra from head and create new memory node from reminder. + * Cut extra from head and create new memory node from + * remainder. */ if (extra != 0) { MemNode *new; new = (MemNode *)aligned; new->mr_Next = mn->mr_Next; new->mr_Bytes = mn->mr_Bytes - extra; /* And update current memory node */ mn->mr_Bytes = extra; mn->mr_Next = new; /* In next iteration, we will get our aligned address */ continue; } /* * Cut a chunk of memory out of the beginning of this * block and fixup the link appropriately.
*/ if (mn->mr_Bytes == bytes) { *pmn = mn->mr_Next; } else { mn = (MemNode *)((char *)mn + bytes); mn->mr_Next = ((MemNode *)ptr)->mr_Next; mn->mr_Bytes = ((MemNode *)ptr)->mr_Bytes - bytes; *pmn = mn; } mp->mp_Used += bytes; return(ptr); } /* * Memory pool is full, return NULL. */ return (NULL); } /* * zfree() - free previously allocated memory */ void zfree(MemPool *mp, void *ptr, uintptr_t bytes) { MemNode **pmn; MemNode *mn; /* * align according to pool object size (can be 0). This is * inclusive of the MEMNODE_SIZE_MASK minimum alignment. */ bytes = (bytes + MEMNODE_SIZE_MASK) & ~MEMNODE_SIZE_MASK; if (bytes == 0) return; /* * panic if illegal pointer */ if ((char *)ptr < (char *)mp->mp_Base || (char *)ptr + bytes > (char *)mp->mp_End || ((uintptr_t)ptr & MEMNODE_SIZE_MASK) != 0) panic("zfree(%p,%ju): wild pointer", ptr, (uintmax_t)bytes); /* * free the segment */ mp->mp_Used -= bytes; for (pmn = &mp->mp_First; (mn = *pmn) != NULL; pmn = &mn->mr_Next) { /* * If area between last node and current node * - check range * - check merge with next area * - check merge with previous area */ if ((char *)ptr <= (char *)mn) { /* * range check */ if ((char *)ptr + bytes > (char *)mn) { panic("zfree(%p,%ju): corrupt memlist1", ptr, (uintmax_t)bytes); } /* * merge against next area or create independent area */ if ((char *)ptr + bytes == (char *)mn) { ((MemNode *)ptr)->mr_Next = mn->mr_Next; ((MemNode *)ptr)->mr_Bytes = bytes + mn->mr_Bytes; } else { ((MemNode *)ptr)->mr_Next = mn; ((MemNode *)ptr)->mr_Bytes = bytes; } *pmn = mn = (MemNode *)ptr; /* * merge against previous area (if there is a previous * area). */ if (pmn != &mp->mp_First) { if ((char *)pmn + ((MemNode*)pmn)->mr_Bytes == (char *)ptr) { ((MemNode *)pmn)->mr_Next = mn->mr_Next; ((MemNode *)pmn)->mr_Bytes += mn->mr_Bytes; mn = (MemNode *)pmn; } } return; } if ((char *)ptr < (char *)mn + mn->mr_Bytes) { panic("zfree(%p,%ju): corrupt memlist2", ptr, (uintmax_t)bytes); } } /* * We are beyond the last MemNode, append new MemNode. Merge against * previous area if possible. */ if (pmn == &mp->mp_First || (char *)pmn + ((MemNode *)pmn)->mr_Bytes != (char *)ptr) { ((MemNode *)ptr)->mr_Next = NULL; ((MemNode *)ptr)->mr_Bytes = bytes; *pmn = (MemNode *)ptr; mn = (MemNode *)ptr; } else { ((MemNode *)pmn)->mr_Bytes += bytes; mn = (MemNode *)pmn; } } /* * zextendPool() - extend memory pool to cover additional space. * * Note: the added memory starts out as allocated, you * must free it to make it available to the memory subsystem. * * Note: mp_Size may not reflect (mp_End - mp_Base) range * due to other parts of the system doing their own sbrk() * calls.
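 *
 * Illustrative usage sketch (not part of this file; `pool', `base',
 * `size' and `p' are placeholder names): a caller typically seeds a
 * pool by extending it over a backing region and then freeing that
 * region so it becomes allocatable, roughly:
 *
 *	static MemPool pool;
 *
 *	zextendPool(&pool, base, size);	(added space starts out allocated)
 *	zfree(&pool, base, size);	(now available for allocation)
 *	p = znalloc(&pool, 128, 16);	(16-byte aligned, NULL if exhausted)
 *	...
 *	zfree(&pool, p, 128);		(caller tracks the size it freed)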
*/ void zextendPool(MemPool *mp, void *base, uintptr_t bytes) { if (mp->mp_Size == 0) { mp->mp_Base = base; mp->mp_Used = bytes; mp->mp_End = (char *)base + bytes; mp->mp_Size = bytes; } else { void *pend = (char *)mp->mp_Base + mp->mp_Size; if (base < mp->mp_Base) { mp->mp_Size += (char *)mp->mp_Base - (char *)base; mp->mp_Used += (char *)mp->mp_Base - (char *)base; mp->mp_Base = base; } base = (char *)base + bytes; if (base > pend) { mp->mp_Size += (char *)base - (char *)pend; mp->mp_Used += (char *)base - (char *)pend; mp->mp_End = (char *)base; } } } #ifdef ZALLOCDEBUG void zallocstats(MemPool *mp) { int abytes = 0; int hbytes = 0; int fcount = 0; MemNode *mn; printf("%d bytes reserved", (int)mp->mp_Size); mn = mp->mp_First; if ((void *)mn != (void *)mp->mp_Base) { abytes += (char *)mn - (char *)mp->mp_Base; } while (mn != NULL) { if ((char *)mn + mn->mr_Bytes != mp->mp_End) { hbytes += mn->mr_Bytes; ++fcount; } if (mn->mr_Next != NULL) { abytes += (char *)mn->mr_Next - ((char *)mn + mn->mr_Bytes); } mn = mn->mr_Next; } printf(" %d bytes allocated\n%d fragments (%d bytes fragmented)\n", abytes, fcount, hbytes); } #endif Index: projects/clang900-import/stand/lua/color.lua =================================================================== --- projects/clang900-import/stand/lua/color.lua (revision 352586) +++ projects/clang900-import/stand/lua/color.lua (revision 352587) @@ -1,118 +1,116 @@ -- -- SPDX-License-Identifier: BSD-2-Clause-FreeBSD -- -- Copyright (c) 2015 Pedro Souza -- All rights reserved. -- -- Redistribution and use in source and binary forms, with or without -- modification, are permitted provided that the following conditions -- are met: -- 1. Redistributions of source code must retain the above copyright -- notice, this list of conditions and the following disclaimer. -- 2. Redistributions in binary form must reproduce the above copyright -- notice, this list of conditions and the following disclaimer in the -- documentation and/or other materials provided with the distribution. -- -- THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND -- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -- ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE -- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -- OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -- LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -- OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -- SUCH DAMAGE. -- -- $FreeBSD$ -- local core = require("core") local color = {} -- Module exports color.BLACK = 0 color.RED = 1 color.GREEN = 2 color.YELLOW = 3 color.BLUE = 4 color.MAGENTA = 5 color.CYAN = 6 color.WHITE = 7 color.DEFAULT = 0 color.BRIGHT = 1 color.DIM = 2 function color.isEnabled() local c = loader.getenv("loader_color") if c ~= nil then - if c:lower() == "no" or c == "0" then - return false - end + return c:lower() ~= "no" and c ~= "0" end return not core.isSerialBoot() end color.disabled = not color.isEnabled() function color.escapefg(color_value) if color.disabled then return color_value end return core.KEYSTR_CSI .. "3" .. color_value .. 
"m" end function color.resetfg() if color.disabled then return '' end return color.escapefg(color.WHITE) end function color.escapebg(color_value) if color.disabled then return color_value end return core.KEYSTR_CSI .. "4" .. color_value .. "m" end function color.resetbg() if color.disabled then return '' end return color.escapebg(color.BLACK) end function color.escape(fg_color, bg_color, attribute) if color.disabled then return "" end if attribute == nil then attribute = "" else attribute = attribute .. ";" end return core.KEYSTR_CSI .. attribute .. "3" .. fg_color .. ";4" .. bg_color .. "m" end function color.default() if color.disabled then return "" end return color.escape(color.WHITE, color.BLACK, color.DEFAULT) end function color.highlight(str) if color.disabled then return str end -- We need to reset attributes as well as color scheme here, just in -- case the terminal defaults don't match what we're expecting. return core.KEYSTR_CSI .. "1m" .. str .. core.KEYSTR_CSI .. "22m" end return color Index: projects/clang900-import/stand/powerpc/uboot/Makefile =================================================================== --- projects/clang900-import/stand/powerpc/uboot/Makefile (revision 352586) +++ projects/clang900-import/stand/powerpc/uboot/Makefile (revision 352587) @@ -1,34 +1,35 @@ # $FreeBSD$ LOADER_UFS_SUPPORT?= yes LOADER_CD9660_SUPPORT?= no LOADER_EXT2FS_SUPPORT?= no LOADER_NET_SUPPORT?= yes LOADER_NFS_SUPPORT?= yes LOADER_TFTP_SUPPORT?= no LOADER_GZIP_SUPPORT?= no LOADER_BZIP2_SUPPORT?= no .include BINDIR= /boot/uboot PROG= ubldr +STRIP= NEWVERSWHAT= "U-Boot loader" ${MACHINE_ARCH} INSTALLFLAGS= -b # Architecture-specific loader code SRCS= start.S conf.c vers.c ppc64_elf_freebsd.c SRCS+= ucmpdi2.c # Always add MI sources .include "${BOOTSRC}/loader.mk" .PATH: ${SYSDIR}/libkern LDFLAGS= -nostdlib -static -T ${.CURDIR}/ldscript.powerpc .include "${BOOTSRC}/uboot.mk" DPADD= ${LDR_INTERP} ${LIBUBOOT} ${LIBFDT} ${LIBUBOOT_FDT} ${LIBSA} LDADD= ${LDR_INTERP} ${LIBUBOOT} ${LIBFDT} ${LIBUBOOT_FDT} ${LIBSA} .include Index: projects/clang900-import/sys/amd64/amd64/pmap.c =================================================================== --- projects/clang900-import/sys/amd64/amd64/pmap.c (revision 352586) +++ projects/clang900-import/sys/amd64/amd64/pmap.c (revision 352587) @@ -1,10306 +1,10306 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * Copyright (c) 2014-2019 The FreeBSD Foundation * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Portions of this software were developed by * Konstantin Belousov under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #define AMD64_NPT_AWARE #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. 
* * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_ddb.h" #include "opt_pmap.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include static __inline boolean_t pmap_type_guest(pmap_t pmap) { return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI)); } static __inline boolean_t pmap_emulate_ad_bits(pmap_t pmap) { return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0); } static __inline pt_entry_t pmap_valid_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_V; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_EMUL_V; else mask = EPT_PG_READ; break; default: panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_rw_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_RW; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_EMUL_RW; else mask = EPT_PG_WRITE; break; default: panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static pt_entry_t pg_g; static __inline pt_entry_t pmap_global_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: mask = pg_g; break; case PT_RVI: case PT_EPT: mask = 0; break; default: panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_accessed_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_A; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_READ; else mask = EPT_PG_A; break; default: panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_modified_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_M; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_WRITE; else mask = EPT_PG_M; break; default: panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_pku_mask_bit(pmap_t pmap) { return (pmap->pm_type == PT_X86 ? 
X86_PG_PKU_MASK : 0); } #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pa_index(pa) ((pa) >> PDRSHIFT) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) #define NPV_LIST_LOCKS MAXCPU #define PHYS_TO_PV_LIST_LOCK(pa) \ (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ struct rwlock **_lockp = (lockp); \ struct rwlock *_new_lock; \ \ _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ if (_new_lock != *_lockp) { \ if (*_lockp != NULL) \ rw_wunlock(*_lockp); \ *_lockp = _new_lock; \ rw_wlock(*_lockp); \ } \ } while (0) #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) #define RELEASE_PV_LIST_LOCK(lockp) do { \ struct rwlock **_lockp = (lockp); \ \ if (*_lockp != NULL) { \ rw_wunlock(*_lockp); \ *_lockp = NULL; \ } \ } while (0) #define VM_PAGE_TO_PV_LIST_LOCK(m) \ PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) struct pmap kernel_pmap_store; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ int nkpt; SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0, "Number of kernel page table pages allocated on bootup"); static int ndmpdp; vm_paddr_t dmaplimit; vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; pt_entry_t pg_nx; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int pg_ps_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pg_ps_enabled, 0, "Are large page mappings enabled?"); #define PAT_INDEX_SIZE 8 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ static u_int64_t KPTphys; /* phys addr of kernel level 1 */ static u_int64_t KPDphys; /* phys addr of kernel level 2 */ u_int64_t KPDPphys; /* phys addr of kernel level 3 */ u_int64_t KPML4phys; /* phys addr of kernel level 4 */ static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ static int ndmpdpphys; /* number of DMPDPphys pages */ static vm_paddr_t KERNend; /* phys addr of end of bootstrap data */ /* * pmap_mapdev support pre initialization (i.e. console) */ #define PMAP_PREINIT_MAPPING_COUNT 8 static struct pmap_preinit_mapping { vm_paddr_t pa; vm_offset_t va; vm_size_t sz; int mode; } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; static int pmap_initialized; /* * Data for the pv entry allocation mechanism. * Updates to pv_invl_gen are protected by the pv_list_locks[] * elements, but reads are not. 
*/ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static struct mtx __exclusive_cache_line pv_chunks_mutex; static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS]; static u_long pv_invl_gen[NPV_LIST_LOCKS]; static struct md_page *pv_table; static struct md_page pv_dummy; /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1 = NULL; caddr_t CADDR1 = 0; static vm_offset_t qframe = 0; static struct mtx qframe_mtx; static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */ static vmem_t *large_vmem; static u_int lm_ents; #define PMAP_ADDRESS_IN_LARGEMAP(va) ((va) >= LARGEMAP_MIN_ADDRESS && \ (va) < LARGEMAP_MIN_ADDRESS + NBPML4 * (u_long)lm_ents) int pmap_pcid_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?"); int invpcid_works = 0; SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0, "Is the invpcid instruction available ?"); int __read_frequently pti = 0; SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pti, 0, "Page Table Isolation enabled"); static vm_object_t pti_obj; static pml4_entry_t *pti_pml4; static vm_pindex_t pti_pg_idx; static bool pti_finalized; struct pmap_pkru_range { struct rs_el pkru_rs_el; u_int pkru_keyidx; int pkru_flags; }; static uma_zone_t pmap_pkru_ranges_zone; static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va); static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); static void *pkru_dup_range(void *ctx, void *data); static void pkru_free_range(void *ctx, void *node); static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap); static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); static void pmap_pkru_deassign_all(pmap_t pmap); static int pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS) { int i; uint64_t res; res = 0; CPU_FOREACH(i) { res += cpuid_to_pcpu[i]->pc_pm_save_cnt; } return (sysctl_handle_64(oidp, &res, 0, req)); } SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU", "Count of saved TLB context on switch"); static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker = LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker); static struct mtx invl_gen_mtx; /* Fake lock object to satisfy turnstiles interface. */ static struct lock_object invl_gen_ts = { .lo_name = "invlts", }; static struct pmap_invl_gen pmap_invl_gen_head = { .gen = 1, .next = NULL, }; static u_long pmap_invl_gen = 1; static int pmap_invl_waiters; static struct callout pmap_invl_callout; static bool pmap_invl_callout_inited; #define PMAP_ASSERT_NOT_IN_DI() \ KASSERT(pmap_not_in_di(), ("DI already started")) static bool pmap_di_locked(void) { int tun; if ((cpu_feature2 & CPUID2_CX16) == 0) return (true); tun = 0; TUNABLE_INT_FETCH("vm.pmap.di_locked", &tun); return (tun != 0); } static int sysctl_pmap_di_locked(SYSCTL_HANDLER_ARGS) { int locked; locked = pmap_di_locked(); return (sysctl_handle_int(oidp, &locked, 0, req)); } SYSCTL_PROC(_vm_pmap, OID_AUTO, di_locked, CTLTYPE_INT | CTLFLAG_RDTUN | CTLFLAG_MPSAFE, 0, 0, sysctl_pmap_di_locked, "", "Locked delayed invalidation"); static bool pmap_not_in_di_l(void); static bool pmap_not_in_di_u(void); DEFINE_IFUNC(, bool, pmap_not_in_di, (void)) { return (pmap_di_locked() ? 
pmap_not_in_di_l : pmap_not_in_di_u); } static bool pmap_not_in_di_l(void) { struct pmap_invl_gen *invl_gen; invl_gen = &curthread->td_md.md_invl_gen; return (invl_gen->gen == 0); } static void pmap_thread_init_invl_gen_l(struct thread *td) { struct pmap_invl_gen *invl_gen; invl_gen = &td->td_md.md_invl_gen; invl_gen->gen = 0; } static void pmap_delayed_invl_wait_block(u_long *m_gen, u_long *invl_gen) { struct turnstile *ts; ts = turnstile_trywait(&invl_gen_ts); if (*m_gen > atomic_load_long(invl_gen)) turnstile_wait(ts, NULL, TS_SHARED_QUEUE); else turnstile_cancel(ts); } static void pmap_delayed_invl_finish_unblock(u_long new_gen) { struct turnstile *ts; turnstile_chain_lock(&invl_gen_ts); ts = turnstile_lookup(&invl_gen_ts); if (new_gen != 0) pmap_invl_gen = new_gen; if (ts != NULL) { turnstile_broadcast(ts, TS_SHARED_QUEUE); turnstile_unpend(ts); } turnstile_chain_unlock(&invl_gen_ts); } /* * Start a new Delayed Invalidation (DI) block of code, executed by * the current thread. Within a DI block, the current thread may * destroy both the page table and PV list entries for a mapping and * then release the corresponding PV list lock before ensuring that * the mapping is flushed from the TLBs of any processors with the * pmap active. */ static void pmap_delayed_invl_start_l(void) { struct pmap_invl_gen *invl_gen; u_long currgen; invl_gen = &curthread->td_md.md_invl_gen; PMAP_ASSERT_NOT_IN_DI(); mtx_lock(&invl_gen_mtx); if (LIST_EMPTY(&pmap_invl_gen_tracker)) currgen = pmap_invl_gen; else currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen; invl_gen->gen = currgen + 1; LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link); mtx_unlock(&invl_gen_mtx); } /* * Finish the DI block, previously started by the current thread. All * required TLB flushes for the pages marked by * pmap_delayed_invl_page() must be finished before this function is * called. * * This function works by bumping the global DI generation number to * the generation number of the current thread's DI, unless there is a * pending DI that started earlier. In the latter case, bumping the * global DI generation number would incorrectly signal that the * earlier DI had finished. Instead, this function bumps the earlier * DI's generation number to match the generation number of the * current thread's DI. 
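 *
 * A worked example (the numbers are illustrative only): suppose the
 * global DI generation is 4, thread A starts a DI block and is
 * assigned generation 5, then thread B starts one and is assigned
 * generation 6.  If B finishes first it must not publish 6 as the
 * global generation, since that would claim A's still-pending DI had
 * also completed; instead B bumps A's generation to 6, and the global
 * number advances to 6 only once A finishes.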
*/ static void pmap_delayed_invl_finish_l(void) { struct pmap_invl_gen *invl_gen, *next; invl_gen = &curthread->td_md.md_invl_gen; KASSERT(invl_gen->gen != 0, ("missed invl_start")); mtx_lock(&invl_gen_mtx); next = LIST_NEXT(invl_gen, link); if (next == NULL) pmap_delayed_invl_finish_unblock(invl_gen->gen); else next->gen = invl_gen->gen; LIST_REMOVE(invl_gen, link); mtx_unlock(&invl_gen_mtx); invl_gen->gen = 0; } static bool pmap_not_in_di_u(void) { struct pmap_invl_gen *invl_gen; invl_gen = &curthread->td_md.md_invl_gen; return (((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) != 0); } static void pmap_thread_init_invl_gen_u(struct thread *td) { struct pmap_invl_gen *invl_gen; invl_gen = &td->td_md.md_invl_gen; invl_gen->gen = 0; invl_gen->next = (void *)PMAP_INVL_GEN_NEXT_INVALID; } static bool pmap_di_load_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *out) { uint64_t new_high, new_low, old_high, old_low; char res; old_low = new_low = 0; old_high = new_high = (uintptr_t)0; __asm volatile("lock;cmpxchg16b\t%1;sete\t%0" : "=r" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high) : "b"(new_low), "c" (new_high) : "memory", "cc"); if (res == 0) { if ((old_high & PMAP_INVL_GEN_NEXT_INVALID) != 0) return (false); out->gen = old_low; out->next = (void *)old_high; } else { out->gen = new_low; out->next = (void *)new_high; } return (true); } static bool pmap_di_store_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *old_val, struct pmap_invl_gen *new_val) { uint64_t new_high, new_low, old_high, old_low; char res; new_low = new_val->gen; new_high = (uintptr_t)new_val->next; old_low = old_val->gen; old_high = (uintptr_t)old_val->next; __asm volatile("lock;cmpxchg16b\t%1;sete\t%0" : "=r" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high) : "b"(new_low), "c" (new_high) : "memory", "cc"); return (res); } #ifdef PV_STATS static long invl_start_restart; SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_start_restart, CTLFLAG_RD, &invl_start_restart, 0, ""); static long invl_finish_restart; SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_finish_restart, CTLFLAG_RD, &invl_finish_restart, 0, ""); static int invl_max_qlen; SYSCTL_INT(_vm_pmap, OID_AUTO, invl_max_qlen, CTLFLAG_RD, &invl_max_qlen, 0, ""); #endif static struct lock_delay_config __read_frequently di_delay; LOCK_DELAY_SYSINIT_DEFAULT(di_delay); static void pmap_delayed_invl_start_u(void) { struct pmap_invl_gen *invl_gen, *p, prev, new_prev; struct thread *td; struct lock_delay_arg lda; uintptr_t prevl; u_char pri; #ifdef PV_STATS int i, ii; #endif td = curthread; invl_gen = &td->td_md.md_invl_gen; PMAP_ASSERT_NOT_IN_DI(); lock_delay_arg_init(&lda, &di_delay); invl_gen->saved_pri = 0; pri = td->td_base_pri; if (pri > PVM) { thread_lock(td); pri = td->td_base_pri; if (pri > PVM) { invl_gen->saved_pri = pri; sched_prio(td, PVM); } thread_unlock(td); } again: PV_STAT(i = 0); for (p = &pmap_invl_gen_head;; p = prev.next) { PV_STAT(i++); prevl = atomic_load_ptr(&p->next); if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) { PV_STAT(atomic_add_long(&invl_start_restart, 1)); lock_delay(&lda); goto again; } if (prevl == 0) break; prev.next = (void *)prevl; } #ifdef PV_STATS if ((ii = invl_max_qlen) < i) atomic_cmpset_int(&invl_max_qlen, ii, i); #endif if (!pmap_di_load_invl(p, &prev) || prev.next != NULL) { PV_STAT(atomic_add_long(&invl_start_restart, 1)); lock_delay(&lda); goto again; } new_prev.gen = prev.gen; new_prev.next = invl_gen; invl_gen->gen = prev.gen + 1; /* Formal fence between store to invl->gen and updating *p. 
*/ atomic_thread_fence_rel(); /* * After inserting an invl_gen element with invalid bit set, * this thread blocks any other thread trying to enter the * delayed invalidation block. Do not allow to remove us from * the CPU, because it causes starvation for other threads. */ critical_enter(); /* * ABA for *p is not possible there, since p->gen can only * increase. So if the *p thread finished its di, then * started a new one and got inserted into the list at the * same place, its gen will appear greater than the previously * read gen. */ if (!pmap_di_store_invl(p, &prev, &new_prev)) { critical_exit(); PV_STAT(atomic_add_long(&invl_start_restart, 1)); lock_delay(&lda); goto again; } /* * There we clear PMAP_INVL_GEN_NEXT_INVALID in * invl_gen->next, allowing other threads to iterate past us. * pmap_di_store_invl() provides fence between the generation * write and the update of next. */ invl_gen->next = NULL; critical_exit(); } static bool pmap_delayed_invl_finish_u_crit(struct pmap_invl_gen *invl_gen, struct pmap_invl_gen *p) { struct pmap_invl_gen prev, new_prev; u_long mygen; /* * Load invl_gen->gen after setting invl_gen->next * PMAP_INVL_GEN_NEXT_INVALID. This prevents larger * generations to propagate to our invl_gen->gen. Lock prefix * in atomic_set_ptr() worked as seq_cst fence. */ mygen = atomic_load_long(&invl_gen->gen); if (!pmap_di_load_invl(p, &prev) || prev.next != invl_gen) return (false); KASSERT(prev.gen < mygen, ("invalid di gen sequence %lu %lu", prev.gen, mygen)); new_prev.gen = mygen; new_prev.next = (void *)((uintptr_t)invl_gen->next & ~PMAP_INVL_GEN_NEXT_INVALID); /* Formal fence between load of prev and storing update to it. */ atomic_thread_fence_rel(); return (pmap_di_store_invl(p, &prev, &new_prev)); } static void pmap_delayed_invl_finish_u(void) { struct pmap_invl_gen *invl_gen, *p; struct thread *td; struct lock_delay_arg lda; uintptr_t prevl; td = curthread; invl_gen = &td->td_md.md_invl_gen; KASSERT(invl_gen->gen != 0, ("missed invl_start: gen 0")); KASSERT(((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) == 0, ("missed invl_start: INVALID")); lock_delay_arg_init(&lda, &di_delay); again: for (p = &pmap_invl_gen_head; p != NULL; p = (void *)prevl) { prevl = atomic_load_ptr(&p->next); if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) { PV_STAT(atomic_add_long(&invl_finish_restart, 1)); lock_delay(&lda); goto again; } if ((void *)prevl == invl_gen) break; } /* * It is legitimate to not find ourself on the list if a * thread before us finished its DI and started it again. */ if (__predict_false(p == NULL)) { PV_STAT(atomic_add_long(&invl_finish_restart, 1)); lock_delay(&lda); goto again; } critical_enter(); atomic_set_ptr((uintptr_t *)&invl_gen->next, PMAP_INVL_GEN_NEXT_INVALID); if (!pmap_delayed_invl_finish_u_crit(invl_gen, p)) { atomic_clear_ptr((uintptr_t *)&invl_gen->next, PMAP_INVL_GEN_NEXT_INVALID); critical_exit(); PV_STAT(atomic_add_long(&invl_finish_restart, 1)); lock_delay(&lda); goto again; } critical_exit(); if (atomic_load_int(&pmap_invl_waiters) > 0) pmap_delayed_invl_finish_unblock(0); if (invl_gen->saved_pri != 0) { thread_lock(td); sched_prio(td, invl_gen->saved_pri); thread_unlock(td); } } #ifdef DDB DB_SHOW_COMMAND(di_queue, pmap_di_queue) { struct pmap_invl_gen *p, *pn; struct thread *td; uintptr_t nextl; bool first; for (p = &pmap_invl_gen_head, first = true; p != NULL; p = pn, first = false) { nextl = atomic_load_ptr(&p->next); pn = (void *)(nextl & ~PMAP_INVL_GEN_NEXT_INVALID); td = first ? 
NULL : __containerof(p, struct thread, td_md.md_invl_gen); db_printf("gen %lu inv %d td %p tid %d\n", p->gen, (nextl & PMAP_INVL_GEN_NEXT_INVALID) != 0, td, td != NULL ? td->td_tid : -1); } } #endif #ifdef PV_STATS static long invl_wait; SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0, "Number of times DI invalidation blocked pmap_remove_all/write"); static long invl_wait_slow; SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait_slow, CTLFLAG_RD, &invl_wait_slow, 0, "Number of slow invalidation waits for lockless DI"); #endif static u_long * pmap_delayed_invl_genp(vm_page_t m) { return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]); } static void pmap_delayed_invl_callout_func(void *arg __unused) { if (atomic_load_int(&pmap_invl_waiters) == 0) return; pmap_delayed_invl_finish_unblock(0); } static void pmap_delayed_invl_callout_init(void *arg __unused) { if (pmap_di_locked()) return; callout_init(&pmap_invl_callout, 1); pmap_invl_callout_inited = true; } SYSINIT(pmap_di_callout, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_delayed_invl_callout_init, NULL); /* * Ensure that all currently executing DI blocks, that need to flush * TLB for the given page m, actually flushed the TLB at the time the * function returned. If the page m has an empty PV list and we call * pmap_delayed_invl_wait(), upon its return we know that no CPU has a * valid mapping for the page m in either its page table or TLB. * * This function works by blocking until the global DI generation * number catches up with the generation number associated with the * given page m and its PV list. Since this function's callers * typically own an object lock and sometimes own a page lock, it * cannot sleep. Instead, it blocks on a turnstile to relinquish the * processor. */ static void pmap_delayed_invl_wait_l(vm_page_t m) { u_long *m_gen; #ifdef PV_STATS bool accounted = false; #endif m_gen = pmap_delayed_invl_genp(m); while (*m_gen > pmap_invl_gen) { #ifdef PV_STATS if (!accounted) { atomic_add_long(&invl_wait, 1); accounted = true; } #endif pmap_delayed_invl_wait_block(m_gen, &pmap_invl_gen); } } static void pmap_delayed_invl_wait_u(vm_page_t m) { u_long *m_gen; struct lock_delay_arg lda; bool fast; fast = true; m_gen = pmap_delayed_invl_genp(m); lock_delay_arg_init(&lda, &di_delay); while (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) { if (fast || !pmap_invl_callout_inited) { PV_STAT(atomic_add_long(&invl_wait, 1)); lock_delay(&lda); fast = false; } else { /* * The page's invalidation generation number * is still below the current thread's number. * Prepare to block so that we do not waste * CPU cycles or worse, suffer livelock. * * Since it is impossible to block without * racing with pmap_delayed_invl_finish_u(), * prepare for the race by incrementing * pmap_invl_waiters and arming a 1-tick * callout which will unblock us if we lose * the race. */ atomic_add_int(&pmap_invl_waiters, 1); /* * Re-check the current thread's invalidation * generation after incrementing * pmap_invl_waiters, so that there is no race * with pmap_delayed_invl_finish_u() setting * the page generation and checking * pmap_invl_waiters. The only race allowed * is for a missed unblock, which is handled * by the callout. 
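 *
 * The resulting slow-path order is: bump pmap_invl_waiters,
 * re-check the generation, arm the one-tick callout, and only
 * then block on the turnstile; either the finishing DI thread
 * or the callout performs the wakeup.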
*/ if (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) { callout_reset(&pmap_invl_callout, 1, pmap_delayed_invl_callout_func, NULL); PV_STAT(atomic_add_long(&invl_wait_slow, 1)); pmap_delayed_invl_wait_block(m_gen, &pmap_invl_gen_head.gen); } atomic_add_int(&pmap_invl_waiters, -1); } } } DEFINE_IFUNC(, void, pmap_thread_init_invl_gen, (struct thread *)) { return (pmap_di_locked() ? pmap_thread_init_invl_gen_l : pmap_thread_init_invl_gen_u); } DEFINE_IFUNC(static, void, pmap_delayed_invl_start, (void)) { return (pmap_di_locked() ? pmap_delayed_invl_start_l : pmap_delayed_invl_start_u); } DEFINE_IFUNC(static, void, pmap_delayed_invl_finish, (void)) { return (pmap_di_locked() ? pmap_delayed_invl_finish_l : pmap_delayed_invl_finish_u); } DEFINE_IFUNC(static, void, pmap_delayed_invl_wait, (vm_page_t)) { return (pmap_di_locked() ? pmap_delayed_invl_wait_l : pmap_delayed_invl_wait_u); } /* * Mark the page m's PV list as participating in the current thread's * DI block. Any threads concurrently using m's PV list to remove or * restrict all mappings to m will wait for the current thread's DI * block to complete before proceeding. * * The function works by setting the DI generation number for m's PV * list to at least the DI generation number of the current thread. * This forces a caller of pmap_delayed_invl_wait() to block until * current thread calls pmap_delayed_invl_finish(). */ static void pmap_delayed_invl_page(vm_page_t m) { u_long gen, *m_gen; rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED); gen = curthread->td_md.md_invl_gen.gen; if (gen == 0) return; m_gen = pmap_delayed_invl_genp(m); if (*m_gen < gen) *m_gen = gen; } /* * Crashdump maps. */ static caddr_t crashdumpmap; /* * Internal flags for pmap_enter()'s helper functions. */ #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ /* * Internal flags for pmap_mapdev_internal() and * pmap_change_attr_locked(). */ #define MAPDEV_FLUSHCACHE 0x0000001 /* Flush cache after mapping. */ #define MAPDEV_SETATTR 0x0000002 /* Modify existing attrs. 
*/ static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); static int popcnt_pc_map_pq(uint64_t *map); static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp); static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp); static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags, struct rwlock **lockp); #if VM_NRESERVLEVEL > 0 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp); #endif static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, int flags); static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp); static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va); static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp); static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, vm_page_t m, struct rwlock **lockp); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted); static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva); static void pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva); static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde); static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); static vm_page_t pmap_large_map_getptp_unlocked(void); static vm_paddr_t pmap_large_map_kextract(vm_offset_t va); static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask); #if VM_NRESERVLEVEL > 0 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp); #endif static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot); static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask); static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec); static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va); static pd_entry_t *pmap_pti_pde(vm_offset_t va); static void pmap_pti_wire_pte(void *pte); static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free, struct rwlock **lockp); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, struct spglist *free); static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pd_entry_t *pde, struct spglist *free, struct rwlock **lockp); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp); static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde); 
static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde); static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp); static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); /********************/ /* Inline functions */ /********************/ /* Return a non-clipped PD index for a given VA */ static __inline vm_pindex_t pmap_pde_pindex(vm_offset_t va) { return (va >> PDRSHIFT); } /* Return a pointer to the PML4 slot that corresponds to a VA */ static __inline pml4_entry_t * pmap_pml4e(pmap_t pmap, vm_offset_t va) { return (&pmap->pm_pml4[pmap_pml4e_index(va)]); } /* Return a pointer to the PDP slot that corresponds to a VA */ static __inline pdp_entry_t * pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va) { pdp_entry_t *pdpe; pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME); return (&pdpe[pmap_pdpe_index(va)]); } /* Return a pointer to the PDP slot that corresponds to a VA */ static __inline pdp_entry_t * pmap_pdpe(pmap_t pmap, vm_offset_t va) { pml4_entry_t *pml4e; pt_entry_t PG_V; PG_V = pmap_valid_bit(pmap); pml4e = pmap_pml4e(pmap, va); if ((*pml4e & PG_V) == 0) return (NULL); return (pmap_pml4e_to_pdpe(pml4e, va)); } /* Return a pointer to the PD slot that corresponds to a VA */ static __inline pd_entry_t * pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va) { pd_entry_t *pde; pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME); return (&pde[pmap_pde_index(va)]); } /* Return a pointer to the PD slot that corresponds to a VA */ static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va) { pdp_entry_t *pdpe; pt_entry_t PG_V; PG_V = pmap_valid_bit(pmap); pdpe = pmap_pdpe(pmap, va); if (pdpe == NULL || (*pdpe & PG_V) == 0) return (NULL); return (pmap_pdpe_to_pde(pdpe, va)); } /* Return a pointer to the PT slot that corresponds to a VA */ static __inline pt_entry_t * pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) { pt_entry_t *pte; pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); return (&pte[pmap_pte_index(va)]); } /* Return a pointer to the PT slot that corresponds to a VA */ static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t *pde; pt_entry_t PG_V; PG_V = pmap_valid_bit(pmap); pde = pmap_pde(pmap, va); if (pde == NULL || (*pde & PG_V) == 0) return (NULL); if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */ return ((pt_entry_t *)pde); return (pmap_pde_to_pte(pde, va)); } static __inline void pmap_resident_count_inc(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); pmap->pm_stats.resident_count += count; } static __inline void pmap_resident_count_dec(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(pmap->pm_stats.resident_count >= count, ("pmap %p resident count underflow %ld %d", pmap, pmap->pm_stats.resident_count, count)); pmap->pm_stats.resident_count -= count; } PMAP_INLINE pt_entry_t * vtopte(vm_offset_t va) { u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va)); return (PTmap + ((va >> PAGE_SHIFT) & mask)); } static __inline pd_entry_t * vtopde(vm_offset_t va) { u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); KASSERT(va >= 
VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va)); return (PDmap + ((va >> PDRSHIFT) & mask)); } static u_int64_t allocpages(vm_paddr_t *firstaddr, int n) { u_int64_t ret; ret = *firstaddr; bzero((void *)ret, n * PAGE_SIZE); *firstaddr += n * PAGE_SIZE; return (ret); } CTASSERT(powerof2(NDMPML4E)); /* number of kernel PDP slots */ #define NKPDPE(ptpgs) howmany(ptpgs, NPDEPG) static void nkpt_init(vm_paddr_t addr) { int pt_pages; #ifdef NKPT pt_pages = NKPT; #else pt_pages = howmany(addr, 1 << PDRSHIFT); pt_pages += NKPDPE(pt_pages); /* * Add some slop beyond the bare minimum required for bootstrapping * the kernel. * * This is quite important when allocating KVA for kernel modules. * The modules are required to be linked in the negative 2GB of * the address space. If we run out of KVA in this region then * pmap_growkernel() will need to allocate page table pages to map * the entire 512GB of KVA space which is an unnecessary tax on * physical memory. * * Secondly, device memory mapped as part of setting up the low- * level console(s) is taken from KVA, starting at virtual_avail. * This is because cninit() is called after pmap_bootstrap() but * before vm_init() and pmap_init(). 20MB for a frame buffer is * not uncommon. */ pt_pages += 32; /* 64MB additional slop. */ #endif nkpt = pt_pages; } /* * Returns the proper write/execute permission for a physical page that is * part of the initial boot allocations. * * If the page has kernel text, it is marked as read-only. If the page has * kernel read-only data, it is marked as read-only/not-executable. If the * page has only read-write data, it is marked as read-write/not-executable. * If the page is below/above the kernel range, it is marked as read-write. * * This function operates on 2M pages, since we map the kernel space that * way. * * Note that this doesn't currently provide any protection for modules. */ static inline pt_entry_t bootaddr_rwx(vm_paddr_t pa) { /* * Everything in the same 2M page as the start of the kernel * should be static. On the other hand, things in the same 2M * page as the end of the kernel could be read-write/executable, * as the kernel image is not guaranteed to end on a 2M boundary. */ if (pa < trunc_2mpage(btext - KERNBASE) || pa >= trunc_2mpage(_end - KERNBASE)) return (X86_PG_RW); /* * The linker should ensure that the read-only and read-write * portions don't share the same 2M page, so this shouldn't * impact read-only data. However, in any case, any page with * read-write data needs to be read-write. */ if (pa >= trunc_2mpage(brwsection - KERNBASE)) return (X86_PG_RW | pg_nx); /* * Mark any 2M page containing kernel text as read-only. Mark * other pages with read-only data as read-only and not executable. * (It is likely a small portion of the read-only data section will * be marked as read-only, but executable. This should be acceptable * since the read-only protection will keep the data from changing.) * Note that fixups to the .text section will still work until we * set CR0.WP. 
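 *
 * Summary of the bits OR'ed into the bootstrap 2M PDEs: pages
 * outside the kernel image get X86_PG_RW; read-write data gets
 * X86_PG_RW | pg_nx; text gets 0 (read-only and executable); the
 * remaining read-only data gets pg_nx.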
*/ if (pa < round_2mpage(etext - KERNBASE)) return (0); return (pg_nx); } static void create_pagetables(vm_paddr_t *firstaddr) { int i, j, ndm1g, nkpdpe, nkdmpde; pd_entry_t *pd_p; pdp_entry_t *pdp_p; pml4_entry_t *p4_p; uint64_t DMPDkernphys; /* Allocate page table pages for the direct map */ ndmpdp = howmany(ptoa(Maxmem), NBPDP); if (ndmpdp < 4) /* Minimum 4GB of dirmap */ ndmpdp = 4; ndmpdpphys = howmany(ndmpdp, NPDPEPG); if (ndmpdpphys > NDMPML4E) { /* * Each NDMPML4E allows 512 GB, so limit to that, * and then readjust ndmpdp and ndmpdpphys. */ printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512); Maxmem = atop(NDMPML4E * NBPML4); ndmpdpphys = NDMPML4E; ndmpdp = NDMPML4E * NPDEPG; } DMPDPphys = allocpages(firstaddr, ndmpdpphys); ndm1g = 0; if ((amd_feature & AMDID_PAGE1GB) != 0) { /* * Calculate the number of 1G pages that will fully fit in * Maxmem. */ ndm1g = ptoa(Maxmem) >> PDPSHIFT; /* * Allocate 2M pages for the kernel. These will be used in * place of the first one or more 1G pages from ndm1g. */ nkdmpde = howmany((vm_offset_t)(brwsection - KERNBASE), NBPDP); DMPDkernphys = allocpages(firstaddr, nkdmpde); } if (ndm1g < ndmpdp) DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g); dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; /* Allocate pages */ KPML4phys = allocpages(firstaddr, 1); KPDPphys = allocpages(firstaddr, NKPML4E); /* * Allocate the initial number of kernel page table pages required to * bootstrap. We defer this until after all memory-size dependent * allocations are done (e.g. direct map), so that we don't have to * build in too much slop in our estimate. * * Note that when NKPML4E > 1, we have an empty page underneath * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed) * pages. (pmap_enter requires a PD page to exist for each KPML4E.) */ nkpt_init(*firstaddr); nkpdpe = NKPDPE(nkpt); KPTphys = allocpages(firstaddr, nkpt); KPDphys = allocpages(firstaddr, nkpdpe); /* * Connect the zero-filled PT pages to their PD entries. This * implicitly maps the PT pages at their correct locations within * the PTmap. */ pd_p = (pd_entry_t *)KPDphys; for (i = 0; i < nkpt; i++) pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V; /* * Map from physical address zero to the end of loader preallocated * memory using 2MB pages. This replaces some of the PD entries * created above. */ for (i = 0; (i << PDRSHIFT) < KERNend; i++) /* Preset PG_M and PG_A because demotion expects it. */ pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | bootaddr_rwx(i << PDRSHIFT); /* * Because we map the physical blocks in 2M pages, adjust firstaddr * to record the physical blocks we've actually mapped into kernel * virtual address space. */ if (*firstaddr < round_2mpage(KERNend)) *firstaddr = round_2mpage(KERNend); /* And connect up the PD to the PDP (leaving room for L4 pages) */ pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE)); for (i = 0; i < nkpdpe; i++) pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V; /* * Now, set up the direct map region using 2MB and/or 1GB pages. If * the end of physical memory is not aligned to a 1GB page boundary, * then the residual physical memory is mapped with 2MB pages. Later, * if pmap_mapdev{_attr}() uses the direct map for non-write-back * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings * that are partially used. 
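 *
 * Illustrative sizing for a hypothetical machine with Maxmem just
 * over 16 GB: ndmpdp = 17 and, when 1GB pages are supported,
 * ndm1g = 16, so the first 16 GB are mapped by 1GB PDPEs while the
 * 17th GB is mapped by the 512 2MB PDEs allocated at DMPDphys.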
*/ pd_p = (pd_entry_t *)DMPDphys; for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) { pd_p[j] = (vm_paddr_t)i << PDRSHIFT; /* Preset PG_M and PG_A because demotion expects it. */ pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | pg_nx; } pdp_p = (pdp_entry_t *)DMPDPphys; for (i = 0; i < ndm1g; i++) { pdp_p[i] = (vm_paddr_t)i << PDPSHIFT; /* Preset PG_M and PG_A because demotion expects it. */ pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | pg_nx; } for (j = 0; i < ndmpdp; i++, j++) { pdp_p[i] = DMPDphys + ptoa(j); pdp_p[i] |= X86_PG_RW | X86_PG_V; } /* * Instead of using a 1G page for the memory containing the kernel, * use 2M pages with appropriate permissions. (If using 1G pages, * this will partially overwrite the PDPEs above.) */ if (ndm1g) { pd_p = (pd_entry_t *)DMPDkernphys; for (i = 0; i < (NPDEPG * nkdmpde); i++) pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | pg_nx | bootaddr_rwx(i << PDRSHIFT); for (i = 0; i < nkdmpde; i++) pdp_p[i] = (DMPDkernphys + ptoa(i)) | X86_PG_RW | X86_PG_V; } /* And recursively map PML4 to itself in order to get PTmap */ p4_p = (pml4_entry_t *)KPML4phys; p4_p[PML4PML4I] = KPML4phys; p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx; /* Connect the Direct Map slot(s) up to the PML4. */ for (i = 0; i < ndmpdpphys; i++) { p4_p[DMPML4I + i] = DMPDPphys + ptoa(i); p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V; } /* Connect the KVA slots up to the PML4 */ for (i = 0; i < NKPML4E; i++) { p4_p[KPML4BASE + i] = KPDPphys + ptoa(i); p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V; } } /* * Bootstrap the system enough to run with virtual memory. * * On amd64 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(vm_paddr_t *firstaddr) { vm_offset_t va; pt_entry_t *pte, *pcpu_pte; uint64_t cr4, pcpu_phys; u_long res; int i; KERNend = *firstaddr; res = atop(KERNend - (vm_paddr_t)kernphys); if (!pti) pg_g = X86_PG_G; /* * Create an initial set of page tables to run the kernel in. */ create_pagetables(firstaddr); pcpu_phys = allocpages(firstaddr, MAXCPU); /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated kernel page table pages so that vm_page structures * representing these pages will be created. The vm_page structures * are required for promotion of the corresponding kernel virtual * addresses to superpage mappings. */ vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); /* * Account for the virtual addresses mapped by create_pagetables(). */ virtual_avail = (vm_offset_t)KERNBASE + round_2mpage(KERNend); virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Enable PG_G global pages, then switch to the kernel page * table from the bootstrap page table. After the switch, it * is possible to enable SMEP and SMAP since PG_U bits are * correct now. */ cr4 = rcr4(); cr4 |= CR4_PGE; load_cr4(cr4); load_cr3(KPML4phys); if (cpu_stdext_feature & CPUID_STDEXT_SMEP) cr4 |= CR4_SMEP; if (cpu_stdext_feature & CPUID_STDEXT_SMAP) cr4 |= CR4_SMAP; load_cr4(cr4); /* * Initialize the kernel pmap (which is statically allocated). * Count bootstrap data as being resident in case any of this data is * later unmapped (using pmap_remove()) and freed. 
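 *
 * Here "res", computed at the top of pmap_bootstrap() as
 * atop(KERNend - kernphys), covers the kernel image and the data
 * preallocated by the loader; it becomes the kernel pmap's initial
 * resident_count below.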
*/ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys); kernel_pmap->pm_cr3 = KPML4phys; kernel_pmap->pm_ucr3 = PMAP_NO_CR3; CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); kernel_pmap->pm_stats.resident_count = res; kernel_pmap->pm_flags = pmap_flags; /* * Initialize the TLB invalidations generation number lock. */ mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF); /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * Crashdump maps. The first page is reused as CMAP1 for the * memory test. */ SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS) CADDR1 = crashdumpmap; SYSMAP(struct pcpu *, pcpu_pte, __pcpu, MAXCPU); virtual_avail = va; for (i = 0; i < MAXCPU; i++) { pcpu_pte[i] = (pcpu_phys + ptoa(i)) | X86_PG_V | X86_PG_RW | pg_g | pg_nx | X86_PG_M | X86_PG_A; } STAILQ_INIT(&cpuhead); wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); pcpu_init(&__pcpu[0], 0, sizeof(struct pcpu)); amd64_bsp_pcpu_init1(&__pcpu[0]); amd64_bsp_ist_init(&__pcpu[0]); __pcpu[0].pc_dynamic = temp_bsp_pcpu.pc_dynamic; __pcpu[0].pc_acpi_id = temp_bsp_pcpu.pc_acpi_id; /* * Initialize the PAT MSR. * pmap_init_pat() clears and sets CR4_PGE, which, as a * side-effect, invalidates stale PG_G TLB entries that might * have been created in our pre-boot environment. */ pmap_init_pat(); /* Initialize TLB Context Id. */ if (pmap_pcid_enabled) { for (i = 0; i < MAXCPU; i++) { kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN; kernel_pmap->pm_pcids[i].pm_gen = 1; } /* * PMAP_PCID_KERN + 1 is used for initialization of * proc0 pmap. The pmap' pcid state might be used by * EFIRT entry before first context switch, so it * needs to be valid. */ PCPU_SET(pcid_next, PMAP_PCID_KERN + 2); PCPU_SET(pcid_gen, 1); /* * pcpu area for APs is zeroed during AP startup. * pc_pcid_next and pc_pcid_gen are initialized by AP * during pcpu setup. */ load_cr4(rcr4() | CR4_PCIDE); } } /* * Setup the PAT MSR. */ void pmap_init_pat(void) { uint64_t pat_msr; u_long cr0, cr4; int i; /* Bail if this CPU doesn't implement PAT. */ if ((cpu_feature & CPUID_PAT) == 0) panic("no PAT??"); /* Set default PAT index table. */ for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = -1; pat_index[PAT_WRITE_BACK] = 0; pat_index[PAT_WRITE_THROUGH] = 1; pat_index[PAT_UNCACHEABLE] = 3; pat_index[PAT_WRITE_COMBINING] = 6; pat_index[PAT_WRITE_PROTECTED] = 5; pat_index[PAT_UNCACHED] = 2; /* * Initialize default PAT entries. * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. * Program 5 and 6 as WP and WC. * * Leave 4 and 7 as WB and UC. Note that a recursive page table * mapping for a 2M page uses a PAT value with the bit 3 set due * to its overload with PG_PS. */ pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | PAT_VALUE(2, PAT_UNCACHED) | PAT_VALUE(3, PAT_UNCACHEABLE) | PAT_VALUE(4, PAT_WRITE_BACK) | PAT_VALUE(5, PAT_WRITE_PROTECTED) | PAT_VALUE(6, PAT_WRITE_COMBINING) | PAT_VALUE(7, PAT_UNCACHEABLE); /* Disable PGE. */ cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); /* Disable caches (CD = 1, NW = 0). */ cr0 = rcr0(); load_cr0((cr0 & ~CR0_NW) | CR0_CD); /* Flushes caches and TLBs. */ wbinvd(); invltlb(); /* Update PAT and index table. */ wrmsr(MSR_PAT, pat_msr); /* Flush caches and TLBs again. */ wbinvd(); invltlb(); /* Restore caches and PGE. 
*/ load_cr0(cr0); load_cr4(cr4); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pat_mode = PAT_WRITE_BACK; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { struct pmap_preinit_mapping *ppim; vm_page_t m, mpte; vm_size_t s; int error, i, pv_npg, ret, skz63; /* L1TF, reserve page @0 unconditionally */ vm_page_blacklist_add(0, bootverbose); /* Detect bare-metal Skylake Server and Skylake-X. */ if (vm_guest == VM_GUEST_NO && cpu_vendor_id == CPU_VENDOR_INTEL && CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x55) { /* * Skylake-X errata SKZ63. Processor May Hang When * Executing Code In an HLE Transaction Region between * 40000000H and 403FFFFFH. * * Mark the pages in the range as preallocated. It * seems to be impossible to distinguish between * Skylake Server and Skylake X. */ skz63 = 1; TUNABLE_INT_FETCH("hw.skz63_enable", &skz63); if (skz63 != 0) { if (bootverbose) printf("SKZ63: skipping 4M RAM starting " "at physical 1G\n"); for (i = 0; i < atop(0x400000); i++) { ret = vm_page_blacklist_add(0x40000000 + ptoa(i), FALSE); if (!ret && bootverbose) printf("page at %#lx already used\n", 0x40000000 + ptoa(i)); } } } /* * Initialize the vm page array entries for the kernel pmap's * page table pages. */ PMAP_LOCK(kernel_pmap); for (i = 0; i < nkpt; i++) { mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_init: page table page is out of range")); mpte->pindex = pmap_pde_pindex(KERNBASE) + i; mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); mpte->wire_count = 1; /* * Collect the page table pages that were replaced by a 2MB * page in create_pagetables(). They are zero filled. */ if (i << PDRSHIFT < KERNend && pmap_insert_pt_page(kernel_pmap, mpte, false)) panic("pmap_init: pmap_insert_pt_page failed"); } PMAP_UNLOCK(kernel_pmap); vm_wire_add(nkpt); /* * If the kernel is running on a virtual machine, then it must assume * that MCA is enabled by the hypervisor. Moreover, the kernel must * be prepared for the hypervisor changing the vendor and family that * are reported by CPUID. Consequently, the workaround for AMD Family * 10h Erratum 383 is enabled if the processor's feature set does not * include at least one feature that is only supported by older Intel * or newer AMD processors. */ if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | AMDID2_FMA4)) == 0) workaround_erratum383 = 1; /* * Are large page mappings enabled? */ TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); if (pg_ps_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = NBPDR; } /* * Initialize the pv chunk list mutex. */ mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); /* * Initialize the pool of pv list locks. */ for (i = 0; i < NPV_LIST_LOCKS; i++) rw_init(&pv_list_locks[i], "pmap pv list"); /* * Calculate the size of the pv head table for superpages. */ pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR); /* * Allocate memory for the pv head table for superpages. 
*/ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); TAILQ_INIT(&pv_dummy.pv_list); pmap_initialized = 1; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) continue; /* Make the direct map consistent */ if (ppim->pa < dmaplimit && ppim->pa + ppim->sz <= dmaplimit) { (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa), ppim->sz, ppim->mode); } if (!bootverbose) continue; printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i, ppim->pa, ppim->va, ppim->sz, ppim->mode); } mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN); error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, (vmem_addr_t *)&qframe); if (error != 0) panic("qframe allocation failed"); lm_ents = 8; TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents); if (lm_ents > LMEPML4I - LMSPML4I + 1) lm_ents = LMEPML4I - LMSPML4I + 1; if (bootverbose) printf("pmap: large map %u PML4 slots (%lu Gb)\n", lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024)); if (lm_ents != 0) { large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS, (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK); if (large_vmem == NULL) { printf("pmap: cannot create large map\n"); lm_ents = 0; } for (i = 0; i < lm_ents; i++) { m = pmap_large_map_getptp_unlocked(); kernel_pmap->pm_pml4[LMSPML4I + i] = X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx | VM_PAGE_TO_PHYS(m); } } } static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, "2MB page mapping counters"); static u_long pmap_pde_demotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pde_demotions, 0, "2MB page demotions"); static u_long pmap_pde_mappings; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, &pmap_pde_mappings, 0, "2MB page mappings"); static u_long pmap_pde_p_failures; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_pde_p_failures, 0, "2MB page promotion failures"); static u_long pmap_pde_promotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, &pmap_pde_promotions, 0, "2MB page promotions"); static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0, "1GB page mapping counters"); static u_long pmap_pdpe_demotions; SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pdpe_demotions, 0, "1GB page demotions"); /*************************************************** * Low level helper routines..... ***************************************************/ static pt_entry_t pmap_swap_pat(pmap_t pmap, pt_entry_t entry) { int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT; switch (pmap->pm_type) { case PT_X86: case PT_RVI: /* Verify that both PAT bits are not set at the same time */ KASSERT((entry & x86_pat_bits) != x86_pat_bits, ("Invalid PAT bits in entry %#lx", entry)); /* Swap the PAT bits if one of them is set */ if ((entry & x86_pat_bits) != 0) entry ^= x86_pat_bits; break; case PT_EPT: /* * Nothing to do - the memory attributes are represented * the same way for regular pages and superpages. */ break; default: panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type); } return (entry); } boolean_t pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) { return (mode >= 0 && mode < PAT_INDEX_SIZE && pat_index[(int)mode] >= 0); } /* * Determine the appropriate bits to set in a PTE or PDE for a specified * caching mode. 
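 *
 * Illustrative example: PAT_WRITE_COMBINING is programmed at PAT
 * index 6 above, so for native x86 page tables pmap_cache_bits()
 * returns PG_NC_PCD together with the PAT bit (X86_PG_PTE_PAT for
 * a 4KB PTE, X86_PG_PDE_PAT for a 2MB PDE) and leaves PG_NC_PWT
 * clear.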
*/ int pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde) { int cache_bits, pat_flag, pat_idx; if (!pmap_is_valid_memattr(pmap, mode)) panic("Unknown caching mode %d\n", mode); switch (pmap->pm_type) { case PT_X86: case PT_RVI: /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; /* Map the caching mode to a PAT index. */ pat_idx = pat_index[mode]; /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ cache_bits = 0; if (pat_idx & 0x4) cache_bits |= pat_flag; if (pat_idx & 0x2) cache_bits |= PG_NC_PCD; if (pat_idx & 0x1) cache_bits |= PG_NC_PWT; break; case PT_EPT: cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode); break; default: panic("unsupported pmap type %d", pmap->pm_type); } return (cache_bits); } static int pmap_cache_mask(pmap_t pmap, boolean_t is_pde) { int mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE; break; case PT_EPT: mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7); break; default: panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type); } return (mask); } static int pmap_pat_index(pmap_t pmap, pt_entry_t pte, bool is_pde) { int pat_flag, pat_idx; pat_idx = 0; switch (pmap->pm_type) { case PT_X86: case PT_RVI: /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; if ((pte & pat_flag) != 0) pat_idx |= 0x4; if ((pte & PG_NC_PCD) != 0) pat_idx |= 0x2; if ((pte & PG_NC_PWT) != 0) pat_idx |= 0x1; break; case PT_EPT: if ((pte & EPT_PG_IGNORE_PAT) != 0) panic("EPT PTE %#lx has no PAT memory type", pte); pat_idx = (pte & EPT_PG_MEMORY_TYPE(0x7)) >> 3; break; } /* See pmap_init_pat(). */ if (pat_idx == 4) pat_idx = 0; if (pat_idx == 7) pat_idx = 3; return (pat_idx); } bool pmap_ps_enabled(pmap_t pmap) { return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); } static void pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde) { switch (pmap->pm_type) { case PT_X86: break; case PT_RVI: case PT_EPT: /* * XXX * This is a little bogus since the generation number is * supposed to be bumped up when a region of the address * space is invalidated in the page tables. * * In this case the old PDE entry is valid but yet we want * to make sure that any mappings using the old entry are * invalidated in the TLB. * * The reason this works as expected is because we rendezvous * "all" host cpus and force any vcpu context to exit as a * side-effect. */ atomic_add_acq_long(&pmap->pm_eptgen, 1); break; default: panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type); } pde_store(pde, newpde); } /* * After changing the page size for the specified virtual address in the page * table, flush the corresponding entries from the processor's TLB. Only the * calling processor's TLB is affected. * * The calling thread must be pinned to a processor. */ static void pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde) { pt_entry_t PG_G; if (pmap_type_guest(pmap)) return; KASSERT(pmap->pm_type == PT_X86, ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type)); PG_G = pmap_global_bit(pmap); if ((newpde & PG_PS) == 0) /* Demotion: flush a specific 2MB page mapping. */ invlpg(va); else if ((newpde & PG_G) == 0) /* * Promotion: flush every 4KB page mapping from the TLB * because there are too many to flush individually. */ invltlb(); else { /* * Promotion: flush every 4KB page mapping from the TLB, * including any global (PG_G) mappings. 
*/ invltlb_glob(); } } #ifdef SMP /* * For SMP, these functions have to use the IPI mechanism for coherence. * * N.B.: Before calling any of the following TLB invalidation functions, * the calling processor must ensure that all stores updating a non- * kernel page table are globally performed. Otherwise, another * processor could cache an old, pre-update entry without being * invalidated. This can happen one of two ways: (1) The pmap becomes * active on another processor after its pm_active field is checked by * one of the following functions but before a store updating the page * table is globally performed. (2) The pmap becomes active on another * processor before its pm_active field is checked but due to * speculative loads one of the following functions stills reads the * pmap as inactive on the other processor. * * The kernel page table is exempt because its pm_active field is * immutable. The kernel page table is always active on every * processor. */ /* * Interrupt the cpus that are executing in the guest context. * This will force the vcpu to exit and the cached EPT mappings * will be invalidated by the host before the next vmresume. */ static __inline void pmap_invalidate_ept(pmap_t pmap) { int ipinum; sched_pin(); KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), ("pmap_invalidate_ept: absurd pm_active")); /* * The TLB mappings associated with a vcpu context are not * flushed each time a different vcpu is chosen to execute. * * This is in contrast with a process's vtop mappings that * are flushed from the TLB on each context switch. * * Therefore we need to do more than just a TLB shootdown on * the active cpus in 'pmap->pm_active'. To do this we keep * track of the number of invalidations performed on this pmap. * * Each vcpu keeps a cache of this counter and compares it * just before a vmresume. If the counter is out-of-date an * invept will be done to flush stale mappings from the TLB. */ atomic_add_acq_long(&pmap->pm_eptgen, 1); /* * Force the vcpu to exit and trap back into the hypervisor. */ ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK; ipi_selected(pmap->pm_active, ipinum); sched_unpin(); } static cpuset_t pmap_invalidate_cpu_mask(pmap_t pmap) { return (pmap == kernel_pmap ? all_cpus : pmap->pm_active); } static inline void pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va, const bool invpcid_works1) { struct invpcid_descr d; uint64_t kcr3, ucr3; uint32_t pcid; u_int cpuid, i; cpuid = PCPU_GET(cpuid); if (pmap == PCPU_GET(curpmap)) { if (pmap->pm_ucr3 != PMAP_NO_CR3) { /* * Because pm_pcid is recalculated on a * context switch, we must disable switching. * Otherwise, we might use a stale value * below. */ critical_enter(); pcid = pmap->pm_pcids[cpuid].pm_pcid; if (invpcid_works1) { d.pcid = pcid | PMAP_PCID_USER_PT; d.pad = 0; d.addr = va; invpcid(&d, INVPCID_ADDR); } else { kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; pmap_pti_pcid_invlpg(ucr3, kcr3, va); } critical_exit(); } } else pmap->pm_pcids[cpuid].pm_gen = 0; CPU_FOREACH(i) { if (cpuid != i) pmap->pm_pcids[i].pm_gen = 0; } /* * The fence is between stores to pm_gen and the read of the * pm_active mask. We need to ensure that it is impossible * for us to miss the bit update in pm_active and * simultaneously observe a non-zero pm_gen in * pmap_activate_sw(), otherwise TLB update is missed. * Without the fence, IA32 allows such an outcome. Note that * pm_active is updated by a locked operation, which provides * the reciprocal fence. 
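 *
 * In other words, this CPU stores pm_gen = 0 and then reads
 * pm_active (via the masked shootdown in the caller), while a CPU
 * activating the pmap sets its pm_active bit and then consults
 * pm_gen; the seq_cst fence here together with the locked update
 * of pm_active ensures at least one side observes the other's
 * store, so a stale PCID cannot survive.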
*/ atomic_thread_fence_seq_cst(); } static void pmap_invalidate_page_pcid_invpcid(pmap_t pmap, vm_offset_t va) { pmap_invalidate_page_pcid(pmap, va, true); } static void pmap_invalidate_page_pcid_noinvpcid(pmap_t pmap, vm_offset_t va) { pmap_invalidate_page_pcid(pmap, va, false); } static void pmap_invalidate_page_nopcid(pmap_t pmap, vm_offset_t va) { } DEFINE_IFUNC(static, void, pmap_invalidate_page_mode, (pmap_t, vm_offset_t)) { if (pmap_pcid_enabled) return (invpcid_works ? pmap_invalidate_page_pcid_invpcid : pmap_invalidate_page_pcid_noinvpcid); return (pmap_invalidate_page_nopcid); } void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_page: invalid type %d", pmap->pm_type)); sched_pin(); if (pmap == kernel_pmap) { invlpg(va); } else { if (pmap == PCPU_GET(curpmap)) invlpg(va); pmap_invalidate_page_mode(pmap, va); } smp_masked_invlpg(pmap_invalidate_cpu_mask(pmap), va, pmap); sched_unpin(); } /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) static void pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, const bool invpcid_works1) { struct invpcid_descr d; uint64_t kcr3, ucr3; uint32_t pcid; u_int cpuid, i; cpuid = PCPU_GET(cpuid); if (pmap == PCPU_GET(curpmap)) { if (pmap->pm_ucr3 != PMAP_NO_CR3) { critical_enter(); pcid = pmap->pm_pcids[cpuid].pm_pcid; if (invpcid_works1) { d.pcid = pcid | PMAP_PCID_USER_PT; d.pad = 0; d.addr = sva; for (; d.addr < eva; d.addr += PAGE_SIZE) invpcid(&d, INVPCID_ADDR); } else { kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); } critical_exit(); } } else pmap->pm_pcids[cpuid].pm_gen = 0; CPU_FOREACH(i) { if (cpuid != i) pmap->pm_pcids[i].pm_gen = 0; } /* See the comment in pmap_invalidate_page_pcid(). */ atomic_thread_fence_seq_cst(); } static void pmap_invalidate_range_pcid_invpcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_range_pcid(pmap, sva, eva, true); } static void pmap_invalidate_range_pcid_noinvpcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_range_pcid(pmap, sva, eva, false); } static void pmap_invalidate_range_nopcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { } DEFINE_IFUNC(static, void, pmap_invalidate_range_mode, (pmap_t, vm_offset_t, vm_offset_t)) { if (pmap_pcid_enabled) return (invpcid_works ? 
pmap_invalidate_range_pcid_invpcid : pmap_invalidate_range_pcid_noinvpcid); return (pmap_invalidate_range_nopcid); } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (eva - sva >= PMAP_INVLPG_THRESHOLD) { pmap_invalidate_all(pmap); return; } if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_range: invalid type %d", pmap->pm_type)); sched_pin(); if (pmap == kernel_pmap) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } else { if (pmap == PCPU_GET(curpmap)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } pmap_invalidate_range_mode(pmap, sva, eva); } smp_masked_invlpg_range(pmap_invalidate_cpu_mask(pmap), sva, eva, pmap); sched_unpin(); } static inline void pmap_invalidate_all_pcid(pmap_t pmap, bool invpcid_works1) { struct invpcid_descr d; uint64_t kcr3, ucr3; uint32_t pcid; u_int cpuid, i; if (pmap == kernel_pmap) { if (invpcid_works1) { bzero(&d, sizeof(d)); invpcid(&d, INVPCID_CTXGLOB); } else { invltlb_glob(); } } else { cpuid = PCPU_GET(cpuid); if (pmap == PCPU_GET(curpmap)) { critical_enter(); pcid = pmap->pm_pcids[cpuid].pm_pcid; if (invpcid_works1) { d.pcid = pcid; d.pad = 0; d.addr = 0; invpcid(&d, INVPCID_CTX); if (pmap->pm_ucr3 != PMAP_NO_CR3) { d.pcid |= PMAP_PCID_USER_PT; invpcid(&d, INVPCID_CTX); } } else { kcr3 = pmap->pm_cr3 | pcid; ucr3 = pmap->pm_ucr3; if (ucr3 != PMAP_NO_CR3) { ucr3 |= pcid | PMAP_PCID_USER_PT; pmap_pti_pcid_invalidate(ucr3, kcr3); } else { load_cr3(kcr3); } } critical_exit(); } else pmap->pm_pcids[cpuid].pm_gen = 0; CPU_FOREACH(i) { if (cpuid != i) pmap->pm_pcids[i].pm_gen = 0; } } /* See the comment in pmap_invalidate_page_pcid(). */ atomic_thread_fence_seq_cst(); } static void pmap_invalidate_all_pcid_invpcid(pmap_t pmap) { pmap_invalidate_all_pcid(pmap, true); } static void pmap_invalidate_all_pcid_noinvpcid(pmap_t pmap) { pmap_invalidate_all_pcid(pmap, false); } static void pmap_invalidate_all_nopcid(pmap_t pmap) { if (pmap == kernel_pmap) invltlb_glob(); else if (pmap == PCPU_GET(curpmap)) invltlb(); } DEFINE_IFUNC(static, void, pmap_invalidate_all_mode, (pmap_t)) { if (pmap_pcid_enabled) return (invpcid_works ? pmap_invalidate_all_pcid_invpcid : pmap_invalidate_all_pcid_noinvpcid); return (pmap_invalidate_all_nopcid); } void pmap_invalidate_all(pmap_t pmap) { if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); sched_pin(); pmap_invalidate_all_mode(pmap); smp_masked_invltlb(pmap_invalidate_cpu_mask(pmap), pmap); sched_unpin(); } void pmap_invalidate_cache(void) { sched_pin(); wbinvd(); smp_cache_flush(); sched_unpin(); } struct pde_action { cpuset_t invalidate; /* processors that invalidate their TLB */ pmap_t pmap; vm_offset_t va; pd_entry_t *pde; pd_entry_t newpde; u_int store; /* processor that updates the PDE */ }; static void pmap_update_pde_action(void *arg) { struct pde_action *act = arg; if (act->store == PCPU_GET(cpuid)) pmap_update_pde_store(act->pmap, act->pde, act->newpde); } static void pmap_update_pde_teardown(void *arg) { struct pde_action *act = arg; if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) pmap_update_pde_invalidate(act->pmap, act->va, act->newpde); } /* * Change the page size for the specified virtual address in a way that * prevents any possibility of the TLB ever having two entries that map the * same virtual address using different page sizes. 
This is the recommended * workaround for Erratum 383 on AMD Family 10h processors. It prevents a * machine check exception for a TLB state that is improperly diagnosed as a * hardware error. */ static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { struct pde_action act; cpuset_t active, other_cpus; u_int cpuid; sched_pin(); cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (pmap == kernel_pmap || pmap_type_guest(pmap)) active = all_cpus; else { active = pmap->pm_active; } if (CPU_OVERLAP(&active, &other_cpus)) { act.store = cpuid; act.invalidate = active; act.va = va; act.pmap = pmap; act.pde = pde; act.newpde = newpde; CPU_SET(cpuid, &active); smp_rendezvous_cpus(active, smp_no_rendezvous_barrier, pmap_update_pde_action, pmap_update_pde_teardown, &act); } else { pmap_update_pde_store(pmap, pde, newpde); if (CPU_ISSET(cpuid, &active)) pmap_update_pde_invalidate(pmap, va, newpde); } sched_unpin(); } #else /* !SMP */ /* * Normal, non-SMP, invalidation functions. */ void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { struct invpcid_descr d; uint64_t kcr3, ucr3; uint32_t pcid; if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { invlpg(va); if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) { critical_enter(); pcid = pmap->pm_pcids[0].pm_pcid; if (invpcid_works) { d.pcid = pcid | PMAP_PCID_USER_PT; d.pad = 0; d.addr = va; invpcid(&d, INVPCID_ADDR); } else { kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; pmap_pti_pcid_invlpg(ucr3, kcr3, va); } critical_exit(); } } else if (pmap_pcid_enabled) pmap->pm_pcids[0].pm_gen = 0; } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct invpcid_descr d; vm_offset_t addr; uint64_t kcr3, ucr3; if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) { critical_enter(); if (invpcid_works) { d.pcid = pmap->pm_pcids[0].pm_pcid | PMAP_PCID_USER_PT; d.pad = 0; d.addr = sva; for (; d.addr < eva; d.addr += PAGE_SIZE) invpcid(&d, INVPCID_ADDR); } else { kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0]. pm_pcid | CR3_PCID_SAVE; ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[0]. 
pm_pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); } critical_exit(); } } else if (pmap_pcid_enabled) { pmap->pm_pcids[0].pm_gen = 0; } } void pmap_invalidate_all(pmap_t pmap) { struct invpcid_descr d; uint64_t kcr3, ucr3; if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_all: unknown type %d", pmap->pm_type)); if (pmap == kernel_pmap) { if (pmap_pcid_enabled && invpcid_works) { bzero(&d, sizeof(d)); invpcid(&d, INVPCID_CTXGLOB); } else { invltlb_glob(); } } else if (pmap == PCPU_GET(curpmap)) { if (pmap_pcid_enabled) { critical_enter(); if (invpcid_works) { d.pcid = pmap->pm_pcids[0].pm_pcid; d.pad = 0; d.addr = 0; invpcid(&d, INVPCID_CTX); if (pmap->pm_ucr3 != PMAP_NO_CR3) { d.pcid |= PMAP_PCID_USER_PT; invpcid(&d, INVPCID_CTX); } } else { kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].pm_pcid; if (pmap->pm_ucr3 != PMAP_NO_CR3) { ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[ 0].pm_pcid | PMAP_PCID_USER_PT; pmap_pti_pcid_invalidate(ucr3, kcr3); } else load_cr3(kcr3); } critical_exit(); } else { invltlb(); } } else if (pmap_pcid_enabled) { pmap->pm_pcids[0].pm_gen = 0; } } PMAP_INLINE void pmap_invalidate_cache(void) { wbinvd(); } static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { pmap_update_pde_store(pmap, pde, newpde); if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) pmap_update_pde_invalidate(pmap, va, newpde); else pmap->pm_pcids[0].pm_gen = 0; } #endif /* !SMP */ static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) { /* * When the PDE has PG_PROMOTED set, the 2MB page mapping was created * by a promotion that did not invalidate the 512 4KB page mappings * that might exist in the TLB. Consequently, at this point, the TLB * may hold both 4KB and 2MB page mappings for the address range [va, * va + NBPDR). Therefore, the entire range must be invalidated here. * In contrast, when PG_PROMOTED is clear, the TLB will not hold any * 4KB page mappings for the address range [va, va + NBPDR), and so a * single INVLPG suffices to invalidate the 2MB page mapping from the * TLB. */ if ((pde & PG_PROMOTED) != 0) pmap_invalidate_range(pmap, va, va + NBPDR - 1); else pmap_invalidate_page(pmap, va); } DEFINE_IFUNC(, void, pmap_invalidate_cache_range, (vm_offset_t sva, vm_offset_t eva)) { if ((cpu_feature & CPUID_SS) != 0) return (pmap_invalidate_cache_range_selfsnoop); if ((cpu_feature & CPUID_CLFSH) != 0) return (pmap_force_invalidate_cache_range); return (pmap_invalidate_cache_range_all); } #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) static void pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva) { KASSERT((sva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: sva not page-aligned")); KASSERT((eva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: eva not page-aligned")); } static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_cache_range_check_align(sva, eva); } void pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) { sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); /* * XXX: Some CPUs fault, hang, or trash the local APIC * registers if we use CLFLUSH on the local APIC range. The * local APIC is always uncached, so we don't need to flush * for that range anyway. */ if (pmap_kextract(sva) == lapic_paddr) return; if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) { /* * Do per-cache line flush. 
Use the sfence * instruction to insure that previous stores are * included in the write-back. The processor * propagates flush to other processors in the cache * coherence domain. */ sfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflushopt(sva); sfence(); } else { /* * Writes are ordered by CLFLUSH on Intel CPUs. */ if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflush(sva); if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); } } static void pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_cache_range_check_align(sva, eva); pmap_invalidate_cache(); } /* * Remove the specified set of pages from the data and instruction caches. * * In contrast to pmap_invalidate_cache_range(), this function does not * rely on the CPU's self-snoop feature, because it is intended for use * when moving pages into a different cache domain. */ void pmap_invalidate_cache_pages(vm_page_t *pages, int count) { vm_offset_t daddr, eva; int i; bool useclflushopt; useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt)) pmap_invalidate_cache(); else { if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (i = 0; i < count; i++) { daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); eva = daddr + PAGE_SIZE; for (; daddr < eva; daddr += cpu_clflush_line_size) { if (useclflushopt) clflushopt(daddr); else clflush(daddr); } } if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); } } void pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_cache_range_check_align(sva, eva); if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) == 0) { pmap_force_invalidate_cache_range(sva, eva); return; } /* See comment in pmap_force_invalidate_cache_range(). */ if (pmap_kextract(sva) == lapic_paddr) return; sfence(); for (; sva < eva; sva += cpu_clflush_line_size) clwb(sva); sfence(); } void pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr) { pt_entry_t *pte; vm_offset_t vaddr; int error, pte_bits; KASSERT((spa & PAGE_MASK) == 0, ("pmap_flush_cache_phys_range: spa not page-aligned")); KASSERT((epa & PAGE_MASK) == 0, ("pmap_flush_cache_phys_range: epa not page-aligned")); if (spa < dmaplimit) { pmap_flush_cache_range(PHYS_TO_DMAP(spa), PHYS_TO_DMAP(MIN( dmaplimit, epa))); if (dmaplimit >= epa) return; spa = dmaplimit; } pte_bits = pmap_cache_bits(kernel_pmap, mattr, 0) | X86_PG_RW | X86_PG_V; error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); pte = vtopte(vaddr); for (; spa < epa; spa += PAGE_SIZE) { sched_pin(); pte_store(pte, spa | pte_bits); invlpg(vaddr); /* XXXKIB sfences inside flush_cache_range are excessive */ pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE); sched_unpin(); } vmem_free(kernel_arena, vaddr, PAGE_SIZE); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. 
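 *
 * The lookup handles all three leaf sizes: a 1GB PDPE yields
 * pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK), a 2MB PDE yields
 * (*pde & PG_PS_FRAME) | (va & PDRMASK), and a 4KB PTE yields
 * (*pte & PG_FRAME) | (va & PAGE_MASK).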
*/ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte, PG_V; vm_paddr_t pa; pa = 0; PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); pdpe = pmap_pdpe(pmap, va); if (pdpe != NULL && (*pdpe & PG_V) != 0) { if ((*pdpe & PG_PS) != 0) pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK); else { pde = pmap_pdpe_to_pde(pdpe, va); if ((*pde & PG_V) != 0) { if ((*pde & PG_PS) != 0) { pa = (*pde & PG_PS_FRAME) | (va & PDRMASK); } else { pte = pmap_pde_to_pte(pde, va); pa = (*pte & PG_FRAME) | (va & PAGE_MASK); } } } } PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pd_entry_t pde, *pdep; pt_entry_t pte, PG_RW, PG_V; vm_page_t m; m = NULL; PG_RW = pmap_rw_bit(pmap); PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); pdep = pmap_pde(pmap, va); if (pdep != NULL && (pde = *pdep)) { if (pde & PG_PS) { if ((pde & PG_RW) != 0 || (prot & VM_PROT_WRITE) == 0) m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); } else { pte = *pmap_pde_to_pte(pdep, va); if ((pte & PG_V) != 0 && ((pte & PG_RW) != 0 || (prot & VM_PROT_WRITE) == 0)) m = PHYS_TO_VM_PAGE(pte & PG_FRAME); } if (m != NULL && !vm_page_wire_mapped(m)) m = NULL; } PMAP_UNLOCK(pmap); return (m); } vm_paddr_t pmap_kextract(vm_offset_t va) { pd_entry_t pde; vm_paddr_t pa; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { pa = DMAP_TO_PHYS(va); } else if (PMAP_ADDRESS_IN_LARGEMAP(va)) { pa = pmap_large_map_kextract(va); } else { pde = *vtopde(va); if (pde & PG_PS) { pa = (pde & PG_PS_FRAME) | (va & PDRMASK); } else { /* * Beware of a concurrent promotion that changes the * PDE at this point! For example, vtopte() must not * be used to access the PTE because it would use the * new PDE. It is, however, safe to use the old PDE * because the page table page is preserved by the * promotion. */ pa = *pmap_pde_to_pte(&pde, va); pa = (pa & PG_FRAME) | (va & PAGE_MASK); } } return (pa); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g); } static __inline void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) { pt_entry_t *pte; int cache_bits; pte = vtopte(va); cache_bits = pmap_cache_bits(kernel_pmap, mode, 0); pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | cache_bits); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. 
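 * On amd64 the direct map covers all of physical memory, so the
 * implementation below simply returns the direct-map address of 'start'
 * and leaves '*virt' untouched.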
*/ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return PHYS_TO_DMAP(start); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *endpte, oldpte, pa, *pte; vm_page_t m; int cache_bits; oldpte = 0; pte = vtopte(sva); endpte = pte + count; while (pte < endpte) { m = *ma++; cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); pa = VM_PAGE_TO_PHYS(m) | cache_bits; if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) { oldpte |= *pte; pte_store(pte, pa | pg_g | pg_nx | X86_PG_RW | X86_PG_V); } pte++; } if (__predict_false((oldpte & X86_PG_V) != 0)) pmap_invalidate_range(kernel_pmap, sva, sva + count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va)); pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. * * If "promoted" is false, then the page table page "mpte" must be zero filled. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0; return (vm_radix_insert(&pmap->pm_root, mpte)); } /* * Removes the page table page mapping the specified virtual address from the * specified pmap's collection of idle page table pages, and returns it. * Otherwise, returns NULL if there is no page table page corresponding to the * specified virtual address. */ static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va))); } /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. 
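 * Releases cascade upward: _pmap_unwire_ptp() clears the parent table
 * entry and then drops the wire count of the page that mapped it, so
 * freeing a PT page may unwire its PD page, and freeing a PD page may
 * unwire its PDP page.  The freed pages are queued on "free" and are
 * only returned to the system after the TLB shootdown completes.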
*/ static inline boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_ptp(pmap, va, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * unmap the page table page */ if (m->pindex >= (NUPDE + NUPDPE)) { /* PDP page */ pml4_entry_t *pml4; pml4 = pmap_pml4e(pmap, va); *pml4 = 0; if (pmap->pm_pml4u != NULL && va <= VM_MAXUSER_ADDRESS) { pml4 = &pmap->pm_pml4u[pmap_pml4e_index(va)]; *pml4 = 0; } } else if (m->pindex >= NUPDE) { /* PD page */ pdp_entry_t *pdp; pdp = pmap_pdpe(pmap, va); *pdp = 0; } else { /* PTE page */ pd_entry_t *pd; pd = pmap_pde(pmap, va); *pd = 0; } pmap_resident_count_dec(pmap, 1); if (m->pindex < NUPDE) { /* We just released a PT, unhold the matching PD */ vm_page_t pdpg; pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdpg, free); } if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { /* We just released a PD, unhold the matching PDP */ vm_page_t pdppg; pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdppg, free); } /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, struct spglist *free) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); return (pmap_unwire_ptp(pmap, va, mpte, free)); } void pmap_pinit0(pmap_t pmap) { struct proc *p; struct thread *td; int i; PMAP_LOCK_INIT(pmap); pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); pmap->pm_pml4u = NULL; pmap->pm_cr3 = KPML4phys; /* hack to keep pmap_pti_pcid_invalidate() alive */ pmap->pm_ucr3 = PMAP_NO_CR3; pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap->pm_flags = pmap_flags; CPU_FOREACH(i) { pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN + 1; pmap->pm_pcids[i].pm_gen = 1; } pmap_activate_boot(pmap); td = curthread; if (pti) { p = td->td_proc; PROC_LOCK(p); p->p_md.md_flags |= P_MD_KPTI; PROC_UNLOCK(p); } pmap_thread_init_invl_gen(td); if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { pmap_pkru_ranges_zone = uma_zcreate("pkru ranges", sizeof(struct pmap_pkru_range), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } } void pmap_pinit_pml4(vm_page_t pml4pg) { pml4_entry_t *pm_pml4; int i; pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); /* Wire in kernel global address entries. 
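 * Every pmap initialized here shares the kernel's structures: the KVA
 * and direct-map PML4 entries point at the global kernel PDP pages, a
 * self-referential slot is installed so that the page tables can be
 * reached through the recursive mapping, and the configured large-map
 * slots are copied from kernel_pmap.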
*/ for (i = 0; i < NKPML4E; i++) { pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW | X86_PG_V; } for (i = 0; i < ndmpdpphys; i++) { pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | X86_PG_V; } /* install self-referential address mapping entry(s) */ pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; /* install large map entries if configured */ for (i = 0; i < lm_ents; i++) pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pml4[LMSPML4I + i]; } static void pmap_pinit_pml4_pti(vm_page_t pml4pg) { pml4_entry_t *pm_pml4; int i; pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); for (i = 0; i < NPML4EPG; i++) pm_pml4[i] = pti_pml4[i]; } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) { vm_page_t pml4pg, pml4pgu; vm_paddr_t pml4phys; int i; /* * allocate the page directory page */ pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK); pml4phys = VM_PAGE_TO_PHYS(pml4pg); pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys); CPU_FOREACH(i) { pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; pmap->pm_pcids[i].pm_gen = 0; } pmap->pm_cr3 = PMAP_NO_CR3; /* initialize to an invalid value */ pmap->pm_ucr3 = PMAP_NO_CR3; pmap->pm_pml4u = NULL; pmap->pm_type = pm_type; if ((pml4pg->flags & PG_ZERO) == 0) pagezero(pmap->pm_pml4); /* * Do not install the host kernel mappings in the nested page * tables. These mappings are meaningless in the guest physical * address space. * Install minimal kernel mappings in PTI case. */ if (pm_type == PT_X86) { pmap->pm_cr3 = pml4phys; pmap_pinit_pml4(pml4pg); if ((curproc->p_md.md_flags & P_MD_KPTI) != 0) { pml4pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_WAITOK); pmap->pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP( VM_PAGE_TO_PHYS(pml4pgu)); pmap_pinit_pml4_pti(pml4pgu); pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu); } if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { rangeset_init(&pmap->pm_pkru, pkru_dup_range, pkru_free_range, pmap, M_NOWAIT); } } pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap->pm_flags = flags; pmap->pm_eptgen = 0; return (1); } int pmap_pinit(pmap_t pmap) { return (pmap_pinit_type(pmap, PT_X86, pmap_flags)); } /* * This routine is called if the desired page table page does not exist. * * If page table page allocation fails, this routine may sleep before * returning NULL. It sleeps only if a lock pointer was given. * * Note: If a page allocation fails at page table level two or three, * one or two pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. */ static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) { vm_page_t m, pdppg, pdpg; pt_entry_t PG_A, PG_M, PG_RW, PG_V; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if (lockp != NULL) { RELEASE_PV_LIST_LOCK(lockp); PMAP_UNLOCK(pmap); PMAP_ASSERT_NOT_IN_DI(); vm_wait(NULL); PMAP_LOCK(pmap); } /* * Indicate the need to retry. 
While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. */ if (ptepindex >= (NUPDE + NUPDPE)) { pml4_entry_t *pml4, *pml4u; vm_pindex_t pml4index; /* Wire up a new PDPE page */ pml4index = ptepindex - (NUPDE + NUPDPE); pml4 = &pmap->pm_pml4[pml4index]; *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; if (pmap->pm_pml4u != NULL && pml4index < NUPML4E) { /* * PTI: Make all user-space mappings in the * kernel-mode page table no-execute so that * we detect any programming errors that leave * the kernel-mode page table active on return * to user space. */ if (pmap->pm_ucr3 != PMAP_NO_CR3) *pml4 |= pg_nx; pml4u = &pmap->pm_pml4u[pml4index]; *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } } else if (ptepindex >= NUPDE) { vm_pindex_t pml4index; vm_pindex_t pdpindex; pml4_entry_t *pml4; pdp_entry_t *pdp; /* Wire up a new PDE page */ pdpindex = ptepindex - NUPDE; pml4index = pdpindex >> NPML4EPGSHIFT; pml4 = &pmap->pm_pml4[pml4index]; if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pdp, recurse */ if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { /* Add reference to pdp page */ pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); pdppg->wire_count++; } pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); /* Now find the pdp page */ pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } else { vm_pindex_t pml4index; vm_pindex_t pdpindex; pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pd; /* Wire up a new PTE page */ pdpindex = ptepindex >> NPDPEPGSHIFT; pml4index = pdpindex >> NPML4EPGSHIFT; /* First, find the pdp and check that its valid. */ pml4 = &pmap->pm_pml4[pml4index]; if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (_pmap_allocpte(pmap, NUPDE + pdpindex, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; } else { pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; if ((*pdp & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (_pmap_allocpte(pmap, NUPDE + pdpindex, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { /* Add reference to the pd page */ pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); pdpg->wire_count++; } } pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); /* Now we know where the page directory page is */ pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } pmap_resident_count_inc(pmap, 1); return (m); } static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t pdpindex, ptepindex; pdp_entry_t *pdpe, PG_V; vm_page_t pdpg; PG_V = pmap_valid_bit(pmap); retry: pdpe = pmap_pdpe(pmap, va); if (pdpe != NULL && (*pdpe & PG_V) != 0) { /* Add a reference to the pd page. */ pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); pdpg->wire_count++; } else { /* Allocate a pd page. 
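 * Page table pages are identified by pindex range: values below NUPDE
 * name PT pages, values in [NUPDE, NUPDE + NUPDPE) name PD pages, and
 * values above that name PDP pages.  Adding NUPDE to pdpindex below
 * therefore asks _pmap_allocpte() for the page directory page covering
 * this address.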
*/ ptepindex = pmap_pde_pindex(va); pdpindex = ptepindex >> NPDPEPGSHIFT; pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); if (pdpg == NULL && lockp != NULL) goto retry; } return (pdpg); } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t ptepindex; pd_entry_t *pd, PG_V; vm_page_t m; PG_V = pmap_valid_bit(pmap); /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); retry: /* * Get the page directory entry */ pd = pmap_pde(pmap, va); /* * This supports switching from a 2MB page to a * normal 4K page. */ if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) { /* * Invalidation of the 2MB page mapping may have caused * the deallocation of the underlying PD page. */ pd = NULL; } } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (pd != NULL && (*pd & PG_V) != 0) { m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); m->wire_count++; } else { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ m = _pmap_allocpte(pmap, ptepindex, lockp); if (m == NULL && lockp != NULL) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m; int i; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); KASSERT(CPU_EMPTY(&pmap->pm_active), ("releasing active pmap %p", pmap)); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4)); for (i = 0; i < NKPML4E; i++) /* KVA */ pmap->pm_pml4[KPML4BASE + i] = 0; for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ pmap->pm_pml4[DMPML4I + i] = 0; pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */ for (i = 0; i < lm_ents; i++) /* Large Map */ pmap->pm_pml4[LMSPML4I + i] = 0; vm_page_unwire_noq(m); vm_page_free_zero(m); if (pmap->pm_pml4u != NULL) { m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4u)); vm_page_unwire_noq(m); vm_page_free(m); } if (pmap->pm_type == PT_X86 && (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) rangeset_fini(&pmap->pm_pkru); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "LU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "LU", "Amount of KVM free"); /* * Allocate physical memory for the vm_page array and map it into KVA, * attempting to back the vm_pages with domain-local memory. 
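 * The array is mapped with 2MB pages; each 2MB chunk is backed by memory
 * allocated from the NUMA domain whose vm_page structures it will hold,
 * so a page's metadata is resident in the same domain as the page
 * itself.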
*/ void pmap_page_array_startup(long pages) { pdp_entry_t *pdpe; pd_entry_t *pde, newpdir; vm_offset_t va, start, end; vm_paddr_t pa; long pfn; int domain, i; vm_page_array_size = pages; start = va = VM_MIN_KERNEL_ADDRESS; end = va + pages * sizeof(struct vm_page); while (va < end) { pfn = first_page + (va - start) / sizeof(struct vm_page); domain = _vm_phys_domain(ptoa(pfn)); pdpe = pmap_pdpe(kernel_pmap, va); if ((*pdpe & X86_PG_V) == 0) { pa = vm_phys_early_alloc(domain, PAGE_SIZE); dump_add_page(pa); pagezero((void *)PHYS_TO_DMAP(pa)); *pdpe = (pdp_entry_t)(pa | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M); } pde = pmap_pdpe_to_pde(pdpe, va); if ((*pde & X86_PG_V) != 0) panic("Unexpected pde"); pa = vm_phys_early_alloc(domain, NBPDR); for (i = 0; i < NPDEPG; i++) dump_add_page(pa + i * PAGE_SIZE); newpdir = (pd_entry_t)(pa | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS | pg_g | pg_nx); pde_store(pde, newpdir); va += NBPDR; } vm_page_array = (vm_page_t)start; } /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t paddr; vm_page_t nkpg; pd_entry_t *pde, newpdir; pdp_entry_t *pdpe; mtx_assert(&kernel_map->system_mtx, MA_OWNED); /* * Return if "addr" is within the range of kernel page table pages * that were preallocated during pmap bootstrap. Moreover, leave * "kernel_vm_end" and the kernel page table as they were. * * The correctness of this action is based on the following * argument: vm_map_insert() allocates contiguous ranges of the * kernel virtual address space. It calls this function if a range * ends after "kernel_vm_end". If the kernel is mapped between * "kernel_vm_end" and "addr", then the range cannot begin at * "kernel_vm_end". In fact, its beginning address cannot be less * than the kernel. Thus, there is no immediate need to allocate * any new kernel page table pages between "kernel_vm_end" and * "KERNBASE". */ if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR) return; addr = roundup2(addr, NBPDR); if (addr - 1 >= vm_map_max(kernel_map)) addr = vm_map_max(kernel_map); while (kernel_vm_end < addr) { pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end); if ((*pdpe & X86_PG_V) == 0) { /* We need a new PDP entry */ nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M); continue; /* try again */ } pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end); if ((*pde & X86_PG_V) != 0) { kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } continue; } nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end), VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; pde_store(pde, newpdir); kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } } } /*************************************************** * page management routines. 
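 * (A pv chunk below is a single page holding _NPCPV (168) pv entries
 * plus a three-word free bitmap; 168 = 64 + 64 + 40, which is why
 * PC_FREE2 has only its low 40 bits set.)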
***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0 0xfffffffffffffffful #define PC_FREE1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs, pv_entry_count; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif static void reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di) { if (pmap == NULL) return; pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); if (start_di) pmap_delayed_invl_finish(); } /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. * * Returns NULL if PV entries were reclaimed from the specified pmap. * * We do not, however, unmap 2mpages because subsequent accesses will * allocate per-page pv entries until repromotion occurs, thereby * exacerbating the shortage of free pv entries. */ static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) { struct pv_chunk *pc, *pc_marker, *pc_marker_end; struct pv_chunk_header pc_marker_b, pc_marker_end_b; struct md_page *pvh; pd_entry_t *pde; pmap_t next_pmap, pmap; pt_entry_t *pte, tpte; pt_entry_t PG_G, PG_A, PG_M, PG_RW; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint64_t inuse; int bit, field, freed; bool start_di; static int active_reclaims = 0; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); pmap = NULL; m_pc = NULL; PG_G = PG_A = PG_M = PG_RW = 0; SLIST_INIT(&free); bzero(&pc_marker_b, sizeof(pc_marker_b)); bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); pc_marker = (struct pv_chunk *)&pc_marker_b; pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; /* * A delayed invalidation block should already be active if * pmap_advise() or pmap_remove() called this function by way * of pmap_demote_pde_locked(). 
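 * start_di therefore records whether this call must open its own
 * delayed-invalidation block for each pmap that is scanned, and close it
 * again when that pmap is left.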
*/ start_di = pmap_not_in_di(); mtx_lock(&pv_chunks_mutex); active_reclaims++; TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru); TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru); while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && SLIST_EMPTY(&free)) { next_pmap = pc->pc_pmap; if (next_pmap == NULL) { /* * The next chunk is a marker. However, it is * not our marker, so active_reclaims must be * > 1. Consequently, the next_chunk code * will not rotate the pv_chunks list. */ goto next_chunk; } mtx_unlock(&pv_chunks_mutex); /* * A pv_chunk can only be removed from the pc_lru list * when both pc_chunks_mutex is owned and the * corresponding pmap is locked. */ if (pmap != next_pmap) { reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di); pmap = next_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) { RELEASE_PV_LIST_LOCK(lockp); PMAP_LOCK(pmap); if (start_di) pmap_delayed_invl_start(); mtx_lock(&pv_chunks_mutex); continue; } else if (pmap != locked_pmap) { if (PMAP_TRYLOCK(pmap)) { if (start_di) pmap_delayed_invl_start(); mtx_lock(&pv_chunks_mutex); continue; } else { pmap = NULL; /* pmap is not locked */ mtx_lock(&pv_chunks_mutex); pc = TAILQ_NEXT(pc_marker, pc_lru); if (pc == NULL || pc->pc_pmap != next_pmap) continue; goto next_chunk; } } else if (start_di) pmap_delayed_invl_start(); PG_G = pmap_global_bit(pmap); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = bsfq(inuse); pv = &pc->pc_pventry[field * 64 + bit]; va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_PS) != 0) continue; pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_W) != 0) continue; tpte = pte_load_clear(pte); if ((tpte & PG_G) != 0) pmap_invalidate_page(pmap, va); m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((tpte & PG_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pmap_delayed_invl_page(m); pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, *pde, &free); freed++; } } if (freed == 0) { mtx_lock(&pv_chunks_mutex); goto next_chunk; } /* Every freed mapping is for a 4 KB page. */ pmap_resident_count_dec(pmap, freed); PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && pc->pc_map[2] == PC_FREE2) { PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m_pc->phys_addr); mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); break; } TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); mtx_lock(&pv_chunks_mutex); /* One freed pv entry in locked_pmap is sufficient. 
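 * Freeing even a single entry lets the caller's pending allocation
 * succeed on retry, so the scan stops once something has been reclaimed
 * from locked_pmap.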
*/ if (pmap == locked_pmap) break; next_chunk: TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru); if (active_reclaims == 1 && pmap != NULL) { /* * Rotate the pv chunks list so that we do not * scan the same pv chunks that could not be * freed (because they contained a wired * and/or superpage mapping) on every * invocation of reclaim_pv_chunk(). */ while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) { MPASS(pc->pc_pmap != NULL); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); } } } TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru); active_reclaims--; mtx_unlock(&pv_chunks_mutex); reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di); if (m_pc == NULL && !SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->wire_count = 1; } vm_page_free_pages_toq(&free, true); return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_frees, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, 1)); PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 64; bit = idx % 64; pc->pc_map[field] |= 1ul << bit; if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || pc->pc_map[2] != PC_FREE2) { /* 98% of the time, pc is already at the head of the list. */ if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_unwire_noq(m); vm_page_free(m); } /* * Returns a new PV entry, allocating a new PV chunk from the system when * needed. If this PV chunk allocation fails and a PV list lock pointer was * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is * returned. * * The given PV list lock may be released. 
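 * Free entries are located by scanning the chunk's three 64-bit bitmaps
 * with bsfq; a set bit 'bit' in pc_map[field] names entry
 * field * 64 + bit, for at most _NPCPV (168) entries per chunk.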
*/ static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp) { int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = bsfq(pc->pc_map[field]); break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 64 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); return (pv); } } /* No free items, allocate another chunk */ m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { if (lockp == NULL) { PV_STAT(pc_chunk_tryfail++); return (NULL); } m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; mtx_lock(&pv_chunks_mutex); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); return (pv); } /* * Returns the number of one bits within the given PV chunk map. * * The erratas for Intel processors state that "POPCNT Instruction May * Take Longer to Execute Than Expected". It is believed that the * issue is the spurious dependency on the destination register. * Provide a hint to the register rename logic that the destination * value is overwritten, by clearing it, as suggested in the * optimization manual. It should be cheap for unaffected processors * as well. * * Reference numbers for erratas are * 4th Gen Core: HSD146 * 5th Gen Core: BDM85 * 6th Gen Core: SKL029 */ static int popcnt_pc_map_pq(uint64_t *map) { u_long result, tmp; __asm __volatile("xorl %k0,%k0;popcntq %2,%0;" "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;" "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0" : "=&r" (result), "=&r" (tmp) : "m" (map[0]), "m" (map[1]), "m" (map[2])); return (result); } /* * Ensure that the number of spare PV entries in the specified pmap meets or * exceeds the given count, "needed". * * The given PV list lock may be released. */ static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) { struct pch new_tail; struct pv_chunk *pc; vm_page_t m; int avail, free; bool reclaimed; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); /* * Newly allocated PV chunks must be stored in a private list until * the required number of PV chunks have been allocated. Otherwise, * reclaim_pv_chunk() could recycle one of these chunks. In * contrast, these chunks must be added to the pmap upon allocation. 
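 * The number of available entries per chunk is counted with POPCNT (or
 * bit_count() when the instruction is absent), and each chunk allocated
 * here contributes _NPCPV spare entries toward "needed".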
*/ TAILQ_INIT(&new_tail); retry: avail = 0; TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { #ifndef __POPCNT__ if ((cpu_feature2 & CPUID2_POPCNT) == 0) bit_count((bitstr_t *)pc->pc_map, 0, sizeof(pc->pc_map) * NBBY, &free); else #endif free = popcnt_pc_map_pq(pc->pc_map); if (free == 0) break; avail += free; if (avail >= needed) break; } for (reclaimed = false; avail < needed; avail += _NPCPV) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; reclaimed = true; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0; pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); /* * The reclaim might have freed a chunk from the current pmap. * If that chunk contained available entries, we need to * re-count the number of available entries. */ if (reclaimed) goto retry; } if (!TAILQ_EMPTY(&new_tail)) { mtx_lock(&pv_chunks_mutex); TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); mtx_unlock(&pv_chunks_mutex); } } /* * First find and then remove the pv entry for the specified pmap and virtual * address from the specified pv list. Returns the pv entry if found and NULL * otherwise. This operation can be performed on pv lists for either 4KB or * 2MB page mappings. */ static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; break; } } return (pv); } /* * After demotion from a 2MB page mapping to 512 4KB page mappings, * destroy the pv entry for the 2MB page mapping and reinstantiate the pv * entries for each of the 4KB page mappings. */ static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; struct pv_chunk *pc; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; int bit, field; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_demote_pde: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the 2mpage's pv entry for this mapping to the first * page's pv list. Once this transfer begins, the pv list lock * must not be released until the last pv entry is reinstantiated. */ pvh = pa_to_pvh(pa); va = trunc_2mpage(va); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; /* Instantiate the remaining NPTEPG - 1 pv entries. 
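 * A demotion consumes NPTEPG - 1 (511) pv entries, which is why
 * pmap_demote_pde_locked() reserves them before the PDE is changed; the
 * loop below pulls them straight out of the pmap's chunk free bitmaps.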
*/ PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1)); va_last = va + NBPDR - PAGE_SIZE; for (;;) { pc = TAILQ_FIRST(&pmap->pm_pvchunk); KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare")); for (field = 0; field < _NPCM; field++) { while (pc->pc_map[field]) { bit = bsfq(pc->pc_map[field]); pc->pc_map[field] &= ~(1ul << bit); pv = &pc->pc_pventry[field * 64 + bit]; va += PAGE_SIZE; pv->pv_va = va; m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_pde: page %p is not managed", m)); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (va == va_last) goto out; } } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } out: if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1)); } #if VM_NRESERVLEVEL > 0 /* * After promotion from 512 4KB page mappings to a single 2MB page mapping, * replace the many pv entries for the 4KB page mappings by a single pv entry * for the 2MB page mapping. */ static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; KASSERT((pa & PDRMASK) == 0, ("pmap_pv_promote_pde: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the first page's pv entry for this mapping to the 2mpage's * pv list. Aside from avoiding the cost of a call to get_pv_entry(), * a transfer avoids the possibility that get_pv_entry() calls * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the * mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = trunc_2mpage(va); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } #endif /* VM_NRESERVLEVEL > 0 */ /* * First find and then destroy the pv entry for the specified pmap and virtual * address. This operation can be performed on pv lists for either 4KB or 2MB * page mappings. */ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } /* * Conditionally create the PV entry for a 4KB page mapping if the required * memory can be allocated without resorting to reclamation. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; return (TRUE); } else return (FALSE); } /* * Create the PV entry for a 2MB page mapping. Always returns true unless the * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns * false if the PV entry cannot be allocated without resorting to reclamation. 
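 * PMAP_ENTER_NORECLAIM is honored by passing a NULL lock pointer to
 * get_pv_entry(), which disables reclamation of pv chunks from other
 * pmaps.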
*/ static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_paddr_t pa; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? NULL : lockp)) == NULL) return (false); pv->pv_va = va; pa = pde & PG_PS_FRAME; CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; return (true); } /* * Fills a page table page with mappings to consecutive physical pages. */ static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) { pt_entry_t *pte; for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { *pte = newpte; newpte += PAGE_SIZE; } } /* * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page * mapping is invalidated. */ static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { struct rwlock *lock; boolean_t rv; lock = NULL; rv = pmap_demote_pde_locked(pmap, pde, va, &lock); if (lock != NULL) rw_wunlock(lock); return (rv); } static void pmap_demote_pde_check(pt_entry_t *firstpte __unused, pt_entry_t newpte __unused) { #ifdef INVARIANTS #ifdef DIAGNOSTIC pt_entry_t *xpte, *ypte; for (xpte = firstpte; xpte < firstpte + NPTEPG; xpte++, newpte += PAGE_SIZE) { if ((*xpte & PG_FRAME) != (newpte & PG_FRAME)) { printf("pmap_demote_pde: xpte %zd and newpte map " "different pages: found %#lx, expected %#lx\n", xpte - firstpte, *xpte, newpte); printf("page table dump\n"); for (ypte = firstpte; ypte < firstpte + NPTEPG; ypte++) printf("%zd %#lx\n", ypte - firstpte, *ypte); panic("firstpte"); } } #else KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), ("pmap_demote_pde: firstpte and newpte map different physical" " addresses")); #endif #endif } static void pmap_demote_pde_abort(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t oldpde, struct rwlock **lockp) { struct spglist free; vm_offset_t sva; SLIST_INIT(&free); sva = trunc_2mpage(va); pmap_remove_pde(pmap, pde, sva, &free, lockp); if ((oldpde & pmap_global_bit(pmap)) == 0) pmap_invalidate_pde_page(pmap, sva, oldpde); vm_page_free_pages_toq(&free, true); CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx in pmap %p", va, pmap); } static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp) { pd_entry_t newpde, oldpde; pt_entry_t *firstpte, newpte; pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V; vm_paddr_t mptepa; vm_page_t mpte; int PG_PTE_CACHE; bool in_kernel; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_PTE_CACHE = pmap_cache_mask(pmap, 0); PG_PKU_MASK = pmap_pku_mask_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); in_kernel = va >= VM_MAXUSER_ADDRESS; oldpde = *pde; KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); /* * Invalidate the 2MB page mapping and return "failure" if the * mapping was never accessed. 
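 * (Demoting a mapping that was never accessed would allocate and fill a
 * page table page to little benefit, so the mapping is removed instead.)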
*/ if ((oldpde & PG_A) == 0) { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_pde: a wired mapping is missing PG_A")); pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); return (FALSE); } mpte = pmap_remove_pt_page(pmap, va); if (mpte == NULL) { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_pde: page table page for a wired mapping" " is missing")); /* * If the page table page is missing and the mapping * is for a kernel address, the mapping must belong to * the direct map. Page table pages are preallocated * for every other part of the kernel address space, * so the direct map region is the only part of the * kernel address space that must be handled here. */ KASSERT(!in_kernel || (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS), ("pmap_demote_pde: No saved mpte for va %#lx", va)); /* * If the 2MB page mapping belongs to the direct map * region of the kernel's address space, then the page * allocation request specifies the highest possible * priority (VM_ALLOC_INTERRUPT). Otherwise, the * priority is normal. */ mpte = vm_page_alloc(NULL, pmap_pde_pindex(va), (in_kernel ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); /* * If the allocation of the new page table page fails, * invalidate the 2MB page mapping and return "failure". */ if (mpte == NULL) { pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); return (FALSE); } if (!in_kernel) { mpte->wire_count = NPTEPG; pmap_resident_count_inc(pmap, 1); } } mptepa = VM_PAGE_TO_PHYS(mpte); firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pde: oldpde is missing PG_M")); newpte = oldpde & ~PG_PS; newpte = pmap_swap_pat(pmap, newpte); /* * If the page table page is not leftover from an earlier promotion, * initialize it. */ if (mpte->valid == 0) pmap_fill_ptp(firstpte, newpte); pmap_demote_pde_check(firstpte, newpte); /* * If the mapping has changed attributes, update the page table * entries. */ if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) pmap_fill_ptp(firstpte, newpte); /* * The spare PV entries must be reserved prior to demoting the * mapping, that is, prior to changing the PDE. Otherwise, the state * of the PDE and the PV lists will be inconsistent, which can result * in reclaim_pv_chunk() attempting to remove a PV entry from the * wrong PV list and pmap_pv_demote_pde() failing to find the expected * PV entry for the 2MB page mapping that is being demoted. */ if ((oldpde & PG_MANAGED) != 0) reserve_pv_entries(pmap, NPTEPG - 1, lockp); /* * Demote the mapping. This pmap is locked. The old PDE has * PG_A set. If the old PDE has PG_RW set, it also has PG_M * set. Thus, there is no danger of a race with another * processor changing the setting of PG_A and/or PG_M between * the read above and the store below. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else pde_store(pde, newpde); /* * Invalidate a stale recursive mapping of the page table page. */ if (in_kernel) pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); /* * Demote the PV entry. */ if ((oldpde & PG_MANAGED) != 0) pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp); atomic_add_long(&pmap_pde_demotions, 1); CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx in pmap %p", va, pmap); return (TRUE); } /* * pmap_remove_kernel_pde: Remove a kernel superpage mapping. 
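 * The kernel PDE is not simply cleared: it is replaced with a mapping of
 * the page table page (zeroed first if it still holds stale entries), so
 * the paging structures for the kernel range stay intact even though the
 * individual 4KB mappings are gone.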
*/ static void pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; vm_paddr_t mptepa; vm_page_t mpte; KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte = pmap_remove_pt_page(pmap, va); if (mpte == NULL) panic("pmap_remove_kernel_pde: Missing pt page."); mptepa = VM_PAGE_TO_PHYS(mpte); newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V; /* * If this page table page was unmapped by a promotion, then it * contains valid mappings. Zero it to invalidate those mappings. */ if (mpte->valid != 0) pagezero((void *)PHYS_TO_DMAP(mptepa)); /* * Demote the mapping. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else pde_store(pde, newpde); /* * Invalidate a stale recursive mapping of the page table page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); } /* * pmap_remove_pde: do the things to unmap a superpage in a process */ static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pd_entry_t oldpde; vm_offset_t eva, va; vm_page_t m, mpte; pt_entry_t PG_G, PG_A, PG_M, PG_RW; PG_G = pmap_global_bit(pmap); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_remove_pde: sva is not 2mpage aligned")); oldpde = pte_load_clear(pdq); if (oldpde & PG_W) pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; if ((oldpde & PG_G) != 0) pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); if (oldpde & PG_MANAGED) { CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); pvh = pa_to_pvh(oldpde & PG_PS_FRAME); pmap_pvh_free(pvh, pmap, sva); eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) { if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpde & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); pmap_delayed_invl_page(m); } } if (pmap == kernel_pmap) { pmap_remove_kernel_pde(pmap, pdq, sva); } else { mpte = pmap_remove_pt_page(pmap, sva); if (mpte != NULL) { KASSERT(mpte->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pde: pte page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pde: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); } } return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t oldpte, PG_A, PG_M, PG_RW; vm_page_t m; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = pte_load_clear(ptq); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; pmap_resident_count_dec(pmap, 1); if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if 
(TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } pmap_delayed_invl_page(m); } return (pmap_unuse_pt(pmap, va, ptepde, free)); } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, struct spglist *free) { struct rwlock *lock; pt_entry_t *pte, PG_V; PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((*pde & PG_V) == 0) return; pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_V) == 0) return; lock = NULL; pmap_remove_pte(pmap, pte, va, *pde, free, &lock); if (lock != NULL) rw_wunlock(lock); pmap_invalidate_page(pmap, va); } /* * Removes the specified range of addresses from the page table page. */ static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pd_entry_t *pde, struct spglist *free, struct rwlock **lockp) { pt_entry_t PG_G, *pte; vm_offset_t va; bool anyvalid; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PG_G = pmap_global_bit(pmap); anyvalid = false; va = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++, sva += PAGE_SIZE) { if (*pte == 0) { if (va != eva) { pmap_invalidate_range(pmap, va, sva); va = eva; } continue; } if ((*pte & PG_G) == 0) anyvalid = true; else if (va == eva) va = sva; if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) { sva += PAGE_SIZE; break; } } if (va != eva) pmap_invalidate_range(pmap, va, sva); return (anyvalid); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct rwlock *lock; vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; pt_entry_t PG_G, PG_V; struct spglist free; int anyvalid; PG_G = pmap_global_bit(pmap); PG_V = pmap_valid_bit(pmap); /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = 0; SLIST_INIT(&free); pmap_delayed_invl_start(); PMAP_LOCK(pmap); pmap_pkru_on_remove(pmap, sva, eva); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if (sva + PAGE_SIZE == eva) { pde = pmap_pde(pmap, sva); if (pde && (*pde & PG_PS) == 0) { pmap_remove_page(pmap, sva, pde, &free); goto out; } } lock = NULL; for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) break; pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } /* * Calculate index for next page table. */ va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); ptpaddr = *pde; /* * Weed out invalid mappings. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we removing the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == va_next && eva >= va_next) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_remove_pde(). */ if ((ptpaddr & PG_G) == 0) anyvalid = 1; pmap_remove_pde(pmap, pde, sva, &free, &lock); continue; } else if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { /* The large page mapping was destroyed. 
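 * A failed demotion removes the 2MB mapping outright, via
 * pmap_demote_pde_abort(), so there is nothing left at this PDE and the
 * loop moves on to the next range.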
*/ continue; } else ptpaddr = *pde; } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock)) anyvalid = 1; } if (lock != NULL) rw_wunlock(lock); out: if (anyvalid) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); pmap_delayed_invl_finish(); vm_page_free_pages_toq(&free, true); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; struct rwlock *lock; pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW; pd_entry_t *pde; vm_offset_t va; struct spglist free; int pvh_gen, md_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry: rw_wlock(lock); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; pde = pmap_pde(pmap, va); (void)pmap_demote_pde_locked(pmap, pde, va, &lock); PMAP_UNLOCK(pmap); } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pmap_resident_count_dec(pmap, 1); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" " a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); tpte = pte_load_clear(pte); if (tpte & PG_W) pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. 
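 * The page is marked dirty only when the mapping was both writeable
 * (PG_RW) and modified (PG_M); the accessed bit was already folded into
 * PGA_REFERENCED just above.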
*/ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); pmap_invalidate_page(pmap, pv->pv_va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(lock); pmap_delayed_invl_wait(m); vm_page_free_pages_toq(&free, true); } /* * pmap_protect_pde: do the things to protect a 2mpage in a process */ static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) { pd_entry_t newpde, oldpde; vm_page_t m, mt; boolean_t anychanged; pt_entry_t PG_G, PG_M, PG_RW; PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_protect_pde: sva is not 2mpage aligned")); anychanged = FALSE; retry: oldpde = newpde = *pde; if ((prot & VM_PROT_WRITE) == 0) { if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } newpde &= ~(PG_RW | PG_M); } if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; if (newpde != oldpde) { /* * As an optimization to future operations on this PDE, clear * PG_PROMOTED. The impending invalidation will remove any * lingering 4KB page mappings from the TLB. */ if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED)) goto retry; if ((oldpde & PG_G) != 0) pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); else anychanged = TRUE; } return (anychanged); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V; boolean_t anychanged; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == (VM_PROT_WRITE|VM_PROT_EXECUTE)) return; PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); anychanged = FALSE; /* * Although this function delays and batches the invalidation * of stale TLB entries, it does not need to call * pmap_delayed_invl_start() and * pmap_delayed_invl_finish(), because it does not * ordinarily destroy mappings. Stale TLB entries from * protection-only changes need only be invalidated before the * pmap lock is released, because protection-only changes do * not destroy PV entries. Even operations that iterate over * a physical page's PV list of mappings, like * pmap_remove_write(), acquire the pmap lock for each * mapping. Consequently, for protection-only changes, the * pmap lock suffices to synchronize both page table and TLB * updates. * * This function only destroys a mapping if pmap_demote_pde() * fails. In that case, stale TLB entries are immediately * invalidated. 
*/ PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); ptpaddr = *pde; /* * Weed out invalid mappings. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we protecting the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == va_next && eva >= va_next) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_protect_pde(). */ if (pmap_protect_pde(pmap, pde, sva, prot)) anychanged = TRUE; continue; } else if (!pmap_demote_pde(pmap, pde, sva)) { /* * The large page mapping was destroyed. */ continue; } } if (va_next > eva) va_next = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { pt_entry_t obits, pbits; vm_page_t m; retry: obits = pbits = *pte; if ((pbits & PG_V) == 0) continue; if ((prot & VM_PROT_WRITE) == 0) { if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); } pbits &= ~(PG_RW | PG_M); } if ((prot & VM_PROT_EXECUTE) == 0) pbits |= pg_nx; if (pbits != obits) { if (!atomic_cmpset_long(pte, obits, pbits)) goto retry; if (obits & PG_G) pmap_invalidate_page(pmap, sva); else anychanged = TRUE; } } } if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); } #if VM_NRESERVLEVEL > 0 /* * Tries to promote the 512, contiguous 4KB page mappings that are within a * single page table page (PTP) to a single 2MB page mapping. For promotion * to occur, two conditions must be met: (1) the 4KB page mappings must map * aligned, contiguous physical memory and (2) the 4KB page mappings must have * identical characteristics. */ static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp) { pd_entry_t newpde; pt_entry_t *firstpte, oldpte, pa, *pte; pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V, PG_PKU_MASK; vm_page_t mpte; int PG_PTE_CACHE; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); PG_PKU_MASK = pmap_pku_mask_bit(pmap); PG_PTE_CACHE = pmap_cache_mask(pmap, 0); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Examine the first PTE in the specified PTP. Abort if this PTE is * either invalid, unused, or does not map the first 4KB physical page * within a 2MB page. */ firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); setpde: newpde = *firstpte; if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); return; } if ((newpde & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared without * a TLB invalidation. */ if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW)) goto setpde; newpde &= ~PG_RW; } /* * Examine each of the other PTEs in the specified PTP. Abort if this * PTE maps an unexpected 4KB physical page or does not have identical * characteristics to the first PTE. 
*/ pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { setpte: oldpte = *pte; if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); return; } if ((oldpte & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared * without a TLB invalidation. */ if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW)) goto setpte; oldpte &= ~PG_RW; CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" " in pmap %p", (oldpte & PG_FRAME & PDRMASK) | (va & ~PDRMASK), pmap); } if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); return; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the PDE * mapping the superpage is demoted by pmap_demote_pde() or * destroyed by pmap_remove_pde(). */ mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_pde: page table page is out of range")); KASSERT(mpte->pindex == pmap_pde_pindex(va), ("pmap_promote_pde: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, mpte, true)) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx in pmap %p", va, pmap); return; } /* * Promote the pv entries. */ if ((newpde & PG_MANAGED) != 0) pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp); /* * Propagate the PAT index to its proper position. */ newpde = pmap_swap_pat(pmap, newpde); /* * Map the superpage. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, PG_PS | newpde); else pde_store(pde, PG_PROMOTED | PG_PS | newpde); atomic_add_long(&pmap_pde_promotions, 1); CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" " in pmap %p", va, pmap); } #endif /* VM_NRESERVLEVEL > 0 */ /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. * * When destroying both a page table and PV entry, this function * performs the TLB invalidation before releasing the PV list * lock, so we do not need pmap_delayed_invl_page() calls here. 
*/ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { struct rwlock *lock; pd_entry_t *pde; pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V; pt_entry_t newpte, origpte; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; int rv; boolean_t nosleep; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); va = trunc_page(va); KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va)); KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); KASSERT((flags & PMAP_ENTER_RESERVED) == 0, ("pmap_enter: flags %u has reserved bits set", flags)); pa = VM_PAGE_TO_PHYS(m); newpte = (pt_entry_t)(pa | PG_A | PG_V); if ((flags & VM_PROT_WRITE) != 0) newpte |= PG_M; if ((prot & VM_PROT_WRITE) != 0) newpte |= PG_RW; KASSERT((newpte & (PG_M | PG_RW)) != PG_M, ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; if ((flags & PMAP_ENTER_WIRED) != 0) newpte |= PG_W; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= PG_G; newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0); /* * Set modified bit gratuitously for writeable mappings if * the page is unmanaged. We do not want to take a fault * to do the dirty bit accounting for these mappings. */ if ((m->oflags & VPO_UNMANAGED) != 0) { if ((newpte & PG_RW) != 0) newpte |= PG_M; } else newpte |= PG_MANAGED; lock = NULL; PMAP_LOCK(pmap); if (psind == 1) { /* Assert the required virtual and physical alignment. */ KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned")); KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock); goto out; } mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ retry: pde = pmap_pde(pmap, va); if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 || pmap_demote_pde_locked(pmap, pde, va, &lock))) { pte = pmap_pde_to_pte(pde, va); if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); mpte->wire_count++; } } else if (va < VM_MAXUSER_ADDRESS) { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), nosleep ? NULL : &lock); if (mpte == NULL && nosleep) { rv = KERN_RESOURCE_SHORTAGE; goto out; } goto retry; } else panic("pmap_enter: invalid page directory va=%#lx", va); origpte = *pte; pv = NULL; if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) newpte |= pmap_pkru_get(pmap, va); /* * Is the specified virtual address already mapped? */ if ((origpte & PG_V) != 0) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) pmap->pm_stats.wired_count++; else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) pmap->pm_stats.wired_count--; /* * Remove the extra PT page reference. 
*/ if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } /* * Has the physical page changed? */ opa = origpte & PG_FRAME; if (opa == pa) { /* * No, might be a protection or wiring change. */ if ((origpte & PG_MANAGED) != 0 && (newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) goto unchanged; goto validate; } /* * The physical page has changed. Temporarily invalidate * the mapping. This ensures that all threads sharing the * pmap keep a consistent view of the mapping, which is * necessary for the correct handling of COW faults. It * also permits reuse of the old mapping's PV entry, * avoiding an allocation. * * For consistency, handle unmanaged mappings the same way. */ origpte = pte_load_clear(pte); KASSERT((origpte & PG_FRAME) == opa, ("pmap_enter: unexpected pa update for %#lx", va)); if ((origpte & PG_MANAGED) != 0) { om = PHYS_TO_VM_PAGE(opa); /* * The pmap lock is sufficient to synchronize with * concurrent calls to pmap_page_test_mappings() and * pmap_ts_referenced(). */ if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(om); if ((origpte & PG_A) != 0) vm_page_aflag_set(om, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); pv = pmap_pvh_remove(&om->md, pmap, va); KASSERT(pv != NULL, ("pmap_enter: no PV entry for %#lx", va)); if ((newpte & PG_MANAGED) == 0) free_pv_entry(pmap, pv); if ((om->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); } if ((origpte & PG_A) != 0) pmap_invalidate_page(pmap, va); origpte = 0; } else { /* * Increment the counters. */ if ((newpte & PG_W) != 0) pmap->pm_stats.wired_count++; pmap_resident_count_inc(pmap, 1); } /* * Enter on the PV list if part of our managed memory. */ if ((newpte & PG_MANAGED) != 0) { if (pv == NULL) { pv = get_pv_entry(pmap, &lock); pv->pv_va = va; } CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } /* * Update the PTE. */ if ((origpte & PG_V) != 0) { validate: origpte = pte_load_store(pte, newpte); KASSERT((origpte & PG_FRAME) == pa, ("pmap_enter: unexpected pa update for %#lx", va)); if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((origpte & PG_MANAGED) != 0) vm_page_dirty(m); /* * Although the PTE may still have PG_RW set, TLB * invalidation may nonetheless be required because * the PTE no longer has PG_M set. */ } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { /* * This PTE change does not require TLB invalidation. */ goto unchanged; } if ((origpte & PG_A) != 0) pmap_invalidate_page(pmap, va); } else pte_store(pte, newpte); unchanged: #if VM_NRESERVLEVEL > 0 /* * If both the page table page and the reservation are fully * populated, then attempt promotion. */ if ((mpte == NULL || mpte->wire_count == NPTEPG) && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_pde(pmap, pde, va, &lock); #endif rv = KERN_SUCCESS; out: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (rv); } /* * Tries to create a read- and/or execute-only 2MB page mapping. Returns true * if successful. 
Returns false if (1) a page table page cannot be allocated * without sleeping, (2) a mapping already exists at the specified virtual * address, or (3) a PV entry cannot be allocated without reclaiming another * PV entry. */ static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp) { pd_entry_t newpde; pt_entry_t PG_V; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PG_V = pmap_valid_bit(pmap); newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) | PG_PS | PG_V; if ((m->oflags & VPO_UNMANAGED) == 0) newpde |= PG_MANAGED; if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; if (va < VM_MAXUSER_ADDRESS) newpde |= PG_U; return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == KERN_SUCCESS); } /* * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and * a mapping already exists at the specified virtual address. Returns * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. * * The parameter "m" is only used when creating a managed, writeable mapping. */ static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, vm_page_t m, struct rwlock **lockp) { struct spglist free; pd_entry_t oldpde, *pde; pt_entry_t PG_G, PG_RW, PG_V; vm_page_t mt, pdpg; KASSERT(pmap == kernel_pmap || (newpde & PG_W) == 0, ("pmap_enter_pde: cannot create wired user mapping")); PG_G = pmap_global_bit(pmap); PG_RW = pmap_rw_bit(pmap); KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW, ("pmap_enter_pde: newpde is missing PG_M")); PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pdpg = pmap_allocpde(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } /* * If pkru is not same for the whole pde range, return failure * and let vm_fault() cope. Check after pde allocation, since * it could sleep. */ if (!pmap_pkru_same(pmap, va, va + NBPDR)) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, pdpg, &free)) { pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } return (KERN_FAILURE); } if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) { newpde &= ~X86_PG_PKU_MASK; newpde |= pmap_pkru_get(pmap, va); } pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); pde = &pde[pmap_pde_index(va)]; oldpde = *pde; if ((oldpde & PG_V) != 0) { KASSERT(pdpg->wire_count > 1, ("pmap_enter_pde: pdpg's wire count is too low")); if ((flags & PMAP_ENTER_NOREPLACE) != 0) { pdpg->wire_count--; CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_FAILURE); } /* Break the existing mapping(s). */ SLIST_INIT(&free); if ((oldpde & PG_PS) != 0) { /* * The reference to the PD page that was acquired by * pmap_allocpde() ensures that it won't be freed. * However, if the PDE resulted from a promotion, then * a reserved PT page could be freed. 
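			 * Any page freed this way is collected on the "free"
			 * list initialized above and released once the old
			 * mapping has been torn down.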
*/ (void)pmap_remove_pde(pmap, pde, va, &free, lockp); if ((oldpde & PG_G) == 0) pmap_invalidate_pde_page(pmap, va, oldpde); } else { pmap_delayed_invl_start(); if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free, lockp)) pmap_invalidate_all(pmap); pmap_delayed_invl_finish(); } vm_page_free_pages_toq(&free, true); if (va >= VM_MAXUSER_ADDRESS) { /* * Both pmap_remove_pde() and pmap_remove_ptes() will * leave the kernel page table page zero filled. */ mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME); if (pmap_insert_pt_page(pmap, mt, false)) panic("pmap_enter_pde: trie insert failed"); } else KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p", pde)); } if ((newpde & PG_MANAGED) != 0) { /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, pdpg, &free)) { /* * Although "va" is not mapped, paging- * structure caches could nonetheless have * entries that refer to the freed page table * pages. Invalidate those entries. */ pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } if ((newpde & PG_RW) != 0) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_aflag_set(mt, PGA_WRITEABLE); } } /* * Increment counters. */ if ((newpde & PG_W) != 0) pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE; pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); /* * Map the superpage. (This is not a promoted mapping; there will not * be any lingering 4KB page mappings in the TLB.) */ pde_store(pde, newpde); atomic_add_long(&pmap_pde_mappings, 1); CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" " in pmap %p", va, pmap); return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { struct rwlock *lock; vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; lock = NULL; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & PDRMASK) == 0 && va + NBPDR <= end && m->psind == 1 && pmap_ps_enabled(pmap) && pmap_enter_2mpage(pmap, va, m, prot, &lock)) m = &m[NBPDR / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock); m = TAILQ_NEXT(m, listq); } if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... 
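 * If a mapping already exists at the given address, or if a PV entry
 * cannot be allocated without reclaiming another, the quick path simply
 * gives up and returns without establishing the mapping.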
*/ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { struct rwlock *lock; lock = NULL; PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) { struct spglist free; pt_entry_t newpte, *pte, PG_V; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { vm_pindex_t ptepindex; pd_entry_t *ptepa; /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); if (mpte && (mpte->pindex == ptepindex)) { mpte->wire_count++; } else { /* * Get the page directory entry */ ptepa = pmap_pde(pmap, va); /* * If the page table page is mapped, we just increment * the hold count, and activate it. Otherwise, we * attempt to allocate a page table page. If this * attempt fails, we don't retry. Instead, we give up. */ if (ptepa && (*ptepa & PG_V) != 0) { if (*ptepa & PG_PS) return (NULL); mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); mpte->wire_count++; } else { /* * Pass NULL instead of the PV list lock * pointer, because we don't intend to sleep. */ mpte = _pmap_allocpte(pmap, ptepindex, NULL); if (mpte == NULL) return (mpte); } } pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); pte = &pte[pmap_pte_index(va)]; } else { mpte = NULL; pte = vtopte(va); } if (*pte) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, mpte, &free)) { /* * Although "va" is not mapped, paging- * structure caches could nonetheless have * entries that refer to the freed page table * pages. Invalidate those entries. */ pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap_resident_count_inc(pmap, 1); newpte = VM_PAGE_TO_PHYS(m) | PG_V | pmap_cache_bits(pmap, m->md.pat_mode, 0); if ((m->oflags & VPO_UNMANAGED) == 0) newpte |= PG_MANAGED; if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U | pmap_pkru_get(pmap, va); pte_store(pte, newpte); return (mpte); } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); invlpg(va); return ((void *)crashdumpmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. 
*/ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { pd_entry_t *pde; pt_entry_t PG_A, PG_M, PG_RW, PG_V; vm_paddr_t pa, ptepa; vm_page_t p, pdpg; int pat_mode; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { if (!pmap_ps_enabled(pmap)) return; if (!vm_object_populate(object, pindex, pindex + atop(size))) return; p = vm_page_lookup(object, pindex); KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); pat_mode = p->md.pat_mode; /* * Abort the mapping if the first page is not physically * aligned to a 2MB page boundary. */ ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) return; /* * Skip the first page. Abort the mapping if the rest of * the pages are not physically contiguous or have differing * memory attributes. */ p = TAILQ_NEXT(p, listq); for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; pa += PAGE_SIZE) { KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); if (pa != VM_PAGE_TO_PHYS(p) || pat_mode != p->md.pat_mode) return; p = TAILQ_NEXT(p, listq); } /* * Map using 2MB pages. Since "ptepa" is 2M aligned and * "size" is a multiple of 2M, adding the PAT setting to "pa" * will not affect the termination of this loop. */ PMAP_LOCK(pmap); for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1); pa < ptepa + size; pa += NBPDR) { pdpg = pmap_allocpde(pmap, addr, NULL); if (pdpg == NULL) { /* * The creation of mappings below is only an * optimization. If a page directory page * cannot be allocated without blocking, * continue on to the next mapping rather than * blocking. */ addr += NBPDR; continue; } pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); pde = &pde[pmap_pde_index(addr)]; if ((*pde & PG_V) == 0) { pde_store(pde, pa | PG_PS | PG_M | PG_A | PG_U | PG_RW | PG_V); pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); atomic_add_long(&pmap_pde_mappings, 1); } else { /* Continue on if the PDE is already valid. */ pdpg->wire_count--; KASSERT(pdpg->wire_count > 0, ("pmap_object_init_pt: missing reference " "to page directory page, va: 0x%lx", addr)); } addr += NBPDR; } PMAP_UNLOCK(pmap); } } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware * feature, so there is no need to invalidate any TLB entries. * Since pmap_demote_pde() for the wired entry must never fail, * pmap_delayed_invl_start()/finish() calls around the * function are not needed. 
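 * A PDE or PTE in the range that is valid but lacks PG_W triggers a
 * panic, since it indicates the caller's wiring accounting is wrong.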
*/ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte, PG_V; PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if ((*pde & PG_V) == 0) continue; if ((*pde & PG_PS) != 0) { if ((*pde & PG_W) == 0) panic("pmap_unwire: pde %#jx is missing PG_W", (uintmax_t)*pde); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == va_next && eva >= va_next) { atomic_clear_long(pde, PG_W); pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; continue; } else if (!pmap_demote_pde(pmap, pde, sva)) panic("pmap_unwire: demotion failed"); } if (va_next > eva) va_next = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if ((*pte & PG_V) == 0) continue; if ((*pte & PG_W) == 0) panic("pmap_unwire: pte %#jx is missing PG_W", (uintmax_t)*pte); /* * PG_W must be cleared atomically. Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. */ atomic_clear_long(pte, PG_W); pmap->pm_stats.wired_count--; } } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct rwlock *lock; struct spglist free; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t *pde, srcptepaddr; pt_entry_t *dst_pte, PG_A, PG_M, PG_V, ptetemp, *src_pte; vm_offset_t addr, end_addr, va_next; vm_page_t dst_pdpg, dstmpte, srcmpte; if (dst_addr != src_addr) return; if (dst_pmap->pm_type != src_pmap->pm_type) return; /* * EPT page table entries that require emulation of A/D bits are * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit * (aka EPT_PG_EXECUTE) could still be set. Since some EPT * implementations flag an EPT misconfiguration for exec-only * mappings we skip this function entirely for emulated pmaps. 
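 * Skipping the copy is harmless: this routine is only an optimization,
 * and the destination pmap will simply fault its mappings in on first
 * access instead.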
*/ if (pmap_emulate_ad_bits(dst_pmap)) return; end_addr = src_addr + len; lock = NULL; if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } PG_A = pmap_accessed_bit(dst_pmap); PG_M = pmap_modified_bit(dst_pmap); PG_V = pmap_valid_bit(dst_pmap); for (addr = src_addr; addr < end_addr; addr = va_next) { KASSERT(addr < UPT_MIN_ADDRESS, ("pmap_copy: invalid to pmap_copy page tables")); pml4e = pmap_pml4e(src_pmap, addr); if ((*pml4e & PG_V) == 0) { va_next = (addr + NBPML4) & ~PML4MASK; if (va_next < addr) va_next = end_addr; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, addr); if ((*pdpe & PG_V) == 0) { va_next = (addr + NBPDP) & ~PDPMASK; if (va_next < addr) va_next = end_addr; continue; } va_next = (addr + NBPDR) & ~PDRMASK; if (va_next < addr) va_next = end_addr; pde = pmap_pdpe_to_pde(pdpe, addr); srcptepaddr = *pde; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) continue; dst_pdpg = pmap_allocpde(dst_pmap, addr, NULL); if (dst_pdpg == NULL) break; pde = (pd_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg)); pde = &pde[pmap_pde_index(addr)]; if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr, PMAP_ENTER_NORECLAIM, &lock))) { *pde = srcptepaddr & ~PG_W; pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE); atomic_add_long(&pmap_pde_mappings, 1); } else dst_pdpg->wire_count--; continue; } srcptepaddr &= PG_FRAME; srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); KASSERT(srcmpte->wire_count > 0, ("pmap_copy: source page table page is unused")); if (va_next > end_addr) va_next = end_addr; src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); src_pte = &src_pte[pmap_pte_index(addr)]; dstmpte = NULL; for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { ptetemp = *src_pte; /* * We only virtual copy managed pages. */ if ((ptetemp & PG_MANAGED) == 0) continue; if (dstmpte != NULL) { KASSERT(dstmpte->pindex == pmap_pde_pindex(addr), ("dstmpte pindex/addr mismatch")); dstmpte->wire_count++; } else if ((dstmpte = pmap_allocpte(dst_pmap, addr, NULL)) == NULL) goto out; dst_pte = (pt_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); dst_pte = &dst_pte[pmap_pte_index(addr)]; if (*dst_pte == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), &lock)) { /* * Clear the wired, modified, and accessed * (referenced) bits during the copy. */ *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); pmap_resident_count_inc(dst_pmap, 1); } else { SLIST_INIT(&free); if (pmap_unwire_ptp(dst_pmap, addr, dstmpte, &free)) { /* * Although "addr" is not mapped, * paging-structure caches could * nonetheless have entries that refer * to the freed page table pages. * Invalidate those entries. */ pmap_invalidate_page(dst_pmap, addr); vm_page_free_pages_toq(&free, true); } goto out; } /* Have we copied all of the valid mappings? */ if (dstmpte->wire_count >= srcmpte->wire_count) break; } } out: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } int pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap) { int error; if (dst_pmap->pm_type != src_pmap->pm_type || dst_pmap->pm_type != PT_X86 || (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) return (0); for (;;) { if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } error = pmap_pkru_copy(dst_pmap, src_pmap); /* Clean up partial copy on failure due to no memory. 
		 */
		if (error == ENOMEM)
			pmap_pkru_deassign_all(dst_pmap);
		PMAP_UNLOCK(src_pmap);
		PMAP_UNLOCK(dst_pmap);
		if (error != ENOMEM)
			break;
		vm_wait(NULL);
	}
	return (error);
}

/*
 * Zero the specified hardware page.
 */
void
pmap_zero_page(vm_page_t m)
{
	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));

	pagezero((void *)va);
}

/*
 * Zero an area within a single hardware page.  off and size must not
 * cover an area beyond a single hardware page.
 */
void
pmap_zero_page_area(vm_page_t m, int off, int size)
{
	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));

	if (off == 0 && size == PAGE_SIZE)
		pagezero((void *)va);
	else
		bzero((char *)va + off, size);
}

/*
 * Copy 1 specified hardware page to another.
 */
void
pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
{
	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));

	pagecopy((void *)src, (void *)dst);
}

int unmapped_buf_allowed = 1;

void
pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
    vm_offset_t b_offset, int xfersize)
{
	void *a_cp, *b_cp;
	vm_page_t pages[2];
	vm_offset_t vaddr[2], a_pg_offset, b_pg_offset;
	int cnt;
	boolean_t mapped;

	while (xfersize > 0) {
		a_pg_offset = a_offset & PAGE_MASK;
		pages[0] = ma[a_offset >> PAGE_SHIFT];
		b_pg_offset = b_offset & PAGE_MASK;
		pages[1] = mb[b_offset >> PAGE_SHIFT];
		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
		mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE);
		a_cp = (char *)vaddr[0] + a_pg_offset;
		b_cp = (char *)vaddr[1] + b_pg_offset;
		bcopy(a_cp, b_cp, cnt);
		if (__predict_false(mapped))
			pmap_unmap_io_transient(pages, vaddr, 2, FALSE);
		a_offset += cnt;
		b_offset += cnt;
		xfersize -= cnt;
	}
}

/*
 * Returns true if the pmap's pv is one of the first
 * 16 pvs linked to from this page.  This count may
 * be changed upwards or downwards in the future; it
 * is only necessary that true be returned for a small
 * subset of pmaps for proper page aging.
 */
boolean_t
pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
{
	struct md_page *pvh;
	struct rwlock *lock;
	pv_entry_t pv;
	int loops = 0;
	boolean_t rv;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_page_exists_quick: page %p is not managed", m));
	rv = FALSE;
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_rlock(lock);
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		if (PV_PMAP(pv) == pmap) {
			rv = TRUE;
			break;
		}
		loops++;
		if (loops >= 16)
			break;
	}
	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
			if (PV_PMAP(pv) == pmap) {
				rv = TRUE;
				break;
			}
			loops++;
			if (loops >= 16)
				break;
		}
	}
	rw_runlock(lock);
	return (rv);
}

/*
 * pmap_page_wired_mappings:
 *
 *	Return the number of managed mappings to the given physical page
 *	that are wired.
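 *	Both 4KB mappings, found on the page's own pv list, and 2MB
 *	mappings, found on the pv list of the containing superpage, are
 *	included in the count.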
*/ int pmap_page_wired_mappings(vm_page_t m) { struct rwlock *lock; struct md_page *pvh; pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; int count, md_gen, pvh_gen; if ((m->oflags & VPO_UNMANAGED) != 0) return (0); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: count = 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pde(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } } rw_runlock(lock); return (count); } /* * Returns TRUE if the given page is mapped individually or as part of * a 2mpage. Otherwise, returns FALSE. */ boolean_t pmap_page_is_mapped(vm_page_t m) { struct rwlock *lock; boolean_t rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); rw_runlock(lock); return (rv); } /* * Destroy all managed, non-wired mappings in the given user-space * pmap. This pmap cannot be active on any processor besides the * caller. * * This function cannot be applied to the kernel pmap. Moreover, it * is not intended for general use. It is only to be used during * process termination. Consequently, it can be implemented in ways * that make it faster than pmap_remove(). First, it can more quickly * destroy mappings by iterating over the pmap's collection of PV * entries, rather than searching the page table. Second, it doesn't * have to test and clear the page table entries atomically, because * no processor is currently accessing the user address space. In * particular, a page table entry's dirty bit won't change state once * this function starts. * * Although this function destroys all of the pmap's managed, * non-wired mappings, it can delay and batch the invalidation of TLB * entries without calling pmap_delayed_invl_start() and * pmap_delayed_invl_finish(). Because the pmap is not active on * any other processor, none of these TLB entries will ever be used * before their eventual invalidation. Consequently, there is no need * for either pmap_remove_all() or pmap_remove_write() to wait for * that eventual TLB invalidation. */ void pmap_remove_pages(pmap_t pmap) { pd_entry_t ptepde; pt_entry_t *pte, tpte; pt_entry_t PG_M, PG_RW, PG_V; struct spglist free; vm_page_t m, mpte, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct rwlock *lock; int64_t bit; uint64_t inuse, bitmask; int allfree, field, freed, idx; boolean_t superpage; vm_paddr_t pa; /* * Assert that the given pmap is only active on the current * CPU. Unfortunately, we cannot block another CPU from * activating the pmap while this function is executing. 
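	 * The INVARIANTS block below only verifies that no other CPU has
	 * this pmap in its pm_active set at this instant.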
*/ KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap)); #ifdef INVARIANTS { cpuset_t other_cpus; other_cpus = all_cpus; critical_enter(); CPU_CLR(PCPU_GET(cpuid), &other_cpus); CPU_AND(&other_cpus, &pmap->pm_active); critical_exit(); KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap)); } #endif lock = NULL; PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); SLIST_INIT(&free); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; freed = 0; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = bsfq(inuse); bitmask = 1UL << bit; idx = field * 64 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = pmap_pdpe(pmap, pv->pv_va); ptepde = *pte; pte = pmap_pdpe_to_pde(pte, pv->pv_va); tpte = *pte; if ((tpte & (PG_PS | PG_V)) == PG_V) { superpage = FALSE; ptepde = tpte; pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & PG_FRAME); pte = &pte[pmap_pte_index(pv->pv_va)]; tpte = *pte; } else { /* * Keep track whether 'tpte' is a * superpage explicitly instead of * relying on PG_PS being set. * * This is because PG_PS is numerically * identical to PG_PTE_PAT and thus a * regular page could be mistaken for * a superpage. */ superpage = TRUE; } if ((tpte & PG_V) == 0) { panic("bad pte va %lx pte %lx", pv->pv_va, tpte); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { allfree = 0; continue; } if (superpage) pa = tpte & PG_PS_FRAME; else pa = tpte & PG_FRAME; m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pte_clear(pte); /* * Update the vm_page_t clean/reference bits. 
*/ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (superpage) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } else vm_page_dirty(m); } CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); /* Mark free */ pc->pc_map[field] |= bitmask; if (superpage) { pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); pvh = pa_to_pvh(tpte & PG_PS_FRAME); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) if ((mt->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { KASSERT(mpte->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pages: pte page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pages: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, &free, FALSE); } } else { pmap_resident_count_dec(pmap, 1); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((m->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); freed++; } } PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } if (lock != NULL) rw_wunlock(lock); pmap_invalidate_all(pmap); pmap_pkru_deassign_all(pmap); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } static boolean_t pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) { struct rwlock *lock; pv_entry_t pv; struct md_page *pvh; pt_entry_t *pte, mask; pt_entry_t PG_A, PG_M, PG_RW, PG_V; pmap_t pmap; int md_gen, pvh_gen; boolean_t rv; rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va); mask = 0; if (modified) { PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); mask |= PG_RW | PG_M; } if (accessed) { PG_A = pmap_accessed_bit(pmap); PG_V = pmap_valid_bit(pmap); mask |= PG_V | PG_A; } rv = (*pte & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pde(pmap, pv->pv_va); mask = 0; if (modified) { PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); mask |= PG_RW | PG_M; } if (accessed) { PG_A = pmap_accessed_bit(pmap); PG_V = pmap_valid_bit(pmap); mask |= PG_V | PG_A; } rv = (*pte & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } } out: rw_runlock(lock); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. 
*/ boolean_t pmap_is_modified(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); return (pmap_page_test_mappings(m, FALSE, TRUE)); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; pt_entry_t *pte, PG_V; boolean_t rv; PG_V = pmap_valid_bit(pmap); rv = FALSE; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) { pte = pmap_pde_to_pte(pde, addr); rv = (*pte & PG_V) == 0; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); return (pmap_page_test_mappings(m, TRUE, FALSE)); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pmap_t pmap; struct rwlock *lock; pv_entry_t next_pv, pv; pd_entry_t *pde; pt_entry_t oldpte, *pte, PG_M, PG_RW; vm_offset_t va; int pvh_gen, md_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry_pv_loop: rw_wlock(lock); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_RW) != 0) (void)pmap_demote_pde_locked(pmap, pde, va, &lock); KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); retry: oldpte = *pte; if (oldpte & PG_RW) { if (!atomic_cmpset_long(pte, oldpte, oldpte & ~(PG_RW | PG_M))) goto retry; if ((oldpte & PG_M) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); vm_page_aflag_clear(m, PGA_WRITEABLE); pmap_delayed_invl_wait(m); } static __inline boolean_t safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) { if (!pmap_emulate_ad_bits(pmap)) return (TRUE); KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type)); /* * XWR = 010 or 110 will cause an unconditional EPT misconfiguration * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared * if the EPT_PG_WRITE bit is set. */ if ((pte & EPT_PG_WRITE) != 0) return (FALSE); /* * XWR = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set. */ if ((pte & EPT_PG_EXECUTE) == 0 || ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0)) return (TRUE); else return (FALSE); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). * * A DI block is not needed within this function, because * invalidations are performed before the PV list lock is * released. */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; struct rwlock *lock; pd_entry_t oldpde, *pde; pt_entry_t *pte, PG_A, PG_M, PG_RW; vm_offset_t va; vm_paddr_t pa; int cleared, md_gen, not_cleared, pvh_gen; struct spglist free; boolean_t demoted; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); SLIST_INIT(&free); cleared = 0; pa = VM_PAGE_TO_PHYS(m); lock = PHYS_TO_PV_LIST_LOCK(pa); pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : pa_to_pvh(pa); rw_wlock(lock); retry: not_cleared = 0; if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, pv->pv_va); oldpde = *pde; if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Although "oldpde" is mapping a 2MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((oldpde & PG_A) != 0) { /* * Since this reference bit is shared by 512 4KB * pages, it should not be cleared every time it is * tested. Apply a simple "hash" function on the * physical page number, the virtual superpage number, * and the pmap address to select one 4KB page out of * the 512 on which testing the reference bit will * result in clearing that reference bit. This * function is designed to avoid the selection of the * same 4KB page for every 2MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && (oldpde & PG_W) == 0) { if (safe_to_clear_referenced(pmap, oldpde)) { atomic_clear_long(pde, PG_A); pmap_invalidate_page(pmap, pv->pv_va); demoted = FALSE; } else if (pmap_demote_pde_locked(pmap, pde, pv->pv_va, &lock)) { /* * Remove the mapping to a single page * so that a subsequent access may * repromote. Since the underlying * page table page is fully populated, * this removal never frees a page * table page. */ demoted = TRUE; va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); pte = pmap_pde_to_pte(pde, va); pmap_remove_pte(pmap, pte, va, *pde, NULL, &lock); pmap_invalidate_page(pmap, va); } else demoted = TRUE; if (demoted) { /* * The superpage mapping was removed * entirely and therefore 'pv' is no * longer valid. */ if (pvf == pv) pvf = NULL; pv = NULL; } cleared++; KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. 
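		 * Moving the just-examined entry to the tail spreads
		 * successive calls across the page's mappings instead of
		 * always revisiting the head of the list.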
*/ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; } if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced: found a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((*pte & PG_A) != 0) { if (safe_to_clear_referenced(pmap, *pte)) { atomic_clear_long(pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); cleared++; } else if ((*pte & PG_W) == 0) { /* * Wired pages cannot be paged out so * doing accessed bit emulation for * them is wasted effort. We do the * hard work for unwired pages only. */ pmap_remove_pte(pmap, pte, pv->pv_va, *pde, &free, &lock); pmap_invalidate_page(pmap, pv->pv_va); cleared++; if (pvf == pv) pvf = NULL; pv = NULL; KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + not_cleared < PMAP_TS_REFERENCED_MAX); out: rw_wunlock(lock); vm_page_free_pages_toq(&free, true); return (cleared + not_cleared); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { struct rwlock *lock; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t oldpde, *pde; pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V; vm_offset_t va, va_next; vm_page_t m; bool anychanged; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; /* * A/D bit emulation requires an alternate code path when clearing * the modified and accessed bits below. Since this function is * advisory in nature we skip it entirely for pmaps that require * A/D bit emulation. 
*/ if (pmap_emulate_ad_bits(pmap)) return; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); anychanged = false; pmap_delayed_invl_start(); PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); oldpde = *pde; if ((oldpde & PG_V) == 0) continue; else if ((oldpde & PG_PS) != 0) { if ((oldpde & PG_MANAGED) == 0) continue; lock = NULL; if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { if (lock != NULL) rw_wunlock(lock); /* * The large page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Choosing the last page * within the address range [sva, min(va_next, eva)) * generally results in more repromotions. Since the * underlying page table page is fully populated, this * removal never frees a page table page. */ if ((oldpde & PG_W) == 0) { va = eva; if (va > va_next) va = va_next; va -= PAGE_SIZE; KASSERT(va >= sva, ("pmap_advise: no address gap")); pte = pmap_pde_to_pte(pde, va); KASSERT((*pte & PG_V) != 0, ("pmap_advise: invalid PTE")); pmap_remove_pte(pmap, pte, va, *pde, NULL, &lock); anychanged = true; } if (lock != NULL) rw_wunlock(lock); } if (va_next > eva) va_next = eva; va = va_next; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) goto maybe_invlrng; else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); vm_page_dirty(m); } atomic_clear_long(pte, PG_M | PG_A); } else if ((*pte & PG_A) != 0) atomic_clear_long(pte, PG_A); else goto maybe_invlrng; if ((*pte & PG_G) != 0) { if (va == va_next) va = sva; } else anychanged = true; continue; maybe_invlrng: if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); pmap_delayed_invl_finish(); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; pmap_t pmap; pv_entry_t next_pv, pv; pd_entry_t oldpde, *pde; pt_entry_t *pte, PG_M, PG_RW; struct rwlock *lock; vm_offset_t va; int md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->aflags & PGA_WRITEABLE) == 0) return; pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
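/* A fictitious page is never part of a 2MB mapping, so it has no pv_table entry; use the shared, always-empty dummy list head instead. */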
&pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_wlock(lock); restart: TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, va); oldpde = *pde; /* If oldpde has PG_RW set, then it also has PG_M set. */ if ((oldpde & PG_RW) != 0 && pmap_demote_pde_locked(pmap, pde, va, &lock) && (oldpde & PG_W) == 0) { /* * Write protect the mapping to a single page so that * a subsequent write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); pte = pmap_pde_to_pte(pde, va); atomic_clear_long(pte, PG_M | PG_RW); vm_page_dirty(m); pmap_invalidate_page(pmap, va); } PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" " a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { atomic_clear_long(pte, PG_M); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); } /* * Miscellaneous support routines follow */ /* Adjust the cache mode for a 4KB page mapped via a PTE. */ static __inline void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask) { u_int opte, npte; /* * The cache mode bits are all in the low 32-bits of the * PTE, so we can just spin on updating the low 32-bits. */ do { opte = *(u_int *)pte; npte = opte & ~mask; npte |= cache_bits; } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); } /* Adjust the cache mode for a 2MB page mapped via a PDE. */ static __inline void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask) { u_int opde, npde; /* * The cache mode bits are all in the low 32-bits of the * PDE, so we can just spin on updating the low 32-bits. */ do { opde = *(u_int *)pde; npde = opde & ~mask; npde |= cache_bits; } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ static void * pmap_mapdev_internal(vm_paddr_t pa, vm_size_t size, int mode, int flags) { struct pmap_preinit_mapping *ppim; vm_offset_t va, offset; vm_size_t tmpsize; int i; offset = pa & PAGE_MASK; size = round_page(offset + size); pa = trunc_page(pa); if (!pmap_initialized) { va = 0; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) { ppim->pa = pa; ppim->sz = size; ppim->mode = mode; ppim->va = virtual_avail; virtual_avail += size; va = ppim->va; break; } } if (va == 0) panic("%s: too many preinit mappings", __func__); } else { /* * If we have a preinit mapping, re-use it. 
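* A slot matches when it covers the same physical range and either its cache mode agrees or the caller did not ask for attributes to be set (MAPDEV_SETATTR clear).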
*/ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->pa == pa && ppim->sz == size && (ppim->mode == mode || (flags & MAPDEV_SETATTR) == 0)) return ((void *)(ppim->va + offset)); } /* * If the specified range of physical addresses fits within * the direct map window, use the direct map. */ if (pa < dmaplimit && pa + size <= dmaplimit) { va = PHYS_TO_DMAP(pa); if ((flags & MAPDEV_SETATTR) != 0) { PMAP_LOCK(kernel_pmap); i = pmap_change_attr_locked(va, size, mode, flags); PMAP_UNLOCK(kernel_pmap); } else i = 0; if (!i) return ((void *)(va + offset)); } va = kva_alloc(size); if (va == 0) panic("%s: Couldn't allocate KVA", __func__); } for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); pmap_invalidate_range(kernel_pmap, va, va + tmpsize); if ((flags & MAPDEV_FLUSHCACHE) != 0) pmap_invalidate_cache_range(va, va + tmpsize); return ((void *)(va + offset)); } void * pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) { return (pmap_mapdev_internal(pa, size, mode, MAPDEV_FLUSHCACHE | MAPDEV_SETATTR)); } void * pmap_mapdev(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); } void * pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_internal(pa, size, PAT_UNCACHEABLE, MAPDEV_SETATTR)); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_internal(pa, size, PAT_WRITE_BACK, MAPDEV_FLUSHCACHE)); } void pmap_unmapdev(vm_offset_t va, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t offset; int i; /* If we gave a direct map region in pmap_mapdev, do nothing */ if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) return; offset = va & PAGE_MASK; size = round_page(offset + size); va = trunc_page(va); for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == va && ppim->sz == size) { if (pmap_initialized) return; ppim->pa = 0; ppim->va = 0; ppim->sz = 0; ppim->mode = 0; if (va + size == virtual_avail) virtual_avail = va; return; } } if (pmap_initialized) kva_free(va, size); } /* * Tries to demote a 1GB page mapping. */ static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va) { pdp_entry_t newpdpe, oldpdpe; pd_entry_t *firstpde, newpde, *pde; pt_entry_t PG_A, PG_M, PG_RW, PG_V; vm_paddr_t pdpgpa; vm_page_t pdpg; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpdpe = *pdpe; KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); if ((pdpg = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } pdpgpa = VM_PAGE_TO_PHYS(pdpg); firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa); newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V; KASSERT((oldpdpe & PG_A) != 0, ("pmap_demote_pdpe: oldpdpe is missing PG_A")); KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pdpe: oldpdpe is missing PG_M")); newpde = oldpdpe; /* * Initialize the page directory page. */ for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { *pde = newpde; newpde += NBPDR; } /* * Demote the mapping. */ *pdpe = newpdpe; /* * Invalidate a stale recursive mapping of the page directory page. 
*/ pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va)); pmap_pdpe_demotions++; CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" " in pmap %p", va, pmap); return (TRUE); } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pat_mode = ma; /* * If "m" is a normal page, update its direct mapping. This update * can be relied upon to perform any cache operations that are * required for data coherence. */ if ((m->flags & PG_FICTITIOUS) == 0 && pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, m->md.pat_mode)) panic("memory attribute change on the direct map failed"); } /* * Changes the specified virtual address range's memory type to that given by * the parameter "mode". The specified virtual address range must be * completely contained within either the direct map or the kernel map. If * the virtual address range is contained within the kernel map, then the * memory type for each of the corresponding ranges of the direct map is also * changed. (The corresponding ranges of the direct map are those ranges that * map the same physical pages as the specified virtual address range.) These * changes to the direct map are necessary because Intel describes the * behavior of their processors as "undefined" if two or more mappings to the * same physical page have different memory types. * * Returns zero if the change completed successfully, and either EINVAL or * ENOMEM if the change failed. Specifically, EINVAL is returned if some part * of the virtual address range was not mapped, and ENOMEM is returned if * there was insufficient memory available to complete the change. In the * latter case, the memory type may have been changed on some part of the * virtual address range or the direct map. */ int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) { int error; PMAP_LOCK(kernel_pmap); error = pmap_change_attr_locked(va, size, mode, MAPDEV_FLUSHCACHE); PMAP_UNLOCK(kernel_pmap); return (error); } static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, int flags) { vm_offset_t base, offset, tmpva; vm_paddr_t pa_start, pa_end, pa_end1; pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte; int cache_bits_pte, cache_bits_pde, error; boolean_t changed; PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); /* * Only supported on kernel virtual addresses, including the direct * map but excluding the recursive map. */ if (base < DMAP_MIN_ADDRESS) return (EINVAL); cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1); cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0); changed = FALSE; /* * Pages that aren't mapped aren't supported. Also break down 2MB pages * into 4KB pages if required. */ for (tmpva = base; tmpva < base + size; ) { pdpe = pmap_pdpe(kernel_pmap, tmpva); if (pdpe == NULL || *pdpe == 0) return (EINVAL); if (*pdpe & PG_PS) { /* * If the current 1GB page already has the required * memory type, then we need not demote this page. Just * increment tmpva to the next 1GB page frame. */ if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_1gpage(tmpva) + NBPDP; continue; } /* * If the current offset aligns with a 1GB page frame * and there is at least 1GB left within the range, then * we need not break down this page into 2MB pages. 
*/ if ((tmpva & PDPMASK) == 0 && tmpva + PDPMASK < base + size) { tmpva += NBPDP; continue; } if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva)) return (ENOMEM); } pde = pmap_pdpe_to_pde(pdpe, tmpva); if (*pde == 0) return (EINVAL); if (*pde & PG_PS) { /* * If the current 2MB page already has the required * memory type, then we need not demote this page. Just * increment tmpva to the next 2MB page frame. */ if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_2mpage(tmpva) + NBPDR; continue; } /* * If the current offset aligns with a 2MB page frame * and there is at least 2MB left within the range, then * we need not break down this page into 4KB pages. */ if ((tmpva & PDRMASK) == 0 && tmpva + PDRMASK < base + size) { tmpva += NBPDR; continue; } if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) return (ENOMEM); } pte = pmap_pde_to_pte(pde, tmpva); if (*pte == 0) return (EINVAL); tmpva += PAGE_SIZE; } error = 0; /* * Ok, all the pages exist, so run through them updating their * cache mode if required. */ pa_start = pa_end = 0; for (tmpva = base; tmpva < base + size; ) { pdpe = pmap_pdpe(kernel_pmap, tmpva); if (*pdpe & PG_PS) { if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) { pmap_pde_attr(pdpe, cache_bits_pde, X86_PG_PDE_CACHE); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*pdpe & PG_PS_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *pdpe & PG_PS_FRAME; pa_end = pa_start + NBPDP; } else if (pa_end == (*pdpe & PG_PS_FRAME)) pa_end += NBPDP; else { /* Run ended, update direct map. */ error = pmap_change_attr_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, mode, flags); if (error != 0) break; /* Start physical address run. */ pa_start = *pdpe & PG_PS_FRAME; pa_end = pa_start + NBPDP; } } tmpva = trunc_1gpage(tmpva) + NBPDP; continue; } pde = pmap_pdpe_to_pde(pdpe, tmpva); if (*pde & PG_PS) { if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) { pmap_pde_attr(pde, cache_bits_pde, X86_PG_PDE_CACHE); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*pde & PG_PS_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *pde & PG_PS_FRAME; pa_end = pa_start + NBPDR; } else if (pa_end == (*pde & PG_PS_FRAME)) pa_end += NBPDR; else { /* Run ended, update direct map. */ error = pmap_change_attr_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, mode, flags); if (error != 0) break; /* Start physical address run. */ pa_start = *pde & PG_PS_FRAME; pa_end = pa_start + NBPDR; } } tmpva = trunc_2mpage(tmpva) + NBPDR; } else { pte = pmap_pde_to_pte(pde, tmpva); if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) { pmap_pte_attr(pte, cache_bits_pte, X86_PG_PTE_CACHE); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*pte & PG_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *pte & PG_FRAME; pa_end = pa_start + PAGE_SIZE; } else if (pa_end == (*pte & PG_FRAME)) pa_end += PAGE_SIZE; else { /* Run ended, update direct map. */ error = pmap_change_attr_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, mode, flags); if (error != 0) break; /* Start physical address run. 
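* Physically contiguous pages are coalesced into runs so that the direct map alias of each run is updated by a single recursive call rather than page by page.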
*/ pa_start = *pte & PG_FRAME; pa_end = pa_start + PAGE_SIZE; } } tmpva += PAGE_SIZE; } } if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) { pa_end1 = MIN(pa_end, dmaplimit); if (pa_start != pa_end1) error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start), pa_end1 - pa_start, mode, flags); } /* * Flush CPU caches if required to make sure any data isn't cached that * shouldn't be, etc. */ if (changed) { pmap_invalidate_range(kernel_pmap, base, tmpva); if ((flags & MAPDEV_FLUSHCACHE) != 0) pmap_invalidate_cache_range(base, tmpva); } return (error); } /* * Demotes any mapping within the direct map region that covers more than the * specified range of physical addresses. This range's size must be a power * of two and its starting address must be a multiple of its size. Since the * demotion does not change any attributes of the mapping, a TLB invalidation * is not mandatory. The caller may, however, request a TLB invalidation. */ void pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate) { pdp_entry_t *pdpe; pd_entry_t *pde; vm_offset_t va; boolean_t changed; if (len == 0) return; KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2")); KASSERT((base & (len - 1)) == 0, ("pmap_demote_DMAP: base is not a multiple of len")); if (len < NBPDP && base < dmaplimit) { va = PHYS_TO_DMAP(base); changed = FALSE; PMAP_LOCK(kernel_pmap); pdpe = pmap_pdpe(kernel_pmap, va); if ((*pdpe & X86_PG_V) == 0) panic("pmap_demote_DMAP: invalid PDPE"); if ((*pdpe & PG_PS) != 0) { if (!pmap_demote_pdpe(kernel_pmap, pdpe, va)) panic("pmap_demote_DMAP: PDPE failed"); changed = TRUE; } if (len < NBPDR) { pde = pmap_pdpe_to_pde(pdpe, va); if ((*pde & X86_PG_V) == 0) panic("pmap_demote_DMAP: invalid PDE"); if ((*pde & PG_PS) != 0) { if (!pmap_demote_pde(kernel_pmap, pde, va)) panic("pmap_demote_DMAP: PDE failed"); changed = TRUE; } } if (changed && invalidate) pmap_invalidate_page(kernel_pmap, va); PMAP_UNLOCK(kernel_pmap); } } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pd_entry_t *pdep; pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V; vm_paddr_t pa; int val; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK(pmap); retry: pdep = pmap_pde(pmap, addr); if (pdep != NULL && (*pdep & PG_V)) { if (*pdep & PG_PS) { pte = *pdep; /* Compute the physical address of the 4KB page. */ pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & PG_FRAME; val = MINCORE_SUPER; } else { pte = *pmap_pde_to_pte(pdep, addr); pa = pte & PG_FRAME; val = 0; } } else { pte = 0; pa = 0; val = 0; } if ((pte & PG_V) != 0) { val |= MINCORE_INCORE; if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((pte & PG_A) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } static uint64_t pmap_pcid_alloc(pmap_t pmap, u_int cpuid) { uint32_t gen, new_gen, pcid_next; CRITICAL_ASSERT(curthread); gen = PCPU_GET(pcid_gen); if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN) return (pti ? 
0 : CR3_PCID_SAVE); if (pmap->pm_pcids[cpuid].pm_gen == gen) return (CR3_PCID_SAVE); pcid_next = PCPU_GET(pcid_next); KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) || (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN), ("cpu %d pcid_next %#x", cpuid, pcid_next)); if ((!pti && pcid_next == PMAP_PCID_OVERMAX) || (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) { new_gen = gen + 1; if (new_gen == 0) new_gen = 1; PCPU_SET(pcid_gen, new_gen); pcid_next = PMAP_PCID_KERN + 1; } else { new_gen = gen; } pmap->pm_pcids[cpuid].pm_pcid = pcid_next; pmap->pm_pcids[cpuid].pm_gen = new_gen; PCPU_SET(pcid_next, pcid_next + 1); return (0); } static uint64_t pmap_pcid_alloc_checked(pmap_t pmap, u_int cpuid) { uint64_t cached; cached = pmap_pcid_alloc(pmap, cpuid); KASSERT(pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX, ("pmap %p cpu %d pcid %#x", pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN || pmap == kernel_pmap, ("non-kernel pmap pmap %p cpu %d pcid %#x", pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); return (cached); } static void pmap_activate_sw_pti_post(struct thread *td, pmap_t pmap) { PCPU_GET(tssp)->tss_rsp0 = pmap->pm_ucr3 != PMAP_NO_CR3 ? PCPU_GET(pti_rsp0) : (uintptr_t)td->td_pcb; } static void inline pmap_activate_sw_pcid_pti(pmap_t pmap, u_int cpuid, const bool invpcid_works1) { struct invpcid_descr d; uint64_t cached, cr3, kcr3, ucr3; cached = pmap_pcid_alloc_checked(pmap, cpuid); cr3 = rcr3(); if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid); PCPU_SET(curpmap, pmap); kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid; ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid | PMAP_PCID_USER_PT; if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) { /* * Explicitly invalidate translations cached from the * user page table. They are not automatically * flushed by reload of cr3 with the kernel page table * pointer above. * * Note that the if() condition is resolved statically * by using the function argument instead of * runtime-evaluated invpcid_works value. */ if (invpcid_works1) { d.pcid = PMAP_PCID_USER_PT | pmap->pm_pcids[cpuid].pm_pcid; d.pad = 0; d.addr = 0; invpcid(&d, INVPCID_CTX); } else { pmap_pti_pcid_invalidate(ucr3, kcr3); } } PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE); PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE); if (cached) PCPU_INC(pm_save_cnt); } static void pmap_activate_sw_pcid_invpcid_pti(struct thread *td, pmap_t pmap, u_int cpuid) { pmap_activate_sw_pcid_pti(pmap, cpuid, true); pmap_activate_sw_pti_post(td, pmap); } static void pmap_activate_sw_pcid_noinvpcid_pti(struct thread *td, pmap_t pmap, u_int cpuid) { register_t rflags; /* * If the INVPCID instruction is not available, * invltlb_pcid_handler() is used to handle an invalidate_all * IPI, which checks for curpmap == smp_tlb_pmap. The below * sequence of operations has a window where %CR3 is loaded * with the new pmap's PML4 address, but the curpmap value has * not yet been updated. This causes the invltlb IPI handler, * which is called between the updates, to execute as a NOP, * which leaves stale TLB entries. * * Note that the most typical use of pmap_activate_sw(), from * the context switch, is immune to this race, because * interrupts are disabled (while the thread lock is owned), * and the IPI happens after curpmap is updated. Protect * other callers in a similar way, by disabling interrupts * around the %cr3 register reload and curpmap assignment. 
*/ rflags = intr_disable(); pmap_activate_sw_pcid_pti(pmap, cpuid, false); intr_restore(rflags); pmap_activate_sw_pti_post(td, pmap); } static void pmap_activate_sw_pcid_nopti(struct thread *td __unused, pmap_t pmap, u_int cpuid) { uint64_t cached, cr3; cached = pmap_pcid_alloc_checked(pmap, cpuid); cr3 = rcr3(); if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid | cached); PCPU_SET(curpmap, pmap); if (cached) PCPU_INC(pm_save_cnt); } static void pmap_activate_sw_pcid_noinvpcid_nopti(struct thread *td __unused, pmap_t pmap, u_int cpuid) { register_t rflags; rflags = intr_disable(); pmap_activate_sw_pcid_nopti(td, pmap, cpuid); intr_restore(rflags); } static void pmap_activate_sw_nopcid_nopti(struct thread *td __unused, pmap_t pmap, u_int cpuid __unused) { load_cr3(pmap->pm_cr3); PCPU_SET(curpmap, pmap); } static void pmap_activate_sw_nopcid_pti(struct thread *td, pmap_t pmap, u_int cpuid __unused) { pmap_activate_sw_nopcid_nopti(td, pmap, cpuid); PCPU_SET(kcr3, pmap->pm_cr3); PCPU_SET(ucr3, pmap->pm_ucr3); pmap_activate_sw_pti_post(td, pmap); } DEFINE_IFUNC(static, void, pmap_activate_sw_mode, (struct thread *, pmap_t, u_int)) { if (pmap_pcid_enabled && pti && invpcid_works) return (pmap_activate_sw_pcid_invpcid_pti); else if (pmap_pcid_enabled && pti && !invpcid_works) return (pmap_activate_sw_pcid_noinvpcid_pti); else if (pmap_pcid_enabled && !pti && invpcid_works) return (pmap_activate_sw_pcid_nopti); else if (pmap_pcid_enabled && !pti && !invpcid_works) return (pmap_activate_sw_pcid_noinvpcid_nopti); else if (!pmap_pcid_enabled && pti) return (pmap_activate_sw_nopcid_pti); else /* if (!pmap_pcid_enabled && !pti) */ return (pmap_activate_sw_nopcid_nopti); } void pmap_activate_sw(struct thread *td) { pmap_t oldpmap, pmap; u_int cpuid; oldpmap = PCPU_GET(curpmap); pmap = vmspace_pmap(td->td_proc->p_vmspace); if (oldpmap == pmap) return; cpuid = PCPU_GET(cpuid); #ifdef SMP CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_SET(cpuid, &pmap->pm_active); #endif pmap_activate_sw_mode(td, pmap, cpuid); #ifdef SMP CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); #else CPU_CLR(cpuid, &oldpmap->pm_active); #endif } void pmap_activate(struct thread *td) { critical_enter(); pmap_activate_sw(td); critical_exit(); } void pmap_activate_boot(pmap_t pmap) { uint64_t kcr3; u_int cpuid; /* * kernel_pmap must be never deactivated, and we ensure that * by never activating it at all. */ MPASS(pmap != kernel_pmap); cpuid = PCPU_GET(cpuid); #ifdef SMP CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_SET(cpuid, &pmap->pm_active); #endif PCPU_SET(curpmap, pmap); if (pti) { kcr3 = pmap->pm_cr3; if (pmap_pcid_enabled) kcr3 |= pmap->pm_pcids[cpuid].pm_pcid | CR3_PCID_SAVE; } else { kcr3 = PMAP_NO_CR3; } PCPU_SET(kcr3, kcr3); PCPU_SET(ucr3, PMAP_NO_CR3); } void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. 
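* The address is changed only when the mapping is at least 2MB long and a fully aligned 2MB page would still fit; it is moved so that its offset within a 2MB page equals the superpage offset of the backing object (including its color).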
*/ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < NBPDR) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & PDRMASK; if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || (*addr & PDRMASK) == superpage_offset) return; if ((*addr & PDRMASK) < superpage_offset) *addr = (*addr & ~PDRMASK) + superpage_offset; else *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; } #ifdef INVARIANTS static unsigned long num_dirty_emulations; SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW, &num_dirty_emulations, 0, NULL); static unsigned long num_accessed_emulations; SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW, &num_accessed_emulations, 0, NULL); static unsigned long num_superpage_accessed_emulations; SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW, &num_superpage_accessed_emulations, 0, NULL); static unsigned long ad_emulation_superpage_promotions; SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW, &ad_emulation_superpage_promotions, 0, NULL); #endif /* INVARIANTS */ int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) { int rv; struct rwlock *lock; #if VM_NRESERVLEVEL > 0 vm_page_t m, mpte; #endif pd_entry_t *pde; pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V; KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE, ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype)); if (!pmap_emulate_ad_bits(pmap)) return (-1); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); rv = -1; lock = NULL; PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); if (pde == NULL || (*pde & PG_V) == 0) goto done; if ((*pde & PG_PS) != 0) { if (ftype == VM_PROT_READ) { #ifdef INVARIANTS atomic_add_long(&num_superpage_accessed_emulations, 1); #endif *pde |= PG_A; rv = 0; } goto done; } pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_V) == 0) goto done; if (ftype == VM_PROT_WRITE) { if ((*pte & PG_RW) == 0) goto done; /* * Set the modified and accessed bits simultaneously. * * Intel EPT PTEs that do software emulation of A/D bits map * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively. * An EPT misconfiguration is triggered if the PTE is writable * but not readable (WR=10). This is avoided by setting PG_A * and PG_M simultaneously. 
*/ *pte |= PG_M | PG_A; } else { *pte |= PG_A; } #if VM_NRESERVLEVEL > 0 /* try to promote the mapping */ if (va < VM_MAXUSER_ADDRESS) mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); else mpte = NULL; m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); if ((mpte == NULL || mpte->wire_count == NPTEPG) && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) { pmap_promote_pde(pmap, pde, va, &lock); #ifdef INVARIANTS atomic_add_long(&ad_emulation_superpage_promotions, 1); #endif } #endif #ifdef INVARIANTS if (ftype == VM_PROT_WRITE) atomic_add_long(&num_dirty_emulations, 1); else atomic_add_long(&num_accessed_emulations, 1); #endif rv = 0; /* success */ done: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (rv); } void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num) { pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pde; pt_entry_t *pte, PG_V; int idx; idx = 0; PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); pml4 = pmap_pml4e(pmap, va); ptr[idx++] = *pml4; if ((*pml4 & PG_V) == 0) goto done; pdp = pmap_pml4e_to_pdpe(pml4, va); ptr[idx++] = *pdp; if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) goto done; pde = pmap_pdpe_to_pde(pdp, va); ptr[idx++] = *pde; if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) goto done; pte = pmap_pde_to_pte(pde, va); ptr[idx++] = *pte; done: PMAP_UNLOCK(pmap); *num = idx; } /** * Get the kernel virtual address of a set of physical pages. If there are * physical addresses not covered by the DMAP perform a transient mapping * that will be removed when calling pmap_unmap_io_transient. * * \param page The pages the caller wishes to obtain the virtual * address on the kernel memory map. * \param vaddr On return contains the kernel virtual memory address * of the pages passed in the page parameter. * \param count Number of pages passed in. * \param can_fault TRUE if the thread using the mapped pages can take * page faults, FALSE otherwise. * * \returns TRUE if the caller must call pmap_unmap_io_transient when * finished or FALSE otherwise. * */ boolean_t pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; boolean_t needs_mapping; pt_entry_t *pte; int cache_bits, error __unused, i; /* * Allocate any KVA space that we need, this is done in a separate * loop to prevent calling vmem_alloc while pinned. */ needs_mapping = FALSE; for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (__predict_false(paddr >= dmaplimit)) { error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr[i]); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); needs_mapping = TRUE; } else { vaddr[i] = PHYS_TO_DMAP(paddr); } } /* Exit early if everything is covered by the DMAP */ if (!needs_mapping) return (FALSE); /* * NB: The sequence of updating a page table followed by accesses * to the corresponding pages used in the !DMAP case is subject to * the situation described in the "AMD64 Architecture Programmer's * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special * Coherency Considerations". Therefore, issuing the INVLPG right * after modifying the PTE bits is crucial. */ if (!can_fault) sched_pin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (paddr >= dmaplimit) { if (can_fault) { /* * Slow path, since we can get page faults * while mappings are active don't pin the * thread to the CPU and instead add a global * mapping visible to all CPUs. 
*/ pmap_qenter(vaddr[i], &page[i], 1); } else { pte = vtopte(vaddr[i]); cache_bits = pmap_cache_bits(kernel_pmap, page[i]->md.pat_mode, 0); pte_store(pte, paddr | X86_PG_RW | X86_PG_V | cache_bits); invlpg(vaddr[i]); } } } return (needs_mapping); } void pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; int i; if (!can_fault) sched_unpin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (paddr >= dmaplimit) { if (can_fault) pmap_qremove(vaddr[i], 1); vmem_free(kernel_arena, vaddr[i], PAGE_SIZE); } } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { vm_paddr_t paddr; paddr = VM_PAGE_TO_PHYS(m); if (paddr < dmaplimit) return (PHYS_TO_DMAP(paddr)); mtx_lock_spin(&qframe_mtx); KASSERT(*vtopte(qframe) == 0, ("qframe busy")); pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0)); return (qframe); } void pmap_quick_remove_page(vm_offset_t addr) { if (addr != qframe) return; pte_store(vtopte(qframe), 0); invlpg(qframe); mtx_unlock_spin(&qframe_mtx); } /* * Pdp pages from the large map are managed differently from either * kernel or user page table pages. They are permanently allocated at * initialization time, and their wire count is permanently set to * zero. The pml4 entries pointing to those pages are copied into * each allocated pmap. * * In contrast, pd and pt pages are managed like user page table * pages. They are dynamically allocated, and their wire count * represents the number of valid entries within the page. */ static vm_page_t pmap_large_map_getptp_unlocked(void) { vm_page_t m; m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_ZERO); if (m != NULL && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); return (m); } static vm_page_t pmap_large_map_getptp(void) { vm_page_t m; PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); m = pmap_large_map_getptp_unlocked(); if (m == NULL) { PMAP_UNLOCK(kernel_pmap); vm_wait(NULL); PMAP_LOCK(kernel_pmap); /* Callers retry. 
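* The kernel pmap lock was dropped around vm_wait(), so pmap_large_map_pde() and pmap_large_map_pte() re-read the page table entry before retrying the allocation.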
*/ } return (m); } static pdp_entry_t * pmap_large_map_pdpe(vm_offset_t va) { vm_pindex_t pml4_idx; vm_paddr_t mphys; pml4_idx = pmap_pml4e_index(va); KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents, ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I " "%#jx lm_ents %d", (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); KASSERT((kernel_pmap->pm_pml4[pml4_idx] & X86_PG_V) != 0, ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx " "LMSPML4I %#jx lm_ents %d", (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); mphys = kernel_pmap->pm_pml4[pml4_idx] & PG_FRAME; return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va)); } static pd_entry_t * pmap_large_map_pde(vm_offset_t va) { pdp_entry_t *pdpe; vm_page_t m; vm_paddr_t mphys; retry: pdpe = pmap_large_map_pdpe(va); if (*pdpe == 0) { m = pmap_large_map_getptp(); if (m == NULL) goto retry; mphys = VM_PAGE_TO_PHYS(m); *pdpe = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; } else { MPASS((*pdpe & X86_PG_PS) == 0); mphys = *pdpe & PG_FRAME; } return ((pd_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pde_index(va)); } static pt_entry_t * pmap_large_map_pte(vm_offset_t va) { pd_entry_t *pde; vm_page_t m; vm_paddr_t mphys; retry: pde = pmap_large_map_pde(va); if (*pde == 0) { m = pmap_large_map_getptp(); if (m == NULL) goto retry; mphys = VM_PAGE_TO_PHYS(m); *pde = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->wire_count++; } else { MPASS((*pde & X86_PG_PS) == 0); mphys = *pde & PG_FRAME; } return ((pt_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pte_index(va)); } static vm_paddr_t pmap_large_map_kextract(vm_offset_t va) { pdp_entry_t *pdpe, pdp; pd_entry_t *pde, pd; pt_entry_t *pte, pt; KASSERT(PMAP_ADDRESS_IN_LARGEMAP(va), ("not largemap range %#lx", (u_long)va)); pdpe = pmap_large_map_pdpe(va); pdp = *pdpe; KASSERT((pdp & X86_PG_V) != 0, ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); if ((pdp & X86_PG_PS) != 0) { KASSERT((amd_feature & AMDID_PAGE1GB) != 0, ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); return ((pdp & PG_PS_PDP_FRAME) | (va & PDPMASK)); } pde = pmap_pdpe_to_pde(pdpe, va); pd = *pde; KASSERT((pd & X86_PG_V) != 0, ("invalid pd va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd)); if ((pd & X86_PG_PS) != 0) return ((pd & PG_PS_FRAME) | (va & PDRMASK)); pte = pmap_pde_to_pte(pde, va); pt = *pte; KASSERT((pt & X86_PG_V) != 0, ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, pt)); return ((pt & PG_FRAME) | (va & PAGE_MASK)); } static int pmap_large_map_getva(vm_size_t len, vm_offset_t align, vm_offset_t phase, vmem_addr_t *vmem_res) { /* * Large mappings are all but static. Consequently, there * is no point in waiting for an earlier allocation to be * freed. */ return (vmem_xalloc(large_vmem, len, align, phase, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, M_NOWAIT | M_BESTFIT, vmem_res)); } int pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr, vm_memattr_t mattr) { pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte; vm_offset_t va, inc; vmem_addr_t vmem_res; vm_paddr_t pa; int error; if (len == 0 || spa + len < spa) return (EINVAL); /* See if DMAP can serve. */ if (spa + len <= dmaplimit) { va = PHYS_TO_DMAP(spa); *addr = (void *)va; return (pmap_change_attr(va, len, mattr)); } /* * No, allocate KVA. Fit the address with best possible * alignment for superpages. Fall back to worse align if * failed. 
*/ error = ENOMEM; if ((amd_feature & AMDID_PAGE1GB) != 0 && rounddown2(spa + len, NBPDP) >= roundup2(spa, NBPDP) + NBPDP) error = pmap_large_map_getva(len, NBPDP, spa & PDPMASK, &vmem_res); if (error != 0 && rounddown2(spa + len, NBPDR) >= roundup2(spa, NBPDR) + NBPDR) error = pmap_large_map_getva(len, NBPDR, spa & PDRMASK, &vmem_res); if (error != 0) error = pmap_large_map_getva(len, PAGE_SIZE, 0, &vmem_res); if (error != 0) return (error); /* * Fill pagetable. PG_M is not pre-set, we scan modified bits * in the pagetable to minimize flushing. No need to * invalidate TLB, since we only update invalid entries. */ PMAP_LOCK(kernel_pmap); for (pa = spa, va = vmem_res; len > 0; pa += inc, va += inc, len -= inc) { if ((amd_feature & AMDID_PAGE1GB) != 0 && len >= NBPDP && (pa & PDPMASK) == 0 && (va & PDPMASK) == 0) { pdpe = pmap_large_map_pdpe(va); MPASS(*pdpe == 0); *pdpe = pa | pg_g | X86_PG_PS | X86_PG_RW | X86_PG_V | X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, mattr, TRUE); inc = NBPDP; } else if (len >= NBPDR && (pa & PDRMASK) == 0 && (va & PDRMASK) == 0) { pde = pmap_large_map_pde(va); MPASS(*pde == 0); *pde = pa | pg_g | X86_PG_PS | X86_PG_RW | X86_PG_V | X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, mattr, TRUE); PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))-> wire_count++; inc = NBPDR; } else { pte = pmap_large_map_pte(va); MPASS(*pte == 0); *pte = pa | pg_g | X86_PG_RW | X86_PG_V | X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, mattr, FALSE); PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte))-> wire_count++; inc = PAGE_SIZE; } } PMAP_UNLOCK(kernel_pmap); MPASS(len == 0); *addr = (void *)vmem_res; return (0); } void pmap_large_unmap(void *svaa, vm_size_t len) { vm_offset_t sva, va; vm_size_t inc; pdp_entry_t *pdpe, pdp; pd_entry_t *pde, pd; pt_entry_t *pte; vm_page_t m; struct spglist spgf; sva = (vm_offset_t)svaa; if (len == 0 || sva + len < sva || (sva >= DMAP_MIN_ADDRESS && sva + len <= DMAP_MIN_ADDRESS + dmaplimit)) return; SLIST_INIT(&spgf); KASSERT(PMAP_ADDRESS_IN_LARGEMAP(sva) && PMAP_ADDRESS_IN_LARGEMAP(sva + len - 1), ("not largemap range %#lx %#lx", (u_long)svaa, (u_long)svaa + len)); PMAP_LOCK(kernel_pmap); for (va = sva; va < sva + len; va += inc) { pdpe = pmap_large_map_pdpe(va); pdp = *pdpe; KASSERT((pdp & X86_PG_V) != 0, ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); if ((pdp & X86_PG_PS) != 0) { KASSERT((amd_feature & AMDID_PAGE1GB) != 0, ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); KASSERT((va & PDPMASK) == 0, ("PDPMASK bit set, va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); KASSERT(va + NBPDP <= sva + len, ("unmap covers partial 1GB page, sva %#lx va %#lx " "pdpe %#lx pdp %#lx len %#lx", sva, va, (u_long)pdpe, pdp, len)); *pdpe = 0; inc = NBPDP; continue; } pde = pmap_pdpe_to_pde(pdpe, va); pd = *pde; KASSERT((pd & X86_PG_V) != 0, ("invalid pd va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd)); if ((pd & X86_PG_PS) != 0) { KASSERT((va & PDRMASK) == 0, ("PDRMASK bit set, va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd)); KASSERT(va + NBPDR <= sva + len, ("unmap covers partial 2MB page, sva %#lx va %#lx " "pde %#lx pd %#lx len %#lx", sva, va, (u_long)pde, pd, len)); pde_store(pde, 0); inc = NBPDR; m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); m->wire_count--; if (m->wire_count == 0) { *pdpe = 0; SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); } continue; } pte = pmap_pde_to_pte(pde, va); KASSERT((*pte & X86_PG_V) != 0, ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, *pte)); pte_clear(pte); inc = PAGE_SIZE; m 
= PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pte)); m->wire_count--; if (m->wire_count == 0) { *pde = 0; SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); m->wire_count--; if (m->wire_count == 0) { *pdpe = 0; SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); } } } pmap_invalidate_range(kernel_pmap, sva, sva + len); PMAP_UNLOCK(kernel_pmap); vm_page_free_pages_toq(&spgf, false); vmem_free(large_vmem, sva, len); } static void pmap_large_map_wb_fence_mfence(void) { mfence(); } static void pmap_large_map_wb_fence_sfence(void) { sfence(); } static void pmap_large_map_wb_fence_nop(void) { } DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void)) { if (cpu_vendor_id != CPU_VENDOR_INTEL) return (pmap_large_map_wb_fence_mfence); else if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB | CPUID_STDEXT_CLFLUSHOPT)) == 0) return (pmap_large_map_wb_fence_sfence); else /* clflush is strongly enough ordered */ return (pmap_large_map_wb_fence_nop); } static void pmap_large_map_flush_range_clwb(vm_offset_t va, vm_size_t len) { for (; len > 0; len -= cpu_clflush_line_size, va += cpu_clflush_line_size) clwb(va); } static void pmap_large_map_flush_range_clflushopt(vm_offset_t va, vm_size_t len) { for (; len > 0; len -= cpu_clflush_line_size, va += cpu_clflush_line_size) clflushopt(va); } static void pmap_large_map_flush_range_clflush(vm_offset_t va, vm_size_t len) { for (; len > 0; len -= cpu_clflush_line_size, va += cpu_clflush_line_size) clflush(va); } static void pmap_large_map_flush_range_nop(vm_offset_t sva __unused, vm_size_t len __unused) { } DEFINE_IFUNC(static, void, pmap_large_map_flush_range, (vm_offset_t, vm_size_t)) { if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) != 0) return (pmap_large_map_flush_range_clwb); else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) return (pmap_large_map_flush_range_clflushopt); else if ((cpu_feature & CPUID_CLFSH) != 0) return (pmap_large_map_flush_range_clflush); else return (pmap_large_map_flush_range_nop); } static void pmap_large_map_wb_large(vm_offset_t sva, vm_offset_t eva) { volatile u_long *pe; u_long p; vm_offset_t va; vm_size_t inc; bool seen_other; for (va = sva; va < eva; va += inc) { inc = 0; if ((amd_feature & AMDID_PAGE1GB) != 0) { pe = (volatile u_long *)pmap_large_map_pdpe(va); p = *pe; if ((p & X86_PG_PS) != 0) inc = NBPDP; } if (inc == 0) { pe = (volatile u_long *)pmap_large_map_pde(va); p = *pe; if ((p & X86_PG_PS) != 0) inc = NBPDR; } if (inc == 0) { pe = (volatile u_long *)pmap_large_map_pte(va); p = *pe; inc = PAGE_SIZE; } seen_other = false; for (;;) { if ((p & X86_PG_AVAIL1) != 0) { /* * Spin-wait for the end of a parallel * write-back. */ cpu_spinwait(); p = *pe; /* * If we saw other write-back * occuring, we cannot rely on PG_M to * indicate state of the cache. The * PG_M bit is cleared before the * flush to avoid ignoring new writes, * and writes which are relevant for * us might happen after. */ seen_other = true; continue; } if ((p & X86_PG_M) != 0 || seen_other) { if (!atomic_fcmpset_long(pe, &p, (p & ~X86_PG_M) | X86_PG_AVAIL1)) /* * If we saw PG_M without * PG_AVAIL1, and then on the * next attempt we do not * observe either PG_M or * PG_AVAIL1, the other * write-back started after us * and finished before us. We * can rely on it doing our * work. */ continue; pmap_large_map_flush_range(va, inc); atomic_clear_long(pe, X86_PG_AVAIL1); } break; } maybe_yield(); } } /* * Write-back cache lines for the given address range. 
* * Must be called only on the range or sub-range returned from * pmap_large_map(). Must not be called on the coalesced ranges. * * Does nothing on CPUs without CLWB, CLFLUSHOPT, or CLFLUSH * instructions support. */ void pmap_large_map_wb(void *svap, vm_size_t len) { vm_offset_t eva, sva; sva = (vm_offset_t)svap; eva = sva + len; pmap_large_map_wb_fence(); if (sva >= DMAP_MIN_ADDRESS && eva <= DMAP_MIN_ADDRESS + dmaplimit) { pmap_large_map_flush_range(sva, len); } else { KASSERT(sva >= LARGEMAP_MIN_ADDRESS && eva <= LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4, ("pmap_large_map_wb: not largemap %#lx %#lx", sva, len)); pmap_large_map_wb_large(sva, eva); } pmap_large_map_wb_fence(); } static vm_page_t pmap_pti_alloc_page(void) { vm_page_t m; VM_OBJECT_ASSERT_WLOCKED(pti_obj); m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_NOBUSY | VM_ALLOC_WIRED | VM_ALLOC_ZERO); return (m); } static bool pmap_pti_free_page(vm_page_t m) { KASSERT(m->wire_count > 0, ("page %p not wired", m)); if (!vm_page_unwire_noq(m)) return (false); vm_page_free_zero(m); return (true); } static void pmap_pti_init(void) { vm_page_t pml4_pg; pdp_entry_t *pdpe; vm_offset_t va; int i; if (!pti) return; pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL); VM_OBJECT_WLOCK(pti_obj); pml4_pg = pmap_pti_alloc_page(); pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg)); for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS && va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) { pdpe = pmap_pti_pdpe(va); pmap_pti_wire_pte(pdpe); } pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0], (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false); pmap_pti_add_kva_locked((vm_offset_t)gdt, (vm_offset_t)gdt + sizeof(struct user_segment_descriptor) * NGDT * MAXCPU, false); pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt + sizeof(struct gate_descriptor) * NIDT, false); pmap_pti_add_kva_locked((vm_offset_t)common_tss, (vm_offset_t)common_tss + sizeof(struct amd64tss) * MAXCPU, false); CPU_FOREACH(i) { /* Doublefault stack IST 1 */ va = common_tss[i].tss_ist1; pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); /* NMI stack IST 2 */ va = common_tss[i].tss_ist2 + sizeof(struct nmi_pcpu); pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); /* MC# stack IST 3 */ va = common_tss[i].tss_ist3 + sizeof(struct nmi_pcpu); pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); /* DB# stack IST 4 */ va = common_tss[i].tss_ist4 + sizeof(struct nmi_pcpu); pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); } pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE, (vm_offset_t)etext, true); pti_finalized = true; VM_OBJECT_WUNLOCK(pti_obj); } SYSINIT(pmap_pti, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_pti_init, NULL); static pdp_entry_t * pmap_pti_pdpe(vm_offset_t va) { pml4_entry_t *pml4e; pdp_entry_t *pdpe; vm_page_t m; vm_pindex_t pml4_idx; vm_paddr_t mphys; VM_OBJECT_ASSERT_WLOCKED(pti_obj); pml4_idx = pmap_pml4e_index(va); pml4e = &pti_pml4[pml4_idx]; m = NULL; if (*pml4e == 0) { if (pti_finalized) panic("pml4 alloc after finalization\n"); m = pmap_pti_alloc_page(); if (*pml4e != 0) { pmap_pti_free_page(m); mphys = *pml4e & ~PAGE_MASK; } else { mphys = VM_PAGE_TO_PHYS(m); *pml4e = mphys | X86_PG_RW | X86_PG_V; } } else { mphys = *pml4e & ~PAGE_MASK; } pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va); return (pdpe); } static void pmap_pti_wire_pte(void *pte) { vm_page_t m; VM_OBJECT_ASSERT_WLOCKED(pti_obj); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); m->wire_count++; } static void 
pmap_pti_unwire_pde(void *pde, bool only_ref) { vm_page_t m; VM_OBJECT_ASSERT_WLOCKED(pti_obj); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde)); MPASS(m->wire_count > 0); MPASS(only_ref || m->wire_count > 1); pmap_pti_free_page(m); } static void pmap_pti_unwire_pte(void *pte, vm_offset_t va) { vm_page_t m; pd_entry_t *pde; VM_OBJECT_ASSERT_WLOCKED(pti_obj); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); MPASS(m->wire_count > 0); if (pmap_pti_free_page(m)) { pde = pmap_pti_pde(va); MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V); *pde = 0; pmap_pti_unwire_pde(pde, false); } } static pd_entry_t * pmap_pti_pde(vm_offset_t va) { pdp_entry_t *pdpe; pd_entry_t *pde; vm_page_t m; vm_pindex_t pd_idx; vm_paddr_t mphys; VM_OBJECT_ASSERT_WLOCKED(pti_obj); pdpe = pmap_pti_pdpe(va); if (*pdpe == 0) { m = pmap_pti_alloc_page(); if (*pdpe != 0) { pmap_pti_free_page(m); MPASS((*pdpe & X86_PG_PS) == 0); mphys = *pdpe & ~PAGE_MASK; } else { mphys = VM_PAGE_TO_PHYS(m); *pdpe = mphys | X86_PG_RW | X86_PG_V; } } else { MPASS((*pdpe & X86_PG_PS) == 0); mphys = *pdpe & ~PAGE_MASK; } pde = (pd_entry_t *)PHYS_TO_DMAP(mphys); pd_idx = pmap_pde_index(va); pde += pd_idx; return (pde); } static pt_entry_t * pmap_pti_pte(vm_offset_t va, bool *unwire_pde) { pd_entry_t *pde; pt_entry_t *pte; vm_page_t m; vm_paddr_t mphys; VM_OBJECT_ASSERT_WLOCKED(pti_obj); pde = pmap_pti_pde(va); if (unwire_pde != NULL) { *unwire_pde = true; pmap_pti_wire_pte(pde); } if (*pde == 0) { m = pmap_pti_alloc_page(); if (*pde != 0) { pmap_pti_free_page(m); MPASS((*pde & X86_PG_PS) == 0); mphys = *pde & ~(PAGE_MASK | pg_nx); } else { mphys = VM_PAGE_TO_PHYS(m); *pde = mphys | X86_PG_RW | X86_PG_V; if (unwire_pde != NULL) *unwire_pde = false; } } else { MPASS((*pde & X86_PG_PS) == 0); mphys = *pde & ~(PAGE_MASK | pg_nx); } pte = (pt_entry_t *)PHYS_TO_DMAP(mphys); pte += pmap_pte_index(va); return (pte); } static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec) { vm_paddr_t pa; pd_entry_t *pde; pt_entry_t *pte, ptev; bool unwire_pde; VM_OBJECT_ASSERT_WLOCKED(pti_obj); sva = trunc_page(sva); MPASS(sva > VM_MAXUSER_ADDRESS); eva = round_page(eva); MPASS(sva < eva); for (; sva < eva; sva += PAGE_SIZE) { pte = pmap_pti_pte(sva, &unwire_pde); pa = pmap_kextract(sva); ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_G | (exec ? 
0 : pg_nx) | pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); if (*pte == 0) { pte_store(pte, ptev); pmap_pti_wire_pte(pte); } else { KASSERT(!pti_finalized, ("pti overlap after fin %#lx %#lx %#lx", sva, *pte, ptev)); KASSERT(*pte == ptev, ("pti non-identical pte after fin %#lx %#lx %#lx", sva, *pte, ptev)); } if (unwire_pde) { pde = pmap_pti_pde(sva); pmap_pti_unwire_pde(pde, true); } } } void pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec) { if (!pti) return; VM_OBJECT_WLOCK(pti_obj); pmap_pti_add_kva_locked(sva, eva, exec); VM_OBJECT_WUNLOCK(pti_obj); } void pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva) { pt_entry_t *pte; vm_offset_t va; if (!pti) return; sva = rounddown2(sva, PAGE_SIZE); MPASS(sva > VM_MAXUSER_ADDRESS); eva = roundup2(eva, PAGE_SIZE); MPASS(sva < eva); VM_OBJECT_WLOCK(pti_obj); for (va = sva; va < eva; va += PAGE_SIZE) { pte = pmap_pti_pte(va, NULL); KASSERT((*pte & X86_PG_V) != 0, ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, *pte)); pte_clear(pte); pmap_pti_unwire_pte(pte, va); } pmap_invalidate_range(kernel_pmap, sva, eva); VM_OBJECT_WUNLOCK(pti_obj); } static void * pkru_dup_range(void *ctx __unused, void *data) { struct pmap_pkru_range *node, *new_node; new_node = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); if (new_node == NULL) return (NULL); node = data; memcpy(new_node, node, sizeof(*node)); return (new_node); } static void pkru_free_range(void *ctx __unused, void *node) { uma_zfree(pmap_pkru_ranges_zone, node); } static int pmap_pkru_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, int flags) { struct pmap_pkru_range *ppr; int error; PMAP_LOCK_ASSERT(pmap, MA_OWNED); MPASS(pmap->pm_type == PT_X86); MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); if ((flags & AMD64_PKRU_EXCL) != 0 && !rangeset_check_empty(&pmap->pm_pkru, sva, eva)) return (EBUSY); ppr = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); if (ppr == NULL) return (ENOMEM); ppr->pkru_keyidx = keyidx; ppr->pkru_flags = flags & AMD64_PKRU_PERSIST; error = rangeset_insert(&pmap->pm_pkru, sva, eva, ppr); if (error != 0) uma_zfree(pmap_pkru_ranges_zone, ppr); return (error); } static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); MPASS(pmap->pm_type == PT_X86); MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); return (rangeset_remove(&pmap->pm_pkru, sva, eva)); } static void pmap_pkru_deassign_all(pmap_t pmap) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pmap->pm_type == PT_X86 && (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) rangeset_remove_all(&pmap->pm_pkru); } static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct pmap_pkru_range *ppr, *prev_ppr; vm_offset_t va; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pmap->pm_type != PT_X86 || (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || sva >= VM_MAXUSER_ADDRESS) return (true); MPASS(eva <= VM_MAXUSER_ADDRESS); for (va = sva, prev_ppr = NULL; va < eva;) { ppr = rangeset_lookup(&pmap->pm_pkru, va); if ((ppr == NULL) ^ (prev_ppr == NULL)) return (false); if (ppr == NULL) { va += PAGE_SIZE; continue; } if (prev_ppr->pkru_keyidx != ppr->pkru_keyidx) return (false); va = ppr->pkru_rs_el.re_end; } return (true); } static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va) { struct pmap_pkru_range *ppr; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pmap->pm_type != PT_X86 || (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || va >= VM_MAXUSER_ADDRESS) return (0); ppr = rangeset_lookup(&pmap->pm_pkru, va); if (ppr != NULL) 
return (X86_PG_PKU(ppr->pkru_keyidx)); return (0); } static bool pred_pkru_on_remove(void *ctx __unused, void *r) { struct pmap_pkru_range *ppr; ppr = r; return ((ppr->pkru_flags & AMD64_PKRU_PERSIST) == 0); } static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pmap->pm_type == PT_X86 && (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { rangeset_remove_pred(&pmap->pm_pkru, sva, eva, pred_pkru_on_remove); } } static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap) { PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED); PMAP_LOCK_ASSERT(src_pmap, MA_OWNED); MPASS(dst_pmap->pm_type == PT_X86); MPASS(src_pmap->pm_type == PT_X86); MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); if (src_pmap->pm_pkru.rs_data_ctx == NULL) return (0); return (rangeset_copy(&dst_pmap->pm_pkru, &src_pmap->pm_pkru)); } static void pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx) { pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t newpde, ptpaddr, *pde; pt_entry_t newpte, *ptep, pte; vm_offset_t va, va_next; bool changed; PMAP_LOCK_ASSERT(pmap, MA_OWNED); MPASS(pmap->pm_type == PT_X86); MPASS(keyidx <= PMAP_MAX_PKRU_IDX); for (changed = false, va = sva; va < eva; va = va_next) { pml4e = pmap_pml4e(pmap, va); if ((*pml4e & X86_PG_V) == 0) { va_next = (va + NBPML4) & ~PML4MASK; if (va_next < va) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, va); if ((*pdpe & X86_PG_V) == 0) { va_next = (va + NBPDP) & ~PDPMASK; if (va_next < va) va_next = eva; continue; } va_next = (va + NBPDR) & ~PDRMASK; if (va_next < va) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, va); ptpaddr = *pde; if (ptpaddr == 0) continue; MPASS((ptpaddr & X86_PG_V) != 0); if ((ptpaddr & PG_PS) != 0) { if (va + NBPDR == va_next && eva >= va_next) { newpde = (ptpaddr & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx); if (newpde != ptpaddr) { *pde = newpde; changed = true; } continue; } else if (!pmap_demote_pde(pmap, pde, va)) { continue; } } if (va_next > eva) va_next = eva; for (ptep = pmap_pde_to_pte(pde, va); va != va_next; ptep++, va += PAGE_SIZE) { pte = *ptep; if ((pte & X86_PG_V) == 0) continue; newpte = (pte & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx); if (newpte != pte) { *ptep = newpte; changed = true; } } } if (changed) pmap_invalidate_range(pmap, sva, eva); } static int pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, int flags) { if (pmap->pm_type != PT_X86 || keyidx > PMAP_MAX_PKRU_IDX || (flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0) return (EINVAL); if (eva <= sva || eva > VM_MAXUSER_ADDRESS) return (EFAULT); if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) return (ENOTSUP); return (0); } int pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, int flags) { int error; sva = trunc_page(sva); eva = round_page(eva); error = pmap_pkru_check_uargs(pmap, sva, eva, keyidx, flags); if (error != 0) return (error); for (;;) { PMAP_LOCK(pmap); error = pmap_pkru_assign(pmap, sva, eva, keyidx, flags); if (error == 0) pmap_pkru_update_range(pmap, sva, eva, keyidx); PMAP_UNLOCK(pmap); if (error != ENOMEM) break; vm_wait(NULL); } return (error); } int pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { int error; sva = trunc_page(sva); eva = round_page(eva); error = pmap_pkru_check_uargs(pmap, sva, eva, 0, 0); if (error != 0) return (error); for (;;) { PMAP_LOCK(pmap); error = pmap_pkru_deassign(pmap, sva, eva); if (error == 0) pmap_pkru_update_range(pmap, sva, eva, 0); 
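/* If the rangeset update failed with ENOMEM, drop the pmap lock, wait for free pages, and retry; any other result ends the loop. */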
PMAP_UNLOCK(pmap); if (error != ENOMEM) break; vm_wait(NULL); } return (error); } /* * Track a range of the kernel's virtual address space that is contiguous * in various mapping attributes. */ struct pmap_kernel_map_range { vm_offset_t sva; pt_entry_t attrs; int ptes; int pdes; int pdpes; }; static void sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, vm_offset_t eva) { const char *mode; int i, pat_idx; if (eva <= range->sva) return; pat_idx = pmap_pat_index(kernel_pmap, range->attrs, true); for (i = 0; i < PAT_INDEX_SIZE; i++) if (pat_index[i] == pat_idx) break; switch (i) { case PAT_WRITE_BACK: mode = "WB"; break; case PAT_WRITE_THROUGH: mode = "WT"; break; case PAT_UNCACHEABLE: mode = "UC"; break; case PAT_WRITE_PROTECTED: mode = "WP"; break; case PAT_WRITE_COMBINING: mode = "WC"; break; default: - printf("%s: unknown PAT mode %#x for range %#016lx-%#016lx\n", + printf("%s: unknown PAT mode %#x for range 0x%016lx-0x%016lx\n", __func__, i, range->sva, eva); mode = "??"; break; } - sbuf_printf(sb, "%#016lx-%#016lx r%c%c%c%c %s %d %d %d\n", + sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %s %d %d %d\n", range->sva, eva, (range->attrs & X86_PG_RW) != 0 ? 'w' : '-', (range->attrs & pg_nx) != 0 ? '-' : 'x', (range->attrs & X86_PG_U) != 0 ? 'u' : 's', (range->attrs & X86_PG_G) != 0 ? 'g' : '-', mode, range->pdpes, range->pdes, range->ptes); /* Reset to sentinel value. */ range->sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1); } /* * Determine whether the attributes specified by a page table entry match those * being tracked by the current range. This is not quite as simple as a direct * flag comparison since some PAT modes have multiple representations. */ static bool sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) { pt_entry_t diff, mask; mask = X86_PG_G | X86_PG_RW | X86_PG_U | X86_PG_PDE_CACHE | pg_nx; diff = (range->attrs ^ attrs) & mask; if (diff == 0) return (true); if ((diff & ~X86_PG_PDE_PAT) == 0 && pmap_pat_index(kernel_pmap, range->attrs, true) == pmap_pat_index(kernel_pmap, attrs, true)) return (true); return (false); } static void sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, pt_entry_t attrs) { memset(range, 0, sizeof(*range)); range->sva = va; range->attrs = attrs; } /* * Given a leaf PTE, derive the mapping's attributes. If they do not match * those of the current run, dump the address range and its attributes, and * begin a new run. */ static void sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, vm_offset_t va, pml4_entry_t pml4e, pdp_entry_t pdpe, pd_entry_t pde, pt_entry_t pte) { pt_entry_t attrs; attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx); attrs |= pdpe & pg_nx; attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U)); if ((pdpe & PG_PS) != 0) { attrs |= pdpe & (X86_PG_G | X86_PG_PDE_CACHE); } else if (pde != 0) { attrs |= pde & pg_nx; attrs &= pg_nx | (pde & (X86_PG_RW | X86_PG_U)); } if ((pde & PG_PS) != 0) { attrs |= pde & (X86_PG_G | X86_PG_PDE_CACHE); } else if (pte != 0) { attrs |= pte & pg_nx; attrs &= pg_nx | (pte & (X86_PG_RW | X86_PG_U)); attrs |= pte & (X86_PG_G | X86_PG_PTE_CACHE); /* Canonicalize by always using the PDE PAT bit. 
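 * A 4KB PTE encodes the PAT selector at bit 7 (X86_PG_PTE_PAT) while PDE and PDPE entries encode it at bit 12 (X86_PG_PDE_PAT); the XOR below clears the PTE position and sets the PDE position so that sysctl_kmaps_match() can compare attributes gathered from different levels directly.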
*/ if ((attrs & X86_PG_PTE_PAT) != 0) attrs ^= X86_PG_PDE_PAT | X86_PG_PTE_PAT; } if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { sysctl_kmaps_dump(sb, range, va); sysctl_kmaps_reinit(range, va, attrs); } } static int sysctl_kmaps(SYSCTL_HANDLER_ARGS) { struct pmap_kernel_map_range range; struct sbuf sbuf, *sb; pml4_entry_t pml4e; pdp_entry_t *pdp, pdpe; pd_entry_t *pd, pde; pt_entry_t *pt, pte; vm_offset_t sva; vm_paddr_t pa; int error, i, j, k, l; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = &sbuf; sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); /* Sentinel value. */ range.sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1); /* * Iterate over the kernel page tables without holding the kernel pmap * lock. Outside of the large map, kernel page table pages are never * freed, so at worst we will observe inconsistencies in the output. * Within the large map, ensure that PDP and PD page addresses are * valid before descending. */ for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) { switch (i) { case PML4PML4I: sbuf_printf(sb, "\nRecursive map:\n"); break; case DMPML4I: sbuf_printf(sb, "\nDirect map:\n"); break; case KPML4BASE: sbuf_printf(sb, "\nKernel map:\n"); break; case LMSPML4I: sbuf_printf(sb, "\nLarge map:\n"); break; } /* Convert to canonical form. */ if (sva == 1ul << 47) sva |= -1ul << 48; restart: pml4e = kernel_pmap->pm_pml4[i]; if ((pml4e & X86_PG_V) == 0) { sva = rounddown2(sva, NBPML4); sysctl_kmaps_dump(sb, &range, sva); sva += NBPML4; continue; } pa = pml4e & PG_FRAME; pdp = (pdp_entry_t *)PHYS_TO_DMAP(pa); for (j = pmap_pdpe_index(sva); j < NPDPEPG; j++) { pdpe = pdp[j]; if ((pdpe & X86_PG_V) == 0) { sva = rounddown2(sva, NBPDP); sysctl_kmaps_dump(sb, &range, sva); sva += NBPDP; continue; } pa = pdpe & PG_FRAME; if (PMAP_ADDRESS_IN_LARGEMAP(sva) && vm_phys_paddr_to_vm_page(pa) == NULL) goto restart; if ((pdpe & PG_PS) != 0) { sva = rounddown2(sva, NBPDP); sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe, 0, 0); range.pdpes++; sva += NBPDP; continue; } pd = (pd_entry_t *)PHYS_TO_DMAP(pa); for (k = pmap_pde_index(sva); k < NPDEPG; k++) { pde = pd[k]; if ((pde & X86_PG_V) == 0) { sva = rounddown2(sva, NBPDR); sysctl_kmaps_dump(sb, &range, sva); sva += NBPDR; continue; } pa = pde & PG_FRAME; if (PMAP_ADDRESS_IN_LARGEMAP(sva) && vm_phys_paddr_to_vm_page(pa) == NULL) goto restart; if ((pde & PG_PS) != 0) { sva = rounddown2(sva, NBPDR); sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe, pde, 0); range.pdes++; sva += NBPDR; continue; } pt = (pt_entry_t *)PHYS_TO_DMAP(pa); for (l = pmap_pte_index(sva); l < NPTEPG; l++, sva += PAGE_SIZE) { pte = pt[l]; if ((pte & X86_PG_V) == 0) { sysctl_kmaps_dump(sb, &range, sva); continue; } sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe, pde, pte); range.ptes++; } } } } error = sbuf_finish(sb); sbuf_delete(sb); return (error); } SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_kmaps, "A", "Dump kernel address layout"); #ifdef DDB DB_SHOW_COMMAND(pte, pmap_print_pte) { pmap_t pmap; pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pde; pt_entry_t *pte, PG_V; vm_offset_t va; if (!have_addr) { db_printf("show pte addr\n"); return; } va = (vm_offset_t)addr; if (kdb_thread != NULL) pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace); else pmap = PCPU_GET(curpmap); PG_V = pmap_valid_bit(pmap); pml4 = pmap_pml4e(pmap, va); - db_printf("VA %#016lx pml4e %#016lx", va, *pml4); + db_printf("VA 0x%016lx pml4e 0x%016lx", va, *pml4); if ((*pml4 
& PG_V) == 0) { db_printf("\n"); return; } pdp = pmap_pml4e_to_pdpe(pml4, va); - db_printf(" pdpe %#016lx", *pdp); + db_printf(" pdpe 0x%016lx", *pdp); if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) { db_printf("\n"); return; } pde = pmap_pdpe_to_pde(pdp, va); - db_printf(" pde %#016lx", *pde); + db_printf(" pde 0x%016lx", *pde); if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) { db_printf("\n"); return; } pte = pmap_pde_to_pte(pde, va); - db_printf(" pte %#016lx\n", *pte); + db_printf(" pte 0x%016lx\n", *pte); } DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap) { vm_paddr_t a; if (have_addr) { a = (vm_paddr_t)addr; db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a)); } else { db_printf("show phys2dmap addr\n"); } } #endif Index: projects/clang900-import/sys/arm64/arm64/pmap.c =================================================================== --- projects/clang900-import/sys/arm64/arm64/pmap.c (revision 352586) +++ projects/clang900-import/sys/arm64/arm64/pmap.c (revision 352587) @@ -1,5941 +1,5951 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * Copyright (c) 2014 Andrew Turner * All rights reserved. * Copyright (c) 2014-2016 The FreeBSD Foundation * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * This software was developed by Andrew Turner under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. 
* All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define NL0PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL1PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL2PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL3PG (PAGE_SIZE/(sizeof (pt_entry_t))) #define NUL0E L0_ENTRIES #define NUL1E (NUL0E * NL1PG) #define NUL2E (NUL1E * NL2PG) #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif /* * These are configured by the mair_el1 register. 
This is set up in locore.S */ #define DEVICE_MEMORY 0 #define UNCACHED_MEMORY 1 #define CACHED_MEMORY 2 #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pmap_l2_pindex(v) ((v) >> L2_SHIFT) #define pa_to_pvh(pa) (&pv_table[pmap_l2_pindex(pa)]) #define NPV_LIST_LOCKS MAXCPU #define PHYS_TO_PV_LIST_LOCK(pa) \ (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ struct rwlock **_lockp = (lockp); \ struct rwlock *_new_lock; \ \ _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ if (_new_lock != *_lockp) { \ if (*_lockp != NULL) \ rw_wunlock(*_lockp); \ *_lockp = _new_lock; \ rw_wlock(*_lockp); \ } \ } while (0) #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) #define RELEASE_PV_LIST_LOCK(lockp) do { \ struct rwlock **_lockp = (lockp); \ \ if (*_lockp != NULL) { \ rw_wunlock(*_lockp); \ *_lockp = NULL; \ } \ } while (0) #define VM_PAGE_TO_PV_LIST_LOCK(m) \ PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) /* * The presence of this flag indicates that the mapping is writeable. * If the ATTR_AP_RO bit is also set, then the mapping is clean, otherwise it is * dirty. This flag may only be set on managed mappings. * * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it * as a software managed bit. */ #define ATTR_SW_DBM ATTR_DBM struct pmap kernel_pmap_store; /* Used for mapping ACPI memory before VM is initialized */ #define PMAP_PREINIT_MAPPING_COUNT 32 #define PMAP_PREINIT_MAPPING_SIZE (PMAP_PREINIT_MAPPING_COUNT * L2_SIZE) static vm_offset_t preinit_map_va; /* Start VA of pre-init mapping space */ static int vm_initialized = 0; /* No need to use pre-init maps when set */ /* * Reserve a few L2 blocks starting from 'preinit_map_va' pointer. * Always map entire L2 block for simplicity. * VA of L2 block = preinit_map_va + i * L2_SIZE */ static struct pmap_preinit_mapping { vm_paddr_t pa; vm_offset_t va; vm_size_t size; } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ vm_offset_t kernel_vm_end = 0; /* * Data for the pv entry allocation mechanism. */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static struct mtx pv_chunks_mutex; static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; static struct md_page *pv_table; static struct md_page pv_dummy; vm_paddr_t dmap_phys_base; /* The start of the dmap region */ vm_paddr_t dmap_phys_max; /* The limit of the dmap region */ vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */ /* This code assumes all L1 DMAP entries will be used */ CTASSERT((DMAP_MIN_ADDRESS & ~L0_OFFSET) == DMAP_MIN_ADDRESS); CTASSERT((DMAP_MAX_ADDRESS & ~L0_OFFSET) == DMAP_MAX_ADDRESS); #define DMAP_TABLES ((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT) extern pt_entry_t pagetable_dmap[]; #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) static vm_paddr_t physmap[PHYSMAP_SIZE]; static u_int physmap_idx; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int superpages_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0, "Are large page mappings enabled?"); /* * Internal flags for pmap_enter()'s helper functions. */ #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. 
*/ #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode); static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va); static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, struct rwlock **lockp); static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, vm_page_t m, struct rwlock **lockp); static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pd_entry_t l1e, struct spglist *free, struct rwlock **lockp); static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, pd_entry_t l2e, struct spglist *free, struct rwlock **lockp); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp); static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); /* * These load the old table data and store the new value. * They need to be atomic as the System MMU may write to the table at * the same time as the CPU. 
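 * pmap_load_clear() and pmap_load_store() use an atomic swap so the caller observes exactly the entry that was live at the instant of the update, including any bits a concurrent table walker may have set.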
*/ #define pmap_clear(table) atomic_store_64(table, 0) #define pmap_clear_bits(table, bits) atomic_clear_64(table, bits) #define pmap_load(table) (*table) #define pmap_load_clear(table) atomic_swap_64(table, 0) #define pmap_load_store(table, entry) atomic_swap_64(table, entry) #define pmap_set_bits(table, bits) atomic_set_64(table, bits) #define pmap_store(table, entry) atomic_store_64(table, entry) /********************/ /* Inline functions */ /********************/ static __inline void pagecopy(void *s, void *d) { memcpy(d, s, PAGE_SIZE); } static __inline pd_entry_t * pmap_l0(pmap_t pmap, vm_offset_t va) { return (&pmap->pm_l0[pmap_l0_index(va)]); } static __inline pd_entry_t * pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va) { pd_entry_t *l1; l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); return (&l1[pmap_l1_index(va)]); } static __inline pd_entry_t * pmap_l1(pmap_t pmap, vm_offset_t va) { pd_entry_t *l0; l0 = pmap_l0(pmap, va); if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE) return (NULL); return (pmap_l0_to_l1(l0, va)); } static __inline pd_entry_t * pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va) { pd_entry_t *l2; l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK); return (&l2[pmap_l2_index(va)]); } static __inline pd_entry_t * pmap_l2(pmap_t pmap, vm_offset_t va) { pd_entry_t *l1; l1 = pmap_l1(pmap, va); if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE) return (NULL); return (pmap_l1_to_l2(l1, va)); } static __inline pt_entry_t * pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va) { pt_entry_t *l3; l3 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l2) & ~ATTR_MASK); return (&l3[pmap_l3_index(va)]); } /* * Returns the lowest valid pde for a given virtual address. * The next level may or may not point to a valid page or block. */ static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va, int *level) { pd_entry_t *l0, *l1, *l2, desc; l0 = pmap_l0(pmap, va); desc = pmap_load(l0) & ATTR_DESCR_MASK; if (desc != L0_TABLE) { *level = -1; return (NULL); } l1 = pmap_l0_to_l1(l0, va); desc = pmap_load(l1) & ATTR_DESCR_MASK; if (desc != L1_TABLE) { *level = 0; return (l0); } l2 = pmap_l1_to_l2(l1, va); desc = pmap_load(l2) & ATTR_DESCR_MASK; if (desc != L2_TABLE) { *level = 1; return (l1); } *level = 2; return (l2); } /* * Returns the lowest valid pte block or table entry for a given virtual * address. If there are no valid entries return NULL and set the level to * the first invalid level. 
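 * On success *level is 1 or 2 for a block (superpage) entry and 3 for a 4KB page entry, e.g. pte = pmap_pte(pmap, va, &lvl) followed by a check of lvl, as pmap_extract() does below.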
*/ static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va, int *level) { pd_entry_t *l1, *l2, desc; pt_entry_t *l3; l1 = pmap_l1(pmap, va); if (l1 == NULL) { *level = 0; return (NULL); } desc = pmap_load(l1) & ATTR_DESCR_MASK; if (desc == L1_BLOCK) { *level = 1; return (l1); } if (desc != L1_TABLE) { *level = 1; return (NULL); } l2 = pmap_l1_to_l2(l1, va); desc = pmap_load(l2) & ATTR_DESCR_MASK; if (desc == L2_BLOCK) { *level = 2; return (l2); } if (desc != L2_TABLE) { *level = 2; return (NULL); } *level = 3; l3 = pmap_l2_to_l3(l2, va); if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE) return (NULL); return (l3); } bool pmap_ps_enabled(pmap_t pmap __unused) { return (superpages_enabled != 0); } bool pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1, pd_entry_t **l2, pt_entry_t **l3) { pd_entry_t *l0p, *l1p, *l2p; if (pmap->pm_l0 == NULL) return (false); l0p = pmap_l0(pmap, va); *l0 = l0p; if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE) return (false); l1p = pmap_l0_to_l1(l0p, va); *l1 = l1p; if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) { *l2 = NULL; *l3 = NULL; return (true); } if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE) return (false); l2p = pmap_l1_to_l2(l1p, va); *l2 = l2p; if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) { *l3 = NULL; return (true); } if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE) return (false); *l3 = pmap_l2_to_l3(l2p, va); return (true); } static __inline int pmap_l3_valid(pt_entry_t l3) { return ((l3 & ATTR_DESCR_MASK) == L3_PAGE); } CTASSERT(L1_BLOCK == L2_BLOCK); /* * Checks if the PTE is dirty. */ static inline int pmap_pte_dirty(pt_entry_t pte) { KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte)); KASSERT((pte & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) != 0, ("pte %#lx is writeable and missing ATTR_SW_DBM", pte)); return ((pte & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) == (ATTR_AP(ATTR_AP_RW) | ATTR_SW_DBM)); } static __inline void pmap_resident_count_inc(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); pmap->pm_stats.resident_count += count; } static __inline void pmap_resident_count_dec(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(pmap->pm_stats.resident_count >= count, ("pmap %p resident count underflow %ld %d", pmap, pmap->pm_stats.resident_count, count)); pmap->pm_stats.resident_count -= count; } static pt_entry_t * pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot, u_int *l2_slot) { pt_entry_t *l2; pd_entry_t *l1; l1 = (pd_entry_t *)l1pt; *l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK; /* Check locore has used a table L1 map */ KASSERT((l1[*l1_slot] & ATTR_DESCR_MASK) == L1_TABLE, ("Invalid bootstrap L1 table")); /* Find the address of the L2 table */ l2 = (pt_entry_t *)init_pt_va; *l2_slot = pmap_l2_index(va); return (l2); } static vm_paddr_t pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va) { u_int l1_slot, l2_slot; pt_entry_t *l2; l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot); return ((l2[l2_slot] & ~ATTR_MASK) + (va & L2_OFFSET)); } static vm_offset_t pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_offset_t freemempos) { pt_entry_t *l2; vm_offset_t va; vm_paddr_t l2_pa, pa; u_int l1_slot, l2_slot, prev_l1_slot; int i; dmap_phys_base = min_pa & ~L1_OFFSET; dmap_phys_max = 0; dmap_max_addr = 0; l2 = NULL; prev_l1_slot = -1; #define DMAP_TABLES ((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT) memset(pagetable_dmap, 0, PAGE_SIZE * DMAP_TABLES); for (i = 0; i < (physmap_idx * 2); i += 2) { pa = physmap[i] & 
~L2_OFFSET; va = pa - dmap_phys_base + DMAP_MIN_ADDRESS; /* Create L2 mappings at the start of the region */ if ((pa & L1_OFFSET) != 0) { l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); if (l1_slot != prev_l1_slot) { prev_l1_slot = l1_slot; l2 = (pt_entry_t *)freemempos; l2_pa = pmap_early_vtophys(kern_l1, (vm_offset_t)l2); freemempos += PAGE_SIZE; pmap_store(&pagetable_dmap[l1_slot], (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE); memset(l2, 0, PAGE_SIZE); } KASSERT(l2 != NULL, ("pmap_bootstrap_dmap: NULL l2 map")); for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1]; pa += L2_SIZE, va += L2_SIZE) { /* * We are on a boundary, stop to * create a level 1 block */ if ((pa & L1_OFFSET) == 0) break; l2_slot = pmap_l2_index(va); KASSERT(l2_slot != 0, ("...")); pmap_store(&l2[l2_slot], (pa & ~L2_OFFSET) | ATTR_DEFAULT | ATTR_XN | ATTR_IDX(CACHED_MEMORY) | L2_BLOCK); } KASSERT(va == (pa - dmap_phys_base + DMAP_MIN_ADDRESS), ("...")); } for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1] && (physmap[i + 1] - pa) >= L1_SIZE; pa += L1_SIZE, va += L1_SIZE) { l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); pmap_store(&pagetable_dmap[l1_slot], (pa & ~L1_OFFSET) | ATTR_DEFAULT | ATTR_XN | ATTR_IDX(CACHED_MEMORY) | L1_BLOCK); } /* Create L2 mappings at the end of the region */ if (pa < physmap[i + 1]) { l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); if (l1_slot != prev_l1_slot) { prev_l1_slot = l1_slot; l2 = (pt_entry_t *)freemempos; l2_pa = pmap_early_vtophys(kern_l1, (vm_offset_t)l2); freemempos += PAGE_SIZE; pmap_store(&pagetable_dmap[l1_slot], (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE); memset(l2, 0, PAGE_SIZE); } KASSERT(l2 != NULL, ("pmap_bootstrap_dmap: NULL l2 map")); for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1]; pa += L2_SIZE, va += L2_SIZE) { l2_slot = pmap_l2_index(va); pmap_store(&l2[l2_slot], (pa & ~L2_OFFSET) | ATTR_DEFAULT | ATTR_XN | ATTR_IDX(CACHED_MEMORY) | L2_BLOCK); } } if (pa > dmap_phys_max) { dmap_phys_max = pa; dmap_max_addr = va; } } cpu_tlb_flushID(); return (freemempos); } static vm_offset_t pmap_bootstrap_l2(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l2_start) { vm_offset_t l2pt; vm_paddr_t pa; pd_entry_t *l1; u_int l1_slot; KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address")); l1 = (pd_entry_t *)l1pt; l1_slot = pmap_l1_index(va); l2pt = l2_start; for (; va < VM_MAX_KERNEL_ADDRESS; l1_slot++, va += L1_SIZE) { KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index")); pa = pmap_early_vtophys(l1pt, l2pt); pmap_store(&l1[l1_slot], (pa & ~Ln_TABLE_MASK) | L1_TABLE); l2pt += PAGE_SIZE; } /* Clean the L2 page table */ memset((void *)l2_start, 0, l2pt - l2_start); return l2pt; } static vm_offset_t pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start) { vm_offset_t l3pt; vm_paddr_t pa; pd_entry_t *l2; u_int l2_slot; KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address")); l2 = pmap_l2(kernel_pmap, va); l2 = (pd_entry_t *)rounddown2((uintptr_t)l2, PAGE_SIZE); l2_slot = pmap_l2_index(va); l3pt = l3_start; for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) { KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index")); pa = pmap_early_vtophys(l1pt, l3pt); pmap_store(&l2[l2_slot], (pa & ~Ln_TABLE_MASK) | L2_TABLE); l3pt += PAGE_SIZE; } /* Clean the L2 page table */ memset((void *)l3_start, 0, l3pt - l3_start); return l3pt; } /* * Bootstrap the system enough to run with virtual memory. 
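 * This builds the direct map, creates the kernel L2 tables up to VM_MAX_KERNEL_ADDRESS and the L3 tables for the early devmap, and carves the per-CPU area, the message buffer and the pre-init (early BIOS/ACPI) mapping region out of the memory just past the kernel.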
*/ void pmap_bootstrap(vm_offset_t l0pt, vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen) { u_int l1_slot, l2_slot; pt_entry_t *l2; vm_offset_t va, freemempos; vm_offset_t dpcpu, msgbufpv; vm_paddr_t start_pa, pa, min_pa; uint64_t kern_delta; int i; kern_delta = KERNBASE - kernstart; printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen); printf("%lx\n", l1pt); printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK); /* Set this early so we can use the pagetable walking functions */ kernel_pmap_store.pm_l0 = (pd_entry_t *)l0pt; PMAP_LOCK_INIT(kernel_pmap); /* Assume the address we were loaded to is a valid physical address */ min_pa = KERNBASE - kern_delta; physmap_idx = arm_physmem_avail(physmap, nitems(physmap)); physmap_idx /= 2; /* * Find the minimum physical address. physmap is sorted, * but may contain empty ranges. */ for (i = 0; i < (physmap_idx * 2); i += 2) { if (physmap[i] == physmap[i + 1]) continue; if (physmap[i] <= min_pa) min_pa = physmap[i]; } freemempos = KERNBASE + kernlen; freemempos = roundup2(freemempos, PAGE_SIZE); /* Create a direct map region early so we can use it for pa -> va */ freemempos = pmap_bootstrap_dmap(l1pt, min_pa, freemempos); va = KERNBASE; start_pa = pa = KERNBASE - kern_delta; /* * Read the page table to find out what is already mapped. * This assumes we have mapped a block of memory from KERNBASE * using a single L1 entry. */ l2 = pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot); /* Sanity check the index, KERNBASE should be the first VA */ KASSERT(l2_slot == 0, ("The L2 index is non-zero")); /* Find how many pages we have mapped */ for (; l2_slot < Ln_ENTRIES; l2_slot++) { if ((l2[l2_slot] & ATTR_DESCR_MASK) == 0) break; /* Check locore used L2 blocks */ KASSERT((l2[l2_slot] & ATTR_DESCR_MASK) == L2_BLOCK, ("Invalid bootstrap L2 table")); KASSERT((l2[l2_slot] & ~ATTR_MASK) == pa, ("Incorrect PA in L2 table")); va += L2_SIZE; pa += L2_SIZE; } va = roundup2(va, L1_SIZE); /* Create the l2 tables up to VM_MAX_KERNEL_ADDRESS */ freemempos = pmap_bootstrap_l2(l1pt, va, freemempos); /* And the l3 tables for the early devmap */ freemempos = pmap_bootstrap_l3(l1pt, VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE), freemempos); cpu_tlb_flushID(); #define alloc_pages(var, np) \ (var) = freemempos; \ freemempos += (np * PAGE_SIZE); \ memset((char *)(var), 0, ((np) * PAGE_SIZE)); /* Allocate dynamic per-cpu area. */ alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE); dpcpu_init((void *)dpcpu, 0); /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */ alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE); msgbufp = (void *)msgbufpv; /* Reserve some VA space for early BIOS/ACPI mapping */ preinit_map_va = roundup2(freemempos, L2_SIZE); virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE; virtual_avail = roundup2(virtual_avail, L1_SIZE); virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE); kernel_vm_end = virtual_avail; pa = pmap_early_vtophys(l1pt, freemempos); arm_physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC); cpu_tlb_flushID(); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_memattr = VM_MEMATTR_WRITE_BACK; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { vm_size_t s; int i, pv_npg; /* * Are large page mappings enabled? 
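 * When the vm.pmap.superpages_enabled tunable is non-zero, pagesizes[1] is set to L2_SIZE (2MB) so the rest of the VM system knows about the second supported page size.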
*/ TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled); if (superpages_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = L2_SIZE; } /* * Initialize the pv chunk list mutex. */ mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); /* * Initialize the pool of pv list locks. */ for (i = 0; i < NPV_LIST_LOCKS; i++) rw_init(&pv_list_locks[i], "pmap pv list"); /* * Calculate the size of the pv head table for superpages. */ pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE); /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); TAILQ_INIT(&pv_dummy.pv_list); vm_initialized = 1; } static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD, 0, "2MB page mapping counters"); static u_long pmap_l2_demotions; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD, &pmap_l2_demotions, 0, "2MB page demotions"); static u_long pmap_l2_mappings; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD, &pmap_l2_mappings, 0, "2MB page mappings"); static u_long pmap_l2_p_failures; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_l2_p_failures, 0, "2MB page promotion failures"); static u_long pmap_l2_promotions; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD, &pmap_l2_promotions, 0, "2MB page promotions"); /* * Invalidate a single TLB entry. */ static __inline void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { sched_pin(); __asm __volatile( "dsb ishst \n" "tlbi vaae1is, %0 \n" "dsb ish \n" "isb \n" : : "r"(va >> PAGE_SHIFT)); sched_unpin(); } static __inline void pmap_invalidate_range_nopin(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; dsb(ishst); for (addr = sva; addr < eva; addr += PAGE_SIZE) { __asm __volatile( "tlbi vaae1is, %0" : : "r"(addr >> PAGE_SHIFT)); } __asm __volatile( "dsb ish \n" "isb \n"); } static __inline void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { sched_pin(); pmap_invalidate_range_nopin(pmap, sva, eva); sched_unpin(); } static __inline void pmap_invalidate_all(pmap_t pmap) { sched_pin(); __asm __volatile( "dsb ishst \n" "tlbi vmalle1is \n" "dsb ish \n" "isb \n"); sched_unpin(); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pt_entry_t *pte, tpte; vm_paddr_t pa; int lvl; pa = 0; PMAP_LOCK(pmap); /* * Find the block or page map for this virtual address. pmap_pte * will return either a valid block/page entry, or NULL. 
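 * The returned level tells us how many low-order bits of the virtual address to fold back into the physical address: L1 blocks map 1GB, L2 blocks 2MB and L3 pages 4KB.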
*/ pte = pmap_pte(pmap, va, &lvl); if (pte != NULL) { tpte = pmap_load(pte); pa = tpte & ~ATTR_MASK; switch(lvl) { case 1: KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK, ("pmap_extract: Invalid L1 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L1_OFFSET); break; case 2: KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_extract: Invalid L2 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L2_OFFSET); break; case 3: KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, ("pmap_extract: Invalid L3 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L3_OFFSET); break; } } PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pt_entry_t *pte, tpte; vm_offset_t off; vm_page_t m; int lvl; m = NULL; PMAP_LOCK(pmap); pte = pmap_pte(pmap, va, &lvl); if (pte != NULL) { tpte = pmap_load(pte); KASSERT(lvl > 0 && lvl <= 3, ("pmap_extract_and_hold: Invalid level %d", lvl)); CTASSERT(L1_BLOCK == L2_BLOCK); KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) || (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK), ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl, tpte & ATTR_DESCR_MASK)); if (((tpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) || ((prot & VM_PROT_WRITE) == 0)) { switch(lvl) { case 1: off = va & L1_OFFSET; break; case 2: off = va & L2_OFFSET; break; case 3: default: off = 0; } m = PHYS_TO_VM_PAGE((tpte & ~ATTR_MASK) | off); if (!vm_page_wire_mapped(m)) m = NULL; } } PMAP_UNLOCK(pmap); return (m); } vm_paddr_t pmap_kextract(vm_offset_t va) { pt_entry_t *pte, tpte; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) return (DMAP_TO_PHYS(va)); pte = pmap_l1(kernel_pmap, va); if (pte == NULL) return (0); /* * A concurrent pmap_update_entry() will clear the entry's valid bit * but leave the rest of the entry unchanged. Therefore, we treat a * non-zero entry as being valid, and we ignore the valid bit when * determining whether the entry maps a block, page, or table. */ tpte = pmap_load(pte); if (tpte == 0) return (0); if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) return ((tpte & ~ATTR_MASK) | (va & L1_OFFSET)); pte = pmap_l1_to_l2(&tpte, va); tpte = pmap_load(pte); if (tpte == 0) return (0); if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) return ((tpte & ~ATTR_MASK) | (va & L2_OFFSET)); pte = pmap_l2_to_l3(&tpte, va); tpte = pmap_load(pte); if (tpte == 0) return (0); return ((tpte & ~ATTR_MASK) | (va & L3_OFFSET)); } /*************************************************** * Low level mapping routines..... 
***************************************************/ void pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode) { pd_entry_t *pde; pt_entry_t *pte, attr; vm_offset_t va; int lvl; KASSERT((pa & L3_OFFSET) == 0, ("pmap_kenter: Invalid physical address")); KASSERT((sva & L3_OFFSET) == 0, ("pmap_kenter: Invalid virtual address")); KASSERT((size & PAGE_MASK) == 0, ("pmap_kenter: Mapping is not page-sized")); attr = ATTR_DEFAULT | ATTR_IDX(mode) | L3_PAGE; if (mode == DEVICE_MEMORY) attr |= ATTR_XN; va = sva; while (size != 0) { pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_kenter: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl)); pte = pmap_l2_to_l3(pde, va); pmap_load_store(pte, (pa & ~L3_OFFSET) | attr); va += PAGE_SIZE; pa += PAGE_SIZE; size -= PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } void pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa) { pmap_kenter(sva, size, pa, DEVICE_MEMORY); } /* * Remove a page from the kernel pagetables. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; int lvl; pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(pte != NULL, ("pmap_kremove: Invalid address")); KASSERT(lvl == 3, ("pmap_kremove: Invalid pte level %d", lvl)); pmap_clear(pte); pmap_invalidate_page(kernel_pmap, va); } void pmap_kremove_device(vm_offset_t sva, vm_size_t size) { pt_entry_t *pte; vm_offset_t va; int lvl; KASSERT((sva & L3_OFFSET) == 0, ("pmap_kremove_device: Invalid virtual address")); KASSERT((size & PAGE_MASK) == 0, ("pmap_kremove_device: Mapping is not page-sized")); va = sva; while (size != 0) { pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(pte != NULL, ("Invalid page table, va: 0x%lx", va)); KASSERT(lvl == 3, ("Invalid device pagetable level: %d != 3", lvl)); pmap_clear(pte); va += PAGE_SIZE; size -= PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return PHYS_TO_DMAP(start); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. 
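 * (On arm64 the shootdown is performed with broadcast TLBI instructions in pmap_invalidate_range() rather than with an IPI.)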
*/ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pd_entry_t *pde; pt_entry_t *pte, pa; vm_offset_t va; vm_page_t m; int i, lvl; va = sva; for (i = 0; i < count; i++) { pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_qenter: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_qenter: Invalid level %d", lvl)); m = ma[i]; pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_AP(ATTR_AP_RW) | ATTR_IDX(m->md.pv_memattr) | L3_PAGE; if (m->md.pv_memattr == DEVICE_MEMORY) pa |= ATTR_XN; pte = pmap_l2_to_l3(pde, va); pmap_load_store(pte, pa); va += L3_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(vm_offset_t sva, int count) { pt_entry_t *pte; vm_offset_t va; int lvl; KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva)); va = sva; while (count-- > 0) { pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(lvl == 3, ("Invalid device pagetable level: %d != 3", lvl)); if (pte != NULL) { pmap_clear(pte); } va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_l3(pmap, va, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * unmap the page table page */ if (m->pindex >= (NUL2E + NUL1E)) { /* l1 page */ pd_entry_t *l0; l0 = pmap_l0(pmap, va); pmap_clear(l0); } else if (m->pindex >= NUL2E) { /* l2 page */ pd_entry_t *l1; l1 = pmap_l1(pmap, va); pmap_clear(l1); } else { /* l3 page */ pd_entry_t *l2; l2 = pmap_l2(pmap, va); pmap_clear(l2); } pmap_resident_count_dec(pmap, 1); if (m->pindex < NUL2E) { /* We just released an l3, unhold the matching l2 */ pd_entry_t *l1, tl1; vm_page_t l2pg; l1 = pmap_l1(pmap, va); tl1 = pmap_load(l1); l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); pmap_unwire_l3(pmap, va, l2pg, free); } else if (m->pindex < (NUL2E + NUL1E)) { /* We just released an l2, unhold the matching l1 */ pd_entry_t *l0, tl0; vm_page_t l1pg; l0 = pmap_l0(pmap, va); tl0 = pmap_load(l0); l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); pmap_unwire_l3(pmap, va, l1pg, free); } pmap_invalidate_page(pmap, va); /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. 
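 * Kernel virtual addresses are skipped entirely; kernel page table pages are never freed through this path.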
*/ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, struct spglist *free) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); mpte = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK); return (pmap_unwire_l3(pmap, va, mpte, free)); } void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); pmap->pm_l0 = kernel_pmap->pm_l0; pmap->pm_root.rt_root = 0; } int pmap_pinit(pmap_t pmap) { vm_paddr_t l0phys; vm_page_t l0pt; /* * allocate the l0 page */ while ((l0pt = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) vm_wait(NULL); l0phys = VM_PAGE_TO_PHYS(l0pt); pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(l0phys); if ((l0pt->flags & PG_ZERO) == 0) pagezero(pmap->pm_l0); pmap->pm_root.rt_root = 0; bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); return (1); } /* * This routine is called if the desired page table page does not exist. * * If page table page allocation fails, this routine may sleep before * returning NULL. It sleeps only if a lock pointer was given. * * Note: If a page allocation fails at page table level two or three, * one or two pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. */ static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) { vm_page_t m, l1pg, l2pg; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if (lockp != NULL) { RELEASE_PV_LIST_LOCK(lockp); PMAP_UNLOCK(pmap); vm_wait(NULL); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Because of AArch64's weak memory consistency model, we must have a * barrier here to ensure that the stores for zeroing "m", whether by * pmap_zero_page() or an earlier function, are visible before adding * "m" to the page table. Otherwise, a page table walk by another * processor's MMU could see the mapping to "m" and a stale, non-zero * PTE within "m". */ dmb(ishst); /* * Map the pagetable page into the process address space, if * it isn't already there. 
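 * The pindex encodes the level: indices below NUL2E are L3 page table pages, [NUL2E, NUL2E + NUL1E) are L2 pages, and NUL2E + NUL1E and above are L1 pages, which is why the cases below may recurse to allocate the parent table first.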
*/ if (ptepindex >= (NUL2E + NUL1E)) { pd_entry_t *l0; vm_pindex_t l0index; l0index = ptepindex - (NUL2E + NUL1E); l0 = &pmap->pm_l0[l0index]; pmap_store(l0, VM_PAGE_TO_PHYS(m) | L0_TABLE); } else if (ptepindex >= NUL2E) { vm_pindex_t l0index, l1index; pd_entry_t *l0, *l1; pd_entry_t tl0; l1index = ptepindex - NUL2E; l0index = l1index >> L0_ENTRIES_SHIFT; l0 = &pmap->pm_l0[l0index]; tl0 = pmap_load(l0); if (tl0 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); l1pg->wire_count++; } l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); l1 = &l1[ptepindex & Ln_ADDR_MASK]; pmap_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE); } else { vm_pindex_t l0index, l1index; pd_entry_t *l0, *l1, *l2; pd_entry_t tl0, tl1; l1index = ptepindex >> Ln_ENTRIES_SHIFT; l0index = l1index >> L0_ENTRIES_SHIFT; l0 = &pmap->pm_l0[l0index]; tl0 = pmap_load(l0); if (tl0 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + l1index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } tl0 = pmap_load(l0); l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); l1 = &l1[l1index & Ln_ADDR_MASK]; } else { l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); l1 = &l1[l1index & Ln_ADDR_MASK]; tl1 = pmap_load(l1); if (tl1 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + l1index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); l2pg->wire_count++; } } l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK); l2 = &l2[ptepindex & Ln_ADDR_MASK]; pmap_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE); } pmap_resident_count_inc(pmap, 1); return (m); } static vm_page_t pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { pd_entry_t *l1; vm_page_t l2pg; vm_pindex_t l2pindex; retry: l1 = pmap_l1(pmap, va); if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) { /* Add a reference to the L2 page. */ l2pg = PHYS_TO_VM_PAGE(pmap_load(l1) & ~ATTR_MASK); l2pg->wire_count++; } else { /* Allocate a L2 page. */ l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT; l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp); if (l2pg == NULL && lockp != NULL) goto retry; } return (l2pg); } static vm_page_t pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t ptepindex; pd_entry_t *pde, tpde; #ifdef INVARIANTS pt_entry_t *pte; #endif vm_page_t m; int lvl; /* * Calculate pagetable page index */ ptepindex = pmap_l2_pindex(va); retry: /* * Get the page directory entry */ pde = pmap_pde(pmap, va, &lvl); /* * If the page table page is mapped, we just increment the hold count, * and activate it. If we get a level 2 pde it will point to a level 3 * table. */ switch (lvl) { case -1: break; case 0: #ifdef INVARIANTS pte = pmap_l0_to_l1(pde, va); KASSERT(pmap_load(pte) == 0, ("pmap_alloc_l3: TODO: l0 superpages")); #endif break; case 1: #ifdef INVARIANTS pte = pmap_l1_to_l2(pde, va); KASSERT(pmap_load(pte) == 0, ("pmap_alloc_l3: TODO: l1 superpages")); #endif break; case 2: tpde = pmap_load(pde); if (tpde != 0) { m = PHYS_TO_VM_PAGE(tpde & ~ATTR_MASK); m->wire_count++; return (m); } break; default: panic("pmap_alloc_l3: Invalid level %d", lvl); } /* * Here if the pte page isn't mapped, or if it has been deallocated. 
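 * If the allocation failed and sleeping was permitted (lockp != NULL), the page tables may have changed while the pmap lock was dropped, so retry the lookup from the top.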
*/ m = _pmap_alloc_l3(pmap, ptepindex, lockp); if (m == NULL && lockp != NULL) goto retry; return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l0)); vm_page_unwire_noq(m); vm_page_free_zero(m); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "LU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "LU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t paddr; vm_page_t nkpg; pd_entry_t *l0, *l1, *l2; mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, L2_SIZE); if (addr - 1 >= vm_map_max(kernel_map)) addr = vm_map_max(kernel_map); while (kernel_vm_end < addr) { l0 = pmap_l0(kernel_pmap, kernel_vm_end); KASSERT(pmap_load(l0) != 0, ("pmap_growkernel: No level 0 kernel entry")); l1 = pmap_l0_to_l1(l0, kernel_vm_end); if (pmap_load(l1) == 0) { /* We need a new PDP entry */ nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); /* See the dmb() in _pmap_alloc_l3(). */ dmb(ishst); paddr = VM_PAGE_TO_PHYS(nkpg); pmap_store(l1, paddr | L1_TABLE); continue; /* try again */ } l2 = pmap_l1_to_l2(l1, kernel_vm_end); if (pmap_load(l2) != 0) { kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } continue; } nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); /* See the dmb() in _pmap_alloc_l3(). */ dmb(ishst); paddr = VM_PAGE_TO_PHYS(nkpg); pmap_store(l2, paddr | L2_TABLE); kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } } } /*************************************************** * page management routines. 
***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0 0xfffffffffffffffful #define PC_FREE1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; #if 0 #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs, pv_entry_count; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif #endif /* 0 */ /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. * * Returns NULL if PV entries were reclaimed from the specified pmap. * * We do not, however, unmap 2mpages because subsequent accesses will * allocate per-page pv entries until repromotion occurs, thereby * exacerbating the shortage of free pv entries. */ static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) { struct pv_chunk *pc, *pc_marker, *pc_marker_end; struct pv_chunk_header pc_marker_b, pc_marker_end_b; struct md_page *pvh; pd_entry_t *pde; pmap_t next_pmap, pmap; pt_entry_t *pte, tpte; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint64_t inuse; int bit, field, freed, lvl; static int active_reclaims = 0; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); pmap = NULL; m_pc = NULL; SLIST_INIT(&free); bzero(&pc_marker_b, sizeof(pc_marker_b)); bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); pc_marker = (struct pv_chunk *)&pc_marker_b; pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; mtx_lock(&pv_chunks_mutex); active_reclaims++; TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru); TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru); while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && SLIST_EMPTY(&free)) { next_pmap = pc->pc_pmap; if (next_pmap == NULL) { /* * The next chunk is a marker. However, it is * not our marker, so active_reclaims must be * > 1. Consequently, the next_chunk code * will not rotate the pv_chunks list. */ goto next_chunk; } mtx_unlock(&pv_chunks_mutex); /* * A pv_chunk can only be removed from the pc_lru list * when both pv_chunks_mutex is owned and the * corresponding pmap is locked. 
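 * To avoid a lock order reversal against locked_pmap, a pmap with a higher address is locked unconditionally while a lower-addressed one is only trylocked below.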
*/ if (pmap != next_pmap) { if (pmap != NULL && pmap != locked_pmap) PMAP_UNLOCK(pmap); pmap = next_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) { RELEASE_PV_LIST_LOCK(lockp); PMAP_LOCK(pmap); mtx_lock(&pv_chunks_mutex); continue; } else if (pmap != locked_pmap) { if (PMAP_TRYLOCK(pmap)) { mtx_lock(&pv_chunks_mutex); continue; } else { pmap = NULL; /* pmap is not locked */ mtx_lock(&pv_chunks_mutex); pc = TAILQ_NEXT(pc_marker, pc_lru); if (pc == NULL || pc->pc_pmap != next_pmap) continue; goto next_chunk; } } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = ffsl(inuse) - 1; pv = &pc->pc_pventry[field * 64 + bit]; va = pv->pv_va; pde = pmap_pde(pmap, va, &lvl); if (lvl != 2) continue; pte = pmap_l2_to_l3(pde, va); tpte = pmap_load(pte); if ((tpte & ATTR_SW_WIRED) != 0) continue; tpte = pmap_load_clear(pte); pmap_invalidate_page(pmap, va); m = PHYS_TO_VM_PAGE(tpte & ~ATTR_MASK); if (pmap_pte_dirty(tpte)) vm_page_dirty(m); if ((tpte & ATTR_AF) != 0) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, pmap_load(pde), &free); freed++; } } if (freed == 0) { mtx_lock(&pv_chunks_mutex); goto next_chunk; } /* Every freed mapping is for a 4 KB page. */ pmap_resident_count_dec(pmap, freed); PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && pc->pc_map[2] == PC_FREE2) { PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m_pc->phys_addr); mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); break; } TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); mtx_lock(&pv_chunks_mutex); /* One freed pv entry in locked_pmap is sufficient. */ if (pmap == locked_pmap) break; next_chunk: TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru); if (active_reclaims == 1 && pmap != NULL) { /* * Rotate the pv chunks list so that we do not * scan the same pv chunks that could not be * freed (because they contained a wired * and/or superpage mapping) on every * invocation of reclaim_pv_chunk(). */ while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) { MPASS(pc->pc_pmap != NULL); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); } } } TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru); active_reclaims--; mtx_unlock(&pv_chunks_mutex); if (pmap != NULL && pmap != locked_pmap) PMAP_UNLOCK(pmap); if (m_pc == NULL && !SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. 
*/ m_pc->wire_count = 1; } vm_page_free_pages_toq(&free, true); return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_frees, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, 1)); PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 64; bit = idx % 64; pc->pc_map[field] |= 1ul << bit; if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || pc->pc_map[2] != PC_FREE2) { /* 98% of the time, pc is already at the head of the list. */ if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_unwire_noq(m); vm_page_free(m); } /* * Returns a new PV entry, allocating a new PV chunk from the system when * needed. If this PV chunk allocation fails and a PV list lock pointer was * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is * returned. * * The given PV list lock may be released. */ static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp) { int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = ffsl(pc->pc_map[field]) - 1; break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 64 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); return (pv); } } /* No free items, allocate another chunk */ m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { if (lockp == NULL) { PV_STAT(pc_chunk_tryfail++); return (NULL); } m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; mtx_lock(&pv_chunks_mutex); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); return (pv); } /* * Ensure that the number of spare PV entries in the specified pmap meets or * exceeds the given count, "needed". * * The given PV list lock may be released. 
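 *
 * A caller that is about to instantiate many 4KB PV entries at once
 * (for example, an L2 demotion, which needs Ln_ENTRIES - 1 of them)
 * reserves them here first; pmap_pv_demote_l2() relies on such a
 * reservation and asserts that every chunk it visits has a spare.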
*/ static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) { struct pch new_tail; struct pv_chunk *pc; vm_page_t m; int avail, free; bool reclaimed; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); /* * Newly allocated PV chunks must be stored in a private list until * the required number of PV chunks have been allocated. Otherwise, * reclaim_pv_chunk() could recycle one of these chunks. In * contrast, these chunks must be added to the pmap upon allocation. */ TAILQ_INIT(&new_tail); retry: avail = 0; TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { bit_count((bitstr_t *)pc->pc_map, 0, sizeof(pc->pc_map) * NBBY, &free); if (free == 0) break; avail += free; if (avail >= needed) break; } for (reclaimed = false; avail < needed; avail += _NPCPV) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; reclaimed = true; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0; pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); /* * The reclaim might have freed a chunk from the current pmap. * If that chunk contained available entries, we need to * re-count the number of available entries. */ if (reclaimed) goto retry; } if (!TAILQ_EMPTY(&new_tail)) { mtx_lock(&pv_chunks_mutex); TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); mtx_unlock(&pv_chunks_mutex); } } /* * First find and then remove the pv entry for the specified pmap and virtual * address from the specified pv list. Returns the pv entry if found and NULL * otherwise. This operation can be performed on pv lists for either 4KB or * 2MB page mappings. */ static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; break; } } return (pv); } /* * After demotion from a 2MB page mapping to 512 4KB page mappings, * destroy the pv entry for the 2MB page mapping and reinstantiate the pv * entries for each of the 4KB page mappings. */ static void pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; struct pv_chunk *pc; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; int bit, field; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((va & L2_OFFSET) == 0, ("pmap_pv_demote_l2: va is not 2mpage aligned")); KASSERT((pa & L2_OFFSET) == 0, ("pmap_pv_demote_l2: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the 2mpage's pv entry for this mapping to the first * page's pv list. Once this transfer begins, the pv list lock * must not be released until the last pv entry is reinstantiated. */ pvh = pa_to_pvh(pa); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; /* Instantiate the remaining Ln_ENTRIES - 1 pv entries. 
*/ PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1)); va_last = va + L2_SIZE - PAGE_SIZE; for (;;) { pc = TAILQ_FIRST(&pmap->pm_pvchunk); KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare")); for (field = 0; field < _NPCM; field++) { while (pc->pc_map[field]) { bit = ffsl(pc->pc_map[field]) - 1; pc->pc_map[field] &= ~(1ul << bit); pv = &pc->pc_pventry[field * 64 + bit]; va += PAGE_SIZE; pv->pv_va = va; m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_l2: page %p is not managed", m)); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (va == va_last) goto out; } } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } out: if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1)); } /* * First find and then destroy the pv entry for the specified pmap and virtual * address. This operation can be performed on pv lists for either 4KB or 2MB * page mappings. */ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } /* * Conditionally create the PV entry for a 4KB page mapping if the required * memory can be allocated without resorting to reclamation. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; return (TRUE); } else return (FALSE); } /* * Create the PV entry for a 2MB page mapping. Always returns true unless the * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns * false if the PV entry cannot be allocated without resorting to reclamation. */ static bool pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_paddr_t pa; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? NULL : lockp)) == NULL) return (false); pv->pv_va = va; pa = l2e & ~ATTR_MASK; CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; return (true); } static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) { pt_entry_t newl2, oldl2; vm_page_t ml3; vm_paddr_t ml3pa; KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); ml3 = pmap_remove_pt_page(pmap, va); if (ml3 == NULL) panic("pmap_remove_kernel_l2: Missing pt page"); ml3pa = VM_PAGE_TO_PHYS(ml3); newl2 = ml3pa | L2_TABLE; /* * If this page table page was unmapped by a promotion, then it * contains valid mappings. Zero it to invalidate those mappings. */ if (ml3->valid != 0) pagezero((void *)PHYS_TO_DMAP(ml3pa)); /* * Demote the mapping. 
The caller must have already invalidated the * mapping (i.e., the "break" in break-before-make). */ oldl2 = pmap_load_store(l2, newl2); KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", __func__, l2, oldl2)); } /* * pmap_remove_l2: Do the things to unmap a level 2 superpage. */ static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t old_l2; vm_offset_t eva, va; vm_page_t m, ml3; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); old_l2 = pmap_load_clear(l2); KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2)); /* * Since a promotion must break the 4KB page mappings before making * the 2MB page mapping, a pmap_invalidate_page() suffices. */ pmap_invalidate_page(pmap, sva); if (old_l2 & ATTR_SW_WIRED) pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); if (old_l2 & ATTR_SW_MANAGED) { CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, old_l2 & ~ATTR_MASK); pvh = pa_to_pvh(old_l2 & ~ATTR_MASK); pmap_pvh_free(pvh, pmap, sva); eva = sva + L2_SIZE; for (va = sva, m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK); va < eva; va += PAGE_SIZE, m++) { if (pmap_pte_dirty(old_l2)) vm_page_dirty(m); if (old_l2 & ATTR_AF) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (pmap == kernel_pmap) { pmap_remove_kernel_l2(pmap, l2, sva); } else { ml3 = pmap_remove_pt_page(pmap, sva); if (ml3 != NULL) { KASSERT(ml3->valid == VM_PAGE_BITS_ALL, ("pmap_remove_l2: l3 page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(ml3->wire_count == NL3PG, ("pmap_remove_l2: l3 page wire count error")); ml3->wire_count = 0; pmap_add_delayed_free_list(ml3, free, FALSE); } } return (pmap_unuse_pt(pmap, sva, l1e, free)); } /* * pmap_remove_l3: do the things to unmap a page in a process */ static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t old_l3; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); old_l3 = pmap_load_clear(l3); pmap_invalidate_page(pmap, va); if (old_l3 & ATTR_SW_WIRED) pmap->pm_stats.wired_count -= 1; pmap_resident_count_dec(pmap, 1); if (old_l3 & ATTR_SW_MANAGED) { m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK); if (pmap_pte_dirty(old_l3)) vm_page_dirty(m); if (old_l3 & ATTR_AF) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } return (pmap_unuse_pt(pmap, va, l2e, free)); } /* * Remove the specified range of addresses from the L3 page table that is * identified by the given L2 entry. 
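 *
 * The range must lie within a single L3 page table; callers such as
 * pmap_remove() and pmap_enter_l2() clip the range to an L2 boundary
 * first, and the KASSERT below enforces this.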
*/ static void pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva, vm_offset_t eva, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; struct rwlock *new_lock; pt_entry_t *l3, old_l3; vm_offset_t va; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE), ("pmap_remove_l3_range: range crosses an L3 page table boundary")); va = eva; for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) { if (!pmap_l3_valid(pmap_load(l3))) { if (va != eva) { pmap_invalidate_range(pmap, va, sva); va = eva; } continue; } old_l3 = pmap_load_clear(l3); if ((old_l3 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count--; pmap_resident_count_dec(pmap, 1); if ((old_l3 & ATTR_SW_MANAGED) != 0) { m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK); if (pmap_pte_dirty(old_l3)) vm_page_dirty(m); if ((old_l3 & ATTR_AF) != 0) vm_page_aflag_set(m, PGA_REFERENCED); new_lock = PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)); if (new_lock != *lockp) { if (*lockp != NULL) { /* * Pending TLB invalidations must be * performed before the PV list lock is * released. Otherwise, a concurrent * pmap_remove_all() on a physical page * could return while a stale TLB entry * still provides access to that page. */ if (va != eva) { pmap_invalidate_range(pmap, va, sva); va = eva; } rw_wunlock(*lockp); } *lockp = new_lock; rw_wlock(*lockp); } pmap_pvh_free(&m->md, pmap, sva); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (va == eva) va = sva; if (pmap_unuse_pt(pmap, sva, l2e, free)) { sva += L3_SIZE; break; } } if (va != eva) pmap_invalidate_range(pmap, va, sva); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct rwlock *lock; vm_offset_t va_next; pd_entry_t *l0, *l1, *l2; pt_entry_t l3_paddr; struct spglist free; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; SLIST_INIT(&free); PMAP_LOCK(pmap); lock = NULL; for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) break; l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } /* * Calculate index for next page table. */ va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (l2 == NULL) continue; l3_paddr = pmap_load(l2); if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) { if (sva + L2_SIZE == va_next && eva >= va_next) { pmap_remove_l2(pmap, l2, sva, pmap_load(l1), &free, &lock); continue; } else if (pmap_demote_l2_locked(pmap, l2, sva, &lock) == NULL) continue; l3_paddr = pmap_load(l2); } /* * Weed out invalid mappings. */ if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. 
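 *
 * va_next was set above to the start of the next 2MB region, so
 * clamping it to eva below yields exactly the portion of the current
 * L3 page table that still lies within the removal range.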
*/ if (va_next > eva) va_next = eva; pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free, &lock); } if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; struct rwlock *lock; pd_entry_t *pde, tpde; pt_entry_t *pte, tpte; vm_offset_t va; struct spglist free; int lvl, pvh_gen, md_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry: rw_wlock(lock); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; pte = pmap_pte(pmap, va, &lvl); KASSERT(pte != NULL, ("pmap_remove_all: no page table entry found")); KASSERT(lvl == 2, ("pmap_remove_all: invalid pte level %d", lvl)); pmap_demote_l2_locked(pmap, pte, va, &lock); PMAP_UNLOCK(pmap); } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } pmap_resident_count_dec(pmap, 1); pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("pmap_remove_all: no page directory entry found")); KASSERT(lvl == 2, ("pmap_remove_all: invalid pde level %d", lvl)); tpde = pmap_load(pde); pte = pmap_l2_to_l3(pde, pv->pv_va); tpte = pmap_load_clear(pte); pmap_invalidate_page(pmap, pv->pv_va); if (tpte & ATTR_SW_WIRED) pmap->pm_stats.wired_count--; if ((tpte & ATTR_AF) != 0) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if (pmap_pte_dirty(tpte)) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, tpde, &free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(lock); vm_page_free_pages_toq(&free, true); } /* * pmap_protect_l2: do the things to protect a 2MB page in a pmap */ static void pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask, pt_entry_t nbits) { pd_entry_t old_l2; vm_page_t m, mt; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & L2_OFFSET) == 0, ("pmap_protect_l2: sva is not 2mpage aligned")); old_l2 = pmap_load(l2); KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2)); /* * Return if the L2 entry already has the desired access restrictions * in place. */ retry: if ((old_l2 & mask) == nbits) return; /* * When a dirty read/write superpage mapping is write protected, * update the dirty field of each of the superpage's constituent 4KB * pages. 
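 *
 * The loop below visits all L2_SIZE / PAGE_SIZE vm_page structures
 * backing the superpage, since any one of them may have been written
 * through this mapping.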
*/ if ((old_l2 & ATTR_SW_MANAGED) != 0 && (nbits & ATTR_AP(ATTR_AP_RO)) != 0 && pmap_pte_dirty(old_l2)) { m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK); for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) vm_page_dirty(mt); } if (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits)) goto retry; /* * Since a promotion must break the 4KB page mappings before making * the 2MB page mapping, a pmap_invalidate_page() suffices. */ pmap_invalidate_page(pmap, sva); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t va, va_next; pd_entry_t *l0, *l1, *l2; pt_entry_t *l3p, l3, mask, nbits; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } mask = nbits = 0; if ((prot & VM_PROT_WRITE) == 0) { mask |= ATTR_AP_RW_BIT | ATTR_SW_DBM; nbits |= ATTR_AP(ATTR_AP_RO); } if ((prot & VM_PROT_EXECUTE) == 0) { mask |= ATTR_XN; nbits |= ATTR_XN; } if (mask == 0) return; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (pmap_load(l2) == 0) continue; if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { if (sva + L2_SIZE == va_next && eva >= va_next) { pmap_protect_l2(pmap, l2, sva, mask, nbits); continue; } else if (pmap_demote_l2(pmap, l2, sva) == NULL) continue; } KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_protect: Invalid L2 entry after demotion")); if (va_next > eva) va_next = eva; va = va_next; for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++, sva += L3_SIZE) { l3 = pmap_load(l3p); retry: /* * Go to the next L3 entry if the current one is * invalid or already has the desired access * restrictions in place. (The latter case occurs * frequently. For example, in a "buildworld" * workload, almost 1 out of 4 L3 entries already * have the desired restrictions.) */ if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } /* * When a dirty read/write mapping is write protected, * update the page's dirty field. */ if ((l3 & ATTR_SW_MANAGED) != 0 && (nbits & ATTR_AP(ATTR_AP_RO)) != 0 && pmap_pte_dirty(l3)) vm_page_dirty(PHYS_TO_VM_PAGE(l3 & ~ATTR_MASK)); if (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) | nbits)) goto retry; if (va == va_next) va = sva; } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } PMAP_UNLOCK(pmap); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. * * If "promoted" is false, then the page table page "mpte" must be zero filled. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte->valid = promoted ? 
VM_PAGE_BITS_ALL : 0; return (vm_radix_insert(&pmap->pm_root, mpte)); } /* * Removes the page table page mapping the specified virtual address from the * specified pmap's collection of idle page table pages, and returns it. * Otherwise, returns NULL if there is no page table page corresponding to the * specified virtual address. */ static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); } /* * Performs a break-before-make update of a pmap entry. This is needed when * either promoting or demoting pages to ensure the TLB doesn't get into an * inconsistent state. */ static void pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte, vm_offset_t va, vm_size_t size) { register_t intr; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Ensure we don't get switched out with the page table in an * inconsistent state. We also need to ensure no interrupts fire * as they may make use of an address we are about to invalidate. */ intr = intr_disable(); critical_enter(); /* * Clear the old mapping's valid bit, but leave the rest of the entry * unchanged, so that a lockless, concurrent pmap_kextract() can still * lookup the physical address. */ pmap_clear_bits(pte, ATTR_DESCR_VALID); pmap_invalidate_range_nopin(pmap, va, va + size); /* Create the new mapping */ pmap_store(pte, newpte); dsb(ishst); critical_exit(); intr_restore(intr); } #if VM_NRESERVLEVEL > 0 /* * After promotion from 512 4KB page mappings to a single 2MB page mapping, * replace the many pv entries for the 4KB page mappings by a single pv entry * for the 2MB page mapping. */ static void pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; KASSERT((pa & L2_OFFSET) == 0, ("pmap_pv_promote_l2: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the first page's pv entry for this mapping to the 2mpage's * pv list. Aside from avoiding the cost of a call to get_pv_entry(), * a transfer avoids the possibility that get_pv_entry() calls * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the * mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = va & ~L2_OFFSET; pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + L2_SIZE - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } /* * Tries to promote the 512, contiguous 4KB page mappings that are within a * single level 2 table entry to a single 2MB page mapping. For promotion * to occur, two conditions must be met: (1) the 4KB page mappings must map * aligned, contiguous physical memory and (2) the 4KB page mappings must have * identical characteristics. 
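 *
 * Both conditions are verified by comparing every other PTE against
 * the first one: the loop below walks from the last entry downward,
 * expecting identical attributes and physical addresses that decrease
 * by exactly PAGE_SIZE at each step.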
*/ static void pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, struct rwlock **lockp) { pt_entry_t *firstl3, *l3, newl2, oldl3, pa; vm_page_t mpte; vm_offset_t sva; PMAP_LOCK_ASSERT(pmap, MA_OWNED); sva = va & ~L2_OFFSET; firstl3 = pmap_l2_to_l3(l2, sva); newl2 = pmap_load(firstl3); setl2: if (((newl2 & (~ATTR_MASK | ATTR_AF)) & L2_OFFSET) != ATTR_AF) { atomic_add_long(&pmap_l2_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" " in pmap %p", va, pmap); return; } if ((newl2 & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) == (ATTR_AP(ATTR_AP_RO) | ATTR_SW_DBM)) { if (!atomic_fcmpset_64(l2, &newl2, newl2 & ~ATTR_SW_DBM)) goto setl2; newl2 &= ~ATTR_SW_DBM; } pa = newl2 + L2_SIZE - PAGE_SIZE; for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) { oldl3 = pmap_load(l3); setl3: if ((oldl3 & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) == (ATTR_AP(ATTR_AP_RO) | ATTR_SW_DBM)) { if (!atomic_fcmpset_64(l3, &oldl3, oldl3 & ~ATTR_SW_DBM)) goto setl3; oldl3 &= ~ATTR_SW_DBM; } if (oldl3 != pa) { atomic_add_long(&pmap_l2_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" " in pmap %p", va, pmap); return; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the L2 * mapping the superpage is demoted by pmap_demote_l2() or * destroyed by pmap_remove_l3(). */ mpte = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_l2: page table page is out of range")); KASSERT(mpte->pindex == pmap_l2_pindex(va), ("pmap_promote_l2: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, mpte, true)) { atomic_add_long(&pmap_l2_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx in pmap %p", va, pmap); return; } if ((newl2 & ATTR_SW_MANAGED) != 0) pmap_pv_promote_l2(pmap, va, newl2 & ~ATTR_MASK, lockp); newl2 &= ~ATTR_DESCR_MASK; newl2 |= L2_BLOCK; pmap_update_entry(pmap, l2, newl2, sva, L2_SIZE); atomic_add_long(&pmap_l2_promotions, 1); CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, pmap); } #endif /* VM_NRESERVLEVEL > 0 */ /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. 
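 *
 * A psind of 1 requests a 2MB mapping: va must be 2MB aligned and
 * m->psind must be nonzero, and the request is handed directly to
 * pmap_enter_l2() below.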
*/ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { struct rwlock *lock; pd_entry_t *pde; pt_entry_t new_l3, orig_l3; pt_entry_t *l2, *l3; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; boolean_t nosleep; int lvl, rv; va = trunc_page(va); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); pa = VM_PAGE_TO_PHYS(m); new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) | L3_PAGE); if ((prot & VM_PROT_WRITE) == 0) new_l3 |= ATTR_AP(ATTR_AP_RO); if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY) new_l3 |= ATTR_XN; if ((flags & PMAP_ENTER_WIRED) != 0) new_l3 |= ATTR_SW_WIRED; if (va < VM_MAXUSER_ADDRESS) new_l3 |= ATTR_AP(ATTR_AP_USER) | ATTR_PXN; if ((m->oflags & VPO_UNMANAGED) == 0) { new_l3 |= ATTR_SW_MANAGED; if ((prot & VM_PROT_WRITE) != 0) { new_l3 |= ATTR_SW_DBM; if ((flags & VM_PROT_WRITE) == 0) new_l3 |= ATTR_AP(ATTR_AP_RO); } } CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); lock = NULL; PMAP_LOCK(pmap); if (psind == 1) { /* Assert the required virtual and physical alignment. */ KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned")); KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK, flags, m, &lock); goto out; } mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ retry: pde = pmap_pde(pmap, va, &lvl); if (pde != NULL && lvl == 2) { l3 = pmap_l2_to_l3(pde, va); if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK); mpte->wire_count++; } goto havel3; } else if (pde != NULL && lvl == 1) { l2 = pmap_l1_to_l2(pde, va); if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK && (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) { l3 = &l3[pmap_l3_index(va)]; if (va < VM_MAXUSER_ADDRESS) { mpte = PHYS_TO_VM_PAGE( pmap_load(l2) & ~ATTR_MASK); mpte->wire_count++; } goto havel3; } /* We need to allocate an L3 table. */ } if (va < VM_MAXUSER_ADDRESS) { nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; /* * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order * to handle the possibility that a superpage mapping for "va" * was created while we slept. */ mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), nosleep ? NULL : &lock); if (mpte == NULL && nosleep) { CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); rv = KERN_RESOURCE_SHORTAGE; goto out; } goto retry; } else panic("pmap_enter: missing L3 table for kernel va %#lx", va); havel3: orig_l3 = pmap_load(l3); opa = orig_l3 & ~ATTR_MASK; pv = NULL; /* * Is the specified virtual address already mapped? */ if (pmap_l3_valid(orig_l3)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if ((flags & PMAP_ENTER_WIRED) != 0 && (orig_l3 & ATTR_SW_WIRED) == 0) pmap->pm_stats.wired_count++; else if ((flags & PMAP_ENTER_WIRED) == 0 && (orig_l3 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count--; /* * Remove the extra PT page reference. */ if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } /* * Has the physical page changed? */ if (opa == pa) { /* * No, might be a protection or wiring change. 
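 *
 * Before revalidating, PGA_WRITEABLE is set below if this managed
 * mapping is being given write permission, since the existing PV
 * entry is reused as is.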
*/ if ((orig_l3 & ATTR_SW_MANAGED) != 0 && (new_l3 & ATTR_SW_DBM) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); goto validate; } /* * The physical page has changed. Temporarily invalidate * the mapping. */ orig_l3 = pmap_load_clear(l3); KASSERT((orig_l3 & ~ATTR_MASK) == opa, ("pmap_enter: unexpected pa update for %#lx", va)); if ((orig_l3 & ATTR_SW_MANAGED) != 0) { om = PHYS_TO_VM_PAGE(opa); /* * The pmap lock is sufficient to synchronize with * concurrent calls to pmap_page_test_mappings() and * pmap_ts_referenced(). */ if (pmap_pte_dirty(orig_l3)) vm_page_dirty(om); if ((orig_l3 & ATTR_AF) != 0) vm_page_aflag_set(om, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); pv = pmap_pvh_remove(&om->md, pmap, va); if ((m->oflags & VPO_UNMANAGED) != 0) free_pv_entry(pmap, pv); if ((om->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); } pmap_invalidate_page(pmap, va); orig_l3 = 0; } else { /* * Increment the counters. */ if ((new_l3 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count++; pmap_resident_count_inc(pmap, 1); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0) { if (pv == NULL) { pv = get_pv_entry(pmap, &lock); pv->pv_va = va; } CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((new_l3 & ATTR_SW_DBM) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } validate: /* * Sync icache if exec permission and attribute VM_MEMATTR_WRITE_BACK * is set. Do it now, before the mapping is stored and made * valid for hardware table walk. If done later, then other can * access this page before caches are properly synced. * Don't do it for kernel memory which is mapped with exec * permission even if the memory isn't going to hold executable * code. The only time when icache sync is needed is after * kernel module is loaded and the relocation info is processed. * And it's done in elf_cpu_load_file(). */ if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK && (opa != pa || (orig_l3 & ATTR_XN))) cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE); /* * Update the L3 entry */ if (pmap_l3_valid(orig_l3)) { KASSERT(opa == pa, ("pmap_enter: invalid update")); if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) { /* same PA, different attributes */ /* XXXMJ need to reload orig_l3 for hardware DBM. */ pmap_load_store(l3, new_l3); pmap_invalidate_page(pmap, va); if ((orig_l3 & ATTR_SW_MANAGED) != 0 && pmap_pte_dirty(orig_l3)) vm_page_dirty(m); } else { /* * orig_l3 == new_l3 * This can happens if multiple threads simultaneously * access not yet mapped page. This bad for performance * since this can cause full demotion-NOP-promotion * cycle. * Another possible reasons are: * - VM and pmap memory layout are diverged * - tlb flush is missing somewhere and CPU doesn't see * actual mapping. */ CTR4(KTR_PMAP, "%s: already mapped page - " "pmap %p va 0x%#lx pte 0x%lx", __func__, pmap, va, new_l3); } } else { /* New mapping */ pmap_store(l3, new_l3); dsb(ishst); } #if VM_NRESERVLEVEL > 0 if ((mpte == NULL || mpte->wire_count == NL3PG) && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) { pmap_promote_l2(pmap, pde, va, &lock); } #endif rv = KERN_SUCCESS; out: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (rv); } /* * Tries to create a read- and/or execute-only 2MB page mapping. 
Returns true * if successful. Returns false if (1) a page table page cannot be allocated * without sleeping, (2) a mapping already exists at the specified virtual * address, or (3) a PV entry cannot be allocated without reclaiming another * PV entry. */ static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp) { pd_entry_t new_l2; PMAP_LOCK_ASSERT(pmap, MA_OWNED); new_l2 = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) | ATTR_AP(ATTR_AP_RO) | L2_BLOCK); if ((m->oflags & VPO_UNMANAGED) == 0) { new_l2 |= ATTR_SW_MANAGED; new_l2 &= ~ATTR_AF; } if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY) new_l2 |= ATTR_XN; if (va < VM_MAXUSER_ADDRESS) new_l2 |= ATTR_AP(ATTR_AP_USER) | ATTR_PXN; return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == KERN_SUCCESS); } /* * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and * a mapping already exists at the specified virtual address. Returns * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. * * The parameter "m" is only used when creating a managed, writeable mapping. */ static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, vm_page_t m, struct rwlock **lockp) { struct spglist free; pd_entry_t *l2, old_l2; vm_page_t l2pg, mt; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); l2 = &l2[pmap_l2_index(va)]; if ((old_l2 = pmap_load(l2)) != 0) { KASSERT(l2pg->wire_count > 1, ("pmap_enter_l2: l2pg's wire count is too low")); if ((flags & PMAP_ENTER_NOREPLACE) != 0) { l2pg->wire_count--; CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_FAILURE); } SLIST_INIT(&free); if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free, lockp); else pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE, &free, lockp); vm_page_free_pages_toq(&free, true); if (va >= VM_MAXUSER_ADDRESS) { /* * Both pmap_remove_l2() and pmap_remove_l3_range() * will leave the kernel page table page zero filled. * Nonetheless, the TLB could have an intermediate * entry for the kernel page table page. */ mt = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK); if (pmap_insert_pt_page(pmap, mt, false)) panic("pmap_enter_l2: trie insert failed"); pmap_clear(l2); pmap_invalidate_page(pmap, va); } else KASSERT(pmap_load(l2) == 0, ("pmap_enter_l2: non-zero L2 entry %p", l2)); } if ((new_l2 & ATTR_SW_MANAGED) != 0) { /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { SLIST_INIT(&free); if (pmap_unwire_l3(pmap, va, l2pg, &free)) { /* * Although "va" is not mapped, the TLB could * nonetheless have intermediate entries that * refer to the freed page table pages. * Invalidate those entries. * * XXX redundant invalidation (See * _pmap_unwire_l3().) 
*/ pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } if ((new_l2 & ATTR_SW_DBM) != 0) for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) vm_page_aflag_set(mt, PGA_WRITEABLE); } /* * Increment counters. */ if ((new_l2 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; /* * Map the superpage. */ pmap_store(l2, new_l2); dsb(ishst); atomic_add_long(&pmap_l2_mappings, 1); CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", va, pmap); return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { struct rwlock *lock; vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; lock = NULL; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && m->psind == 1 && pmap_ps_enabled(pmap) && pmap_enter_2mpage(pmap, va, m, prot, &lock)) m = &m[L2_SIZE / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock); m = TAILQ_NEXT(m, listq); } if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { struct rwlock *lock; lock = NULL; PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) { struct spglist free; pd_entry_t *pde; pt_entry_t *l2, *l3, l3_val; vm_paddr_t pa; int lvl; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { vm_pindex_t l2pindex; /* * Calculate pagetable page index */ l2pindex = pmap_l2_pindex(va); if (mpte && (mpte->pindex == l2pindex)) { mpte->wire_count++; } else { /* * Get the l2 entry */ pde = pmap_pde(pmap, va, &lvl); /* * If the page table page is mapped, we just increment * the hold count, and activate it. Otherwise, we * attempt to allocate a page table page. If this * attempt fails, we don't retry. Instead, we give up. 
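 *
 * If the lookup stopped at level 1 and the L2 entry is already a
 * block mapping, a superpage covers this address and the quick path
 * simply bails out rather than demoting it.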
*/ if (lvl == 1) { l2 = pmap_l1_to_l2(pde, va); if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) return (NULL); } if (lvl == 2 && pmap_load(pde) != 0) { mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK); mpte->wire_count++; } else { /* * Pass NULL instead of the PV list lock * pointer, because we don't intend to sleep. */ mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); if (mpte == NULL) return (mpte); } } l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); l3 = &l3[pmap_l3_index(va)]; } else { mpte = NULL; pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_enter_quick_locked: Invalid level %d", lvl)); l3 = pmap_l2_to_l3(pde, va); } /* * Abort if a mapping already exists. */ if (pmap_load(l3) != 0) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_l3(pmap, va, mpte, &free)) { pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap_resident_count_inc(pmap, 1); pa = VM_PAGE_TO_PHYS(m); l3_val = pa | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) | ATTR_AP(ATTR_AP_RO) | L3_PAGE; if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY) l3_val |= ATTR_XN; else if (va < VM_MAXUSER_ADDRESS) l3_val |= ATTR_PXN; /* * Now validate mapping with RO protection */ if ((m->oflags & VPO_UNMANAGED) == 0) { l3_val |= ATTR_SW_MANAGED; l3_val &= ~ATTR_AF; } /* Sync icache before the mapping is stored to PTE */ if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE); pmap_store(l3, l3_val); dsb(ishst); return (mpte); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. 
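 *
 * ATTR_SW_WIRED is a software-defined PTE bit that the MMU ignores,
 * so clearing it only requires updating the pmap's wired_count
 * statistics.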
*/ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va_next; pd_entry_t *l0, *l1, *l2; pt_entry_t *l3; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (pmap_load(l2) == 0) continue; if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { if ((pmap_load(l2) & ATTR_SW_WIRED) == 0) panic("pmap_unwire: l2 %#jx is missing " "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2)); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + L2_SIZE == va_next && eva >= va_next) { pmap_clear_bits(l2, ATTR_SW_WIRED); pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; continue; } else if (pmap_demote_l2(pmap, l2, sva) == NULL) panic("pmap_unwire: demotion failed"); } KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_unwire: Invalid l2 entry after demotion")); if (va_next > eva) va_next = eva; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { if (pmap_load(l3) == 0) continue; if ((pmap_load(l3) & ATTR_SW_WIRED) == 0) panic("pmap_unwire: l3 %#jx is missing " "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3)); /* * ATTR_SW_WIRED must be cleared atomically. Although * the pmap lock synchronizes access to ATTR_SW_WIRED, * the System MMU may write to the entry concurrently. */ pmap_clear_bits(l3, ATTR_SW_WIRED); pmap->pm_stats.wired_count--; } } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. * * Because the executable mappings created by this routine are copied, * it should not have to flush the instruction cache. 
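 *
 * Copying is only attempted when dst_addr equals src_addr.  A 2MB
 * block mapping is copied as a block when the destination L2 entry is
 * free and the range covers a full, aligned superpage; otherwise the
 * block is skipped rather than demoted.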
*/ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct rwlock *lock; struct spglist free; pd_entry_t *l0, *l1, *l2, srcptepaddr; pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte; vm_offset_t addr, end_addr, va_next; vm_page_t dst_l2pg, dstmpte, srcmpte; if (dst_addr != src_addr) return; end_addr = src_addr + len; lock = NULL; if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } for (addr = src_addr; addr < end_addr; addr = va_next) { l0 = pmap_l0(src_pmap, addr); if (pmap_load(l0) == 0) { va_next = (addr + L0_SIZE) & ~L0_OFFSET; if (va_next < addr) va_next = end_addr; continue; } l1 = pmap_l0_to_l1(l0, addr); if (pmap_load(l1) == 0) { va_next = (addr + L1_SIZE) & ~L1_OFFSET; if (va_next < addr) va_next = end_addr; continue; } va_next = (addr + L2_SIZE) & ~L2_OFFSET; if (va_next < addr) va_next = end_addr; l2 = pmap_l1_to_l2(l1, addr); srcptepaddr = pmap_load(l2); if (srcptepaddr == 0) continue; if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) { if ((addr & L2_OFFSET) != 0 || addr + L2_SIZE > end_addr) continue; dst_l2pg = pmap_alloc_l2(dst_pmap, addr, NULL); if (dst_l2pg == NULL) break; l2 = (pd_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_l2pg)); l2 = &l2[pmap_l2_index(addr)]; if (pmap_load(l2) == 0 && ((srcptepaddr & ATTR_SW_MANAGED) == 0 || pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr, PMAP_ENTER_NORECLAIM, &lock))) { mask = ATTR_AF | ATTR_SW_WIRED; nbits = 0; if ((srcptepaddr & ATTR_SW_DBM) != 0) nbits |= ATTR_AP_RW_BIT; pmap_store(l2, (srcptepaddr & ~mask) | nbits); pmap_resident_count_inc(dst_pmap, L2_SIZE / PAGE_SIZE); atomic_add_long(&pmap_l2_mappings, 1); } else dst_l2pg->wire_count--; continue; } KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_copy: invalid L2 entry")); srcptepaddr &= ~ATTR_MASK; srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); KASSERT(srcmpte->wire_count > 0, ("pmap_copy: source page table page is unused")); if (va_next > end_addr) va_next = end_addr; src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); src_pte = &src_pte[pmap_l3_index(addr)]; dstmpte = NULL; for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { ptetemp = pmap_load(src_pte); /* * We only virtual copy managed pages. */ if ((ptetemp & ATTR_SW_MANAGED) == 0) continue; if (dstmpte != NULL) { KASSERT(dstmpte->pindex == pmap_l2_pindex(addr), ("dstmpte pindex/addr mismatch")); dstmpte->wire_count++; } else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr, NULL)) == NULL) goto out; dst_pte = (pt_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); dst_pte = &dst_pte[pmap_l3_index(addr)]; if (pmap_load(dst_pte) == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & ~ATTR_MASK), &lock)) { /* * Clear the wired, modified, and accessed * (referenced) bits during the copy. */ mask = ATTR_AF | ATTR_SW_WIRED; nbits = 0; if ((ptetemp & ATTR_SW_DBM) != 0) nbits |= ATTR_AP_RW_BIT; pmap_store(dst_pte, (ptetemp & ~mask) | nbits); pmap_resident_count_inc(dst_pmap, 1); } else { SLIST_INIT(&free); if (pmap_unwire_l3(dst_pmap, addr, dstmpte, &free)) { /* * Although "addr" is not mapped, * the TLB could nonetheless have * intermediate entries that refer * to the freed page table pages. * Invalidate those entries. * * XXX redundant invalidation */ pmap_invalidate_page(dst_pmap, addr); vm_page_free_pages_toq(&free, true); } goto out; } /* Have we copied all of the valid mappings? 
*/ if (dstmpte->wire_count >= srcmpte->wire_count) break; } } out: /* * XXX This barrier may not be needed because the destination pmap is * not active. */ dsb(ishst); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); if (off == 0 && size == PAGE_SIZE) pagezero((void *)va); else bzero((char *)va + off, size); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); pagecopy((void *)src, (void *)dst); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_page_t m_a, m_b; vm_paddr_t p_a, p_b; vm_offset_t a_pg_offset, b_pg_offset; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; m_a = ma[a_offset >> PAGE_SHIFT]; p_a = m_a->phys_addr; b_pg_offset = b_offset & PAGE_MASK; m_b = mb[b_offset >> PAGE_SHIFT]; p_b = m_b->phys_addr; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); cnt = min(cnt, PAGE_SIZE - b_pg_offset); if (__predict_false(!PHYS_IN_DMAP(p_a))) { panic("!DMAP a %lx", p_a); } else { a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; } if (__predict_false(!PHYS_IN_DMAP(p_b))) { panic("!DMAP b %lx", p_b); } else { b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; } bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); } void pmap_quick_remove_page(vm_offset_t addr) { } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_runlock(lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. 
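 *
 * Both the page's own 4KB PV list and, for non-fictitious pages, the
 * containing 2MB page's PV list are scanned, so wired superpage
 * mappings are counted as well.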
*/ int pmap_page_wired_mappings(vm_page_t m) { struct rwlock *lock; struct md_page *pvh; pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; int count, lvl, md_gen, pvh_gen; if ((m->oflags & VPO_UNMANAGED) != 0) return (0); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: count = 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0) count++; PMAP_UNLOCK(pmap); } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0) count++; PMAP_UNLOCK(pmap); } } rw_runlock(lock); return (count); } /* * Destroy all managed, non-wired mappings in the given user-space * pmap. This pmap cannot be active on any processor besides the * caller. * * This function cannot be applied to the kernel pmap. Moreover, it * is not intended for general use. It is only to be used during * process termination. Consequently, it can be implemented in ways * that make it faster than pmap_remove(). First, it can more quickly * destroy mappings by iterating over the pmap's collection of PV * entries, rather than searching the page table. Second, it doesn't * have to test and clear the page table entries atomically, because * no processor is currently accessing the user address space. In * particular, a page table entry's dirty bit won't change state once * this function starts. 
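 *
 * Wired mappings are the one exception: they are left in place, and a
 * PV chunk that still describes a wired mapping is kept on the pmap's
 * chunk list instead of being freed.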
*/ void pmap_remove_pages(pmap_t pmap) { pd_entry_t *pde; pt_entry_t *pte, tpte; struct spglist free; vm_page_t m, ml3, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct rwlock *lock; int64_t bit; uint64_t inuse, bitmask; int allfree, field, freed, idx, lvl; vm_paddr_t pa; lock = NULL; SLIST_INIT(&free); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; freed = 0; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = ffsl(inuse) - 1; bitmask = 1UL << bit; idx = field * 64 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("Attempting to remove an unmapped page")); switch(lvl) { case 1: pte = pmap_l1_to_l2(pde, pv->pv_va); tpte = pmap_load(pte); KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, ("Attempting to remove an invalid " "block: %lx", tpte)); tpte = pmap_load(pte); break; case 2: pte = pmap_l2_to_l3(pde, pv->pv_va); tpte = pmap_load(pte); KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, ("Attempting to remove an invalid " "page: %lx", tpte)); break; default: panic( "Invalid page directory level: %d", lvl); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & ATTR_SW_WIRED) { allfree = 0; continue; } pa = tpte & ~ATTR_MASK; m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad pte %#jx", (uintmax_t)tpte)); /* * Because this pmap is not active on other * processors, the dirty bit cannot have * changed state since we last loaded pte. */ pmap_clear(pte); /* * Update the vm_page_t clean/reference bits. 
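 *
 * If the entry was dirty, a 2MB block dirties every constituent 4KB
 * page below, while a 4KB mapping dirties only its single page.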
*/ if (pmap_pte_dirty(tpte)) { switch (lvl) { case 1: for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) vm_page_dirty(mt); break; case 2: vm_page_dirty(m); break; } } CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); /* Mark free */ pc->pc_map[field] |= bitmask; switch (lvl) { case 1: pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); pvh = pa_to_pvh(tpte & ~ATTR_MASK); TAILQ_REMOVE(&pvh->pv_list, pv,pv_next); pvh->pv_gen++; if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) if ((mt->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } ml3 = pmap_remove_pt_page(pmap, pv->pv_va); if (ml3 != NULL) { KASSERT(ml3->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pages: l3 page not promoted")); pmap_resident_count_dec(pmap,1); KASSERT(ml3->wire_count == NL3PG, ("pmap_remove_pages: l3 page wire count error")); ml3->wire_count = 0; pmap_add_delayed_free_list(ml3, &free, FALSE); } break; case 2: pmap_resident_count_dec(pmap, 1); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((m->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh( VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } break; } pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde), &free); freed++; } } PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } pmap_invalidate_all(pmap); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } /* * This is used to check if a page has been accessed or modified. As we * don't have a bit to see if it has been modified we have to assume it * has been if the page is read/write. 
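 * Concretely, a mapping is reported as modified when its access permissions are ATTR_AP(ATTR_AP_RW), and as accessed when ATTR_AF is set on a valid L3 page (or L2 block) entry.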
*/ static boolean_t pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) { struct rwlock *lock; pv_entry_t pv; struct md_page *pvh; pt_entry_t *pte, mask, value; pmap_t pmap; int lvl, md_gen, pvh_gen; boolean_t rv; rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); KASSERT(lvl == 3, ("pmap_page_test_mappings: Invalid level %d", lvl)); mask = 0; value = 0; if (modified) { mask |= ATTR_AP_RW_BIT; value |= ATTR_AP(ATTR_AP_RW); } if (accessed) { mask |= ATTR_AF | ATTR_DESCR_MASK; value |= ATTR_AF | L3_PAGE; } rv = (pmap_load(pte) & mask) == value; PMAP_UNLOCK(pmap); if (rv) goto out; } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); KASSERT(lvl == 2, ("pmap_page_test_mappings: Invalid level %d", lvl)); mask = 0; value = 0; if (modified) { mask |= ATTR_AP_RW_BIT; value |= ATTR_AP(ATTR_AP_RW); } if (accessed) { mask |= ATTR_AF | ATTR_DESCR_MASK; value |= ATTR_AF | L2_BLOCK; } rv = (pmap_load(pte) & mask) == value; PMAP_UNLOCK(pmap); if (rv) goto out; } } out: rw_runlock(lock); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); return (pmap_page_test_mappings(m, FALSE, TRUE)); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pt_entry_t *pte; boolean_t rv; int lvl; rv = FALSE; PMAP_LOCK(pmap); pte = pmap_pte(pmap, addr, &lvl); if (pte != NULL && pmap_load(pte) != 0) { rv = TRUE; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); return (pmap_page_test_mappings(m, TRUE, FALSE)); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pmap_t pmap; struct rwlock *lock; pv_entry_t next_pv, pv; pt_entry_t oldpte, *pte; vm_offset_t va; int lvl, md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. 
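 * As elsewhere in this file, each pmap lock is acquired under the PV list lock with PMAP_TRYLOCK(); if that fails, the PV list lock is dropped, the pmap lock is taken, and the pv_gen generation counters are rechecked before the scan restarts.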
*/ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry_pv_loop: rw_wlock(lock); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } va = pv->pv_va; pte = pmap_pte(pmap, pv->pv_va, &lvl); if ((pmap_load(pte) & ATTR_SW_DBM) != 0) (void)pmap_demote_l2_locked(pmap, pte, va, &lock); KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); oldpte = pmap_load(pte); retry: if ((oldpte & ATTR_SW_DBM) != 0) { if (!atomic_fcmpset_long(pte, &oldpte, (oldpte | ATTR_AP_RW_BIT) & ~ATTR_SW_DBM)) goto retry; if ((oldpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); vm_page_aflag_clear(m, PGA_WRITEABLE); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; struct rwlock *lock; pd_entry_t *pde, tpde; pt_entry_t *pte, tpte; vm_offset_t va; vm_paddr_t pa; int cleared, lvl, md_gen, not_cleared, pvh_gen; struct spglist free; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); SLIST_INIT(&free); cleared = 0; pa = VM_PAGE_TO_PHYS(m); lock = PHYS_TO_PV_LIST_LOCK(pa); pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : pa_to_pvh(pa); rw_wlock(lock); retry: not_cleared = 0; if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("pmap_ts_referenced: no l1 table found")); KASSERT(lvl == 1, ("pmap_ts_referenced: invalid pde level %d", lvl)); tpde = pmap_load(pde); KASSERT((tpde & ATTR_DESCR_MASK) == L1_TABLE, ("pmap_ts_referenced: found an invalid l1 table")); pte = pmap_l1_to_l2(pde, pv->pv_va); tpte = pmap_load(pte); if (pmap_pte_dirty(tpte)) { /* * Although "tpte" is mapping a 2MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((tpte & ATTR_AF) != 0) { /* * Since this reference bit is shared by 512 4KB pages, * it should not be cleared every time it is tested. * Apply a simple "hash" function on the physical page * number, the virtual superpage number, and the pmap * address to select one 4KB page out of the 512 on * which testing the reference bit will result in * clearing that reference bit. This function is * designed to avoid the selection of the same 4KB page * for every 2MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^ (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && (tpte & ATTR_SW_WIRED) == 0) { pmap_clear_bits(pte, ATTR_AF); pmap_invalidate_page(pmap, pv->pv_va); cleared++; } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; } if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("pmap_ts_referenced: no l2 table found")); KASSERT(lvl == 2, ("pmap_ts_referenced: invalid pde level %d", lvl)); tpde = pmap_load(pde); KASSERT((tpde & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_ts_referenced: found an invalid l2 table")); pte = pmap_l2_to_l3(pde, pv->pv_va); tpte = pmap_load(pte); if (pmap_pte_dirty(tpte)) vm_page_dirty(m); if ((tpte & ATTR_AF) != 0) { if ((tpte & ATTR_SW_WIRED) == 0) { pmap_clear_bits(pte, ATTR_AF); pmap_invalidate_page(pmap, pv->pv_va); cleared++; } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. 
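 * Moving the entry just examined to the tail means successive calls tend to start with a different mapping, spreading the clearing of reference bits across all of the page's mappings.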
*/ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + not_cleared < PMAP_TS_REFERENCED_MAX); out: rw_wunlock(lock); vm_page_free_pages_toq(&free, true); return (cleared + not_cleared); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { struct rwlock *lock; vm_offset_t va, va_next; vm_page_t m; pd_entry_t *l0, *l1, *l2, oldl2; pt_entry_t *l3, oldl3; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); oldl2 = pmap_load(l2); if (oldl2 == 0) continue; if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) { if ((oldl2 & ATTR_SW_MANAGED) == 0) continue; lock = NULL; if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) { if (lock != NULL) rw_wunlock(lock); /* * The 2MB page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Choosing the last page * within the address range [sva, min(va_next, eva)) * generally results in more repromotions. Since the * underlying page table page is fully populated, this * removal never frees a page table page. */ if ((oldl2 & ATTR_SW_WIRED) == 0) { va = eva; if (va > va_next) va = va_next; va -= PAGE_SIZE; KASSERT(va >= sva, ("pmap_advise: no address gap")); l3 = pmap_l2_to_l3(l2, va); KASSERT(pmap_load(l3) != 0, ("pmap_advise: invalid PTE")); pmap_remove_l3(pmap, l3, va, pmap_load(l2), NULL, &lock); } if (lock != NULL) rw_wunlock(lock); } KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_advise: invalid L2 entry after demotion")); if (va_next > eva) va_next = eva; va = va_next; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { oldl3 = pmap_load(l3); if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) != (ATTR_SW_MANAGED | L3_PAGE)) goto maybe_invlrng; else if (pmap_pte_dirty(oldl3)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(oldl3 & ~ATTR_MASK); vm_page_dirty(m); } while (!atomic_fcmpset_long(l3, &oldl3, (oldl3 & ~ATTR_AF) | ATTR_AP(ATTR_AP_RO))) cpu_spinwait(); } else if ((oldl3 & ATTR_AF) != 0) pmap_clear_bits(l3, ATTR_AF); else goto maybe_invlrng; if (va == va_next) va = sva; continue; maybe_invlrng: if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } PMAP_UNLOCK(pmap); } /* * Clear the modify bits on the specified physical page. 
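 * Mappings that are writable and tracked by ATTR_SW_DBM are downgraded to read-only, so a later write access faults and the page can be observed as dirty again; a 2MB mapping is first demoted so that only the 4KB page under test is write-protected.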
*/ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pmap_t pmap; pv_entry_t next_pv, pv; pd_entry_t *l2, oldl2; pt_entry_t *l3, oldl3; vm_offset_t va; int md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have ATTR_SW_DBM * set. If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->aflags & PGA_WRITEABLE) == 0) return; pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_wlock(lock); restart: TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } va = pv->pv_va; l2 = pmap_l2(pmap, va); oldl2 = pmap_load(l2); /* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */ if ((oldl2 & ATTR_SW_DBM) != 0 && pmap_demote_l2_locked(pmap, l2, va, &lock) && (oldl2 & ATTR_SW_WIRED) == 0) { /* * Write protect the mapping to a single page so that * a subsequent write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - (oldl2 & ~ATTR_MASK); l3 = pmap_l2_to_l3(l2, va); oldl3 = pmap_load(l3); while (!atomic_fcmpset_long(l3, &oldl3, (oldl3 & ~ATTR_SW_DBM) | ATTR_AP(ATTR_AP_RO))) cpu_spinwait(); vm_page_dirty(m); pmap_invalidate_page(pmap, va); } PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l2 = pmap_l2(pmap, pv->pv_va); l3 = pmap_l2_to_l3(l2, pv->pv_va); oldl3 = pmap_load(l3); if (pmap_l3_valid(oldl3) && (oldl3 & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM) { pmap_set_bits(l3, ATTR_AP(ATTR_AP_RO)); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t va, offset; pd_entry_t *pde; pt_entry_t *l2; int i, lvl, l2_blocks, free_l2_count, start_idx; if (!vm_initialized) { /* * No L3 ptables so map entire L2 blocks where start VA is: * preinit_map_va + start_idx * L2_SIZE * There may be duplicate mappings (multiple VA -> same PA) but * ARM64 dcache is always PIPT so that's acceptable. 
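 * For example (illustrative numbers only): a request whose pa starts 1MB into a 2MB block with size 3MB gives l2_blocks = (roundup2(pa + size, L2_SIZE) - rounddown2(pa, L2_SIZE)) >> L2_SHIFT = 2, and the returned pointer is (va + (pa & L2_OFFSET)), i.e. 1MB into the first mapped block.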
*/ if (size == 0) return (NULL); /* Calculate how many L2 blocks are needed for the mapping */ l2_blocks = (roundup2(pa + size, L2_SIZE) - rounddown2(pa, L2_SIZE)) >> L2_SHIFT; offset = pa & L2_OFFSET; if (preinit_map_va == 0) return (NULL); /* Map 2MiB L2 blocks from reserved VA space */ free_l2_count = 0; start_idx = -1; /* Find enough free contiguous VA space */ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (free_l2_count > 0 && ppim->pa != 0) { /* Not enough space here */ free_l2_count = 0; start_idx = -1; continue; } if (ppim->pa == 0) { /* Free L2 block */ if (start_idx == -1) start_idx = i; free_l2_count++; if (free_l2_count == l2_blocks) break; } } if (free_l2_count != l2_blocks) panic("%s: too many preinit mappings", __func__); va = preinit_map_va + (start_idx * L2_SIZE); for (i = start_idx; i < start_idx + l2_blocks; i++) { /* Mark entries as allocated */ ppim = pmap_preinit_mapping + i; ppim->pa = pa; ppim->va = va + offset; ppim->size = size; } /* Map L2 blocks */ pa = rounddown2(pa, L2_SIZE); for (i = 0; i < l2_blocks; i++) { pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_mapbios: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 1, ("pmap_mapbios: Invalid level %d", lvl)); /* Insert L2_BLOCK */ l2 = pmap_l1_to_l2(pde, va); pmap_load_store(l2, pa | ATTR_DEFAULT | ATTR_XN | ATTR_IDX(CACHED_MEMORY) | L2_BLOCK); va += L2_SIZE; pa += L2_SIZE; } pmap_invalidate_all(kernel_pmap); va = preinit_map_va + (start_idx * L2_SIZE); } else { /* kva_alloc may be used to map the pages */ offset = pa & PAGE_MASK; size = round_page(offset + size); va = kva_alloc(size); if (va == 0) panic("%s: Couldn't allocate KVA", __func__); pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl)); /* L3 table is linked */ va = trunc_page(va); pa = trunc_page(pa); pmap_kenter(va, size, pa, CACHED_MEMORY); } return ((void *)(va + offset)); } void pmap_unmapbios(vm_offset_t va, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t offset, tmpsize, va_trunc; pd_entry_t *pde; pt_entry_t *l2; int i, lvl, l2_blocks, block; bool preinit_map; l2_blocks = (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT; KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size)); /* Remove preinit mapping */ preinit_map = false; block = 0; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == va) { KASSERT(ppim->size == size, ("pmap_unmapbios: size mismatch")); ppim->va = 0; ppim->pa = 0; ppim->size = 0; preinit_map = true; offset = block * L2_SIZE; va_trunc = rounddown2(va, L2_SIZE) + offset; /* Remove L2_BLOCK */ pde = pmap_pde(kernel_pmap, va_trunc, &lvl); KASSERT(pde != NULL, ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va_trunc)); l2 = pmap_l1_to_l2(pde, va_trunc); pmap_clear(l2); if (block == (l2_blocks - 1)) break; block++; } } if (preinit_map) { pmap_invalidate_all(kernel_pmap); return; } /* Unmap the pages reserved with kva_alloc. */ if (vm_initialized) { offset = va & PAGE_MASK; size = round_page(offset + size); va = trunc_page(va); pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_unmapbios: Invalid level %d", lvl)); /* Unmap and invalidate the pages */ for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) pmap_kremove(va + tmpsize); kva_free(va, size); } } /* * Sets the memory attribute for the specified page. 
*/ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pv_memattr = ma; /* * If "m" is a normal page, update its direct mapping. This update * can be relied upon to perform any cache operations that are * required for data coherence. */ if ((m->flags & PG_FICTITIOUS) == 0 && pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, m->md.pv_memattr) != 0) panic("memory attribute change on the direct map failed"); } /* * Changes the specified virtual address range's memory type to that given by * the parameter "mode". The specified virtual address range must be * completely contained within either the direct map or the kernel map. If * the virtual address range is contained within the kernel map, then the * memory type for each of the corresponding ranges of the direct map is also * changed. (The corresponding ranges of the direct map are those ranges that * map the same physical pages as the specified virtual address range.) These * changes to the direct map are necessary because Intel describes the * behavior of their processors as "undefined" if two or more mappings to the * same physical page have different memory types. * * Returns zero if the change completed successfully, and either EINVAL or * ENOMEM if the change failed. Specifically, EINVAL is returned if some part * of the virtual address range was not mapped, and ENOMEM is returned if * there was insufficient memory available to complete the change. In the * latter case, the memory type may have been changed on some part of the * virtual address range or the direct map. */ int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) { int error; PMAP_LOCK(kernel_pmap); error = pmap_change_attr_locked(va, size, mode); PMAP_UNLOCK(kernel_pmap); return (error); } static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) { vm_offset_t base, offset, tmpva; pt_entry_t l3, *pte, *newpte; int lvl; PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); if (!VIRT_IN_DMAP(base)) return (EINVAL); for (tmpva = base; tmpva < base + size; ) { pte = pmap_pte(kernel_pmap, tmpva, &lvl); if (pte == NULL) return (EINVAL); if ((pmap_load(pte) & ATTR_IDX_MASK) == ATTR_IDX(mode)) { /* * We already have the correct attribute, * ignore this entry. */ switch (lvl) { default: panic("Invalid DMAP table level: %d\n", lvl); case 1: tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE; break; case 2: tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE; break; case 3: tmpva += PAGE_SIZE; break; } } else { /* * Split the entry to an level 3 table, then * set the new attribute. */ switch (lvl) { default: panic("Invalid DMAP table level: %d\n", lvl); case 1: newpte = pmap_demote_l1(kernel_pmap, pte, tmpva & ~L1_OFFSET); if (newpte == NULL) return (EINVAL); pte = pmap_l1_to_l2(pte, tmpva); case 2: newpte = pmap_demote_l2(kernel_pmap, pte, tmpva); if (newpte == NULL) return (EINVAL); pte = pmap_l2_to_l3(pte, tmpva); case 3: /* Update the entry */ l3 = pmap_load(pte); l3 &= ~ATTR_IDX_MASK; l3 |= ATTR_IDX(mode); if (mode == DEVICE_MEMORY) l3 |= ATTR_XN; pmap_update_entry(kernel_pmap, pte, l3, tmpva, PAGE_SIZE); /* * If moving to a non-cacheable entry flush * the cache. */ if (mode == VM_MEMATTR_UNCACHEABLE) cpu_dcache_wbinv_range(tmpva, L3_SIZE); break; } tmpva += PAGE_SIZE; } } return (0); } /* * Create an L2 table to map all addresses within an L1 mapping. 
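 * Each of the Ln_ENTRIES (512) new L2 block entries inherits the old L1 entry's attributes and maps L2_SIZE of the original L1_SIZE range.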
*/ static pt_entry_t * pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va) { pt_entry_t *l2, newl2, oldl1; vm_offset_t tmpl1; vm_paddr_t l2phys, phys; vm_page_t ml2; int i; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldl1 = pmap_load(l1); KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK, ("pmap_demote_l1: Demoting a non-block entry")); KASSERT((va & L1_OFFSET) == 0, ("pmap_demote_l1: Invalid virtual address %#lx", va)); KASSERT((oldl1 & ATTR_SW_MANAGED) == 0, ("pmap_demote_l1: Level 1 table shouldn't be managed")); tmpl1 = 0; if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) { tmpl1 = kva_alloc(PAGE_SIZE); if (tmpl1 == 0) return (NULL); } if ((ml2 = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx" " in pmap %p", va, pmap); return (NULL); } l2phys = VM_PAGE_TO_PHYS(ml2); l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys); /* Address the range points at */ phys = oldl1 & ~ATTR_MASK; /* The attributed from the old l1 table to be copied */ newl2 = oldl1 & ATTR_MASK; /* Create the new entries */ for (i = 0; i < Ln_ENTRIES; i++) { l2[i] = newl2 | phys; phys += L2_SIZE; } KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK), ("Invalid l2 page (%lx != %lx)", l2[0], (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK)); if (tmpl1 != 0) { pmap_kenter(tmpl1, PAGE_SIZE, DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET, CACHED_MEMORY); l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK)); } pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE); if (tmpl1 != 0) { pmap_kremove(tmpl1); kva_free(tmpl1, PAGE_SIZE); } return (l2); } static void pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3) { pt_entry_t *l3; for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) { *l3 = newl3; newl3 += L3_SIZE; } } static void pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2, struct rwlock **lockp) { struct spglist free; SLIST_INIT(&free); (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free, lockp); vm_page_free_pages_toq(&free, true); } /* * Create an L3 table to map all addresses within an L2 mapping. */ static pt_entry_t * pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, struct rwlock **lockp) { pt_entry_t *l3, newl3, oldl2; vm_offset_t tmpl2; vm_paddr_t l3phys; vm_page_t ml3; PMAP_LOCK_ASSERT(pmap, MA_OWNED); l3 = NULL; oldl2 = pmap_load(l2); KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_demote_l2: Demoting a non-block entry")); va &= ~L2_OFFSET; tmpl2 = 0; if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) { tmpl2 = kva_alloc(PAGE_SIZE); if (tmpl2 == 0) return (NULL); } /* * Invalidate the 2MB page mapping and return "failure" if the * mapping was never accessed. */ if ((oldl2 & ATTR_AF) == 0) { KASSERT((oldl2 & ATTR_SW_WIRED) == 0, ("pmap_demote_l2: a wired mapping is missing ATTR_AF")); pmap_demote_l2_abort(pmap, va, l2, lockp); CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p", va, pmap); goto fail; } if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) { KASSERT((oldl2 & ATTR_SW_WIRED) == 0, ("pmap_demote_l2: page table page for a wired mapping" " is missing")); /* * If the page table page is missing and the mapping * is for a kernel address, the mapping must belong to * the direct map. Page table pages are preallocated * for every other part of the kernel address space, * so the direct map region is the only part of the * kernel address space that must be handled here. 
*/ KASSERT(va < VM_MAXUSER_ADDRESS || VIRT_IN_DMAP(va), ("pmap_demote_l2: No saved mpte for va %#lx", va)); /* * If the 2MB page mapping belongs to the direct map * region of the kernel's address space, then the page * allocation request specifies the highest possible * priority (VM_ALLOC_INTERRUPT). Otherwise, the * priority is normal. */ ml3 = vm_page_alloc(NULL, pmap_l2_pindex(va), (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); /* * If the allocation of the new page table page fails, * invalidate the 2MB page mapping and return "failure". */ if (ml3 == NULL) { pmap_demote_l2_abort(pmap, va, l2, lockp); CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx" " in pmap %p", va, pmap); goto fail; } if (va < VM_MAXUSER_ADDRESS) { ml3->wire_count = NL3PG; pmap_resident_count_inc(pmap, 1); } } l3phys = VM_PAGE_TO_PHYS(ml3); l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys); newl3 = (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE; KASSERT((oldl2 & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) != (ATTR_AP(ATTR_AP_RO) | ATTR_SW_DBM), ("pmap_demote_l2: L2 entry is writeable but not dirty")); /* * If the page table page is not leftover from an earlier promotion, * or the mapping attributes have changed, (re)initialize the L3 table. * * When pmap_update_entry() clears the old L2 mapping, it (indirectly) * performs a dsb(). That dsb() ensures that the stores for filling * "l3" are visible before "l3" is added to the page table. */ if (ml3->valid == 0 || (l3[0] & ATTR_MASK) != (newl3 & ATTR_MASK)) pmap_fill_l3(l3, newl3); /* * Map the temporary page so we don't lose access to the l2 table. */ if (tmpl2 != 0) { pmap_kenter(tmpl2, PAGE_SIZE, DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET, CACHED_MEMORY); l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK)); } /* * The spare PV entries must be reserved prior to demoting the * mapping, that is, prior to changing the PDE. Otherwise, the state * of the L2 and the PV lists will be inconsistent, which can result * in reclaim_pv_chunk() attempting to remove a PV entry from the * wrong PV list and pmap_pv_demote_l2() failing to find the expected * PV entry for the 2MB page mapping that is being demoted. */ if ((oldl2 & ATTR_SW_MANAGED) != 0) reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); /* * Pass PAGE_SIZE so that a single TLB invalidation is performed on * the 2MB page mapping. */ pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE); /* * Demote the PV entry. 
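 * pmap_pv_demote_l2() converts the single PV entry for the 2MB mapping into Ln_ENTRIES 4KB PV entries, drawing on the entries reserved by reserve_pv_entries() above.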
*/ if ((oldl2 & ATTR_SW_MANAGED) != 0) pmap_pv_demote_l2(pmap, va, oldl2 & ~ATTR_MASK, lockp); atomic_add_long(&pmap_l2_demotions, 1); CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx" " in pmap %p %lx", va, pmap, l3[0]); fail: if (tmpl2 != 0) { pmap_kremove(tmpl2); kva_free(tmpl2, PAGE_SIZE); } return (l3); } static pt_entry_t * pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) { struct rwlock *lock; pt_entry_t *l3; lock = NULL; l3 = pmap_demote_l2_locked(pmap, l2, va, &lock); if (lock != NULL) rw_wunlock(lock); return (l3); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pt_entry_t *pte, tpte; vm_paddr_t mask, pa; int lvl, val; bool managed; PMAP_LOCK(pmap); retry: val = 0; pte = pmap_pte(pmap, addr, &lvl); if (pte != NULL) { tpte = pmap_load(pte); switch (lvl) { case 3: mask = L3_OFFSET; break; case 2: mask = L2_OFFSET; break; case 1: mask = L1_OFFSET; break; default: panic("pmap_mincore: invalid level %d", lvl); } managed = (tpte & ATTR_SW_MANAGED) != 0; val = MINCORE_INCORE; if (lvl != 3) val |= MINCORE_SUPER; if ((managed && pmap_pte_dirty(tpte)) || (!managed && (tpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW))) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((tpte & ATTR_AF) == ATTR_AF) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; pa = (tpte & ~ATTR_MASK) | (addr & mask); } else managed = false; if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } void pmap_activate(struct thread *td) { pmap_t pmap; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); td->td_proc->p_md.md_l0addr = vtophys(pmap->pm_l0); __asm __volatile( "msr ttbr0_el1, %0 \n" "isb \n" : : "r"(td->td_proc->p_md.md_l0addr)); pmap_invalidate_all(pmap); critical_exit(); } struct pcb * pmap_switch(struct thread *old, struct thread *new) { pcpu_bp_harden bp_harden; struct pcb *pcb; /* Store the new curthread */ PCPU_SET(curthread, new); /* And the new pcb */ pcb = new->td_pcb; PCPU_SET(curpcb, pcb); /* * TODO: We may need to flush the cache here if switching * to a user process. */ if (old == NULL || old->td_proc->p_md.md_l0addr != new->td_proc->p_md.md_l0addr) { __asm __volatile( /* Switch to the new pmap */ "msr ttbr0_el1, %0 \n" "isb \n" /* Invalidate the TLB */ "dsb ishst \n" "tlbi vmalle1is \n" "dsb ish \n" "isb \n" : : "r"(new->td_proc->p_md.md_l0addr)); /* * Stop userspace from training the branch predictor against * other processes. This will call into a CPU specific * function that clears the branch predictor state. 
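 * bp_harden is a per-CPU function pointer; it is left NULL on CPUs that do not need the workaround, in which case nothing is called.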
*/ bp_harden = PCPU_GET(bp_harden); if (bp_harden != NULL) bp_harden(); } return (pcb); } void pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) { if (va >= VM_MIN_KERNEL_ADDRESS) { cpu_icache_sync_range(va, sz); } else { u_int len, offset; vm_paddr_t pa; /* Find the length of data in this page to flush */ offset = va & PAGE_MASK; len = imin(PAGE_SIZE - offset, sz); while (sz != 0) { /* Extract the physical address & find it in the DMAP */ pa = pmap_extract(pmap, va); if (pa != 0) cpu_icache_sync_range(PHYS_TO_DMAP(pa), len); /* Move to the next page */ sz -= len; va += len; /* Set the length for the next iteration */ len = imin(PAGE_SIZE, sz); } } } int pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far) { pt_entry_t pte, *ptep; register_t intr; uint64_t ec, par; int lvl, rv; rv = KERN_FAILURE; ec = ESR_ELx_EXCEPTION(esr); switch (ec) { case EXCP_INSN_ABORT_L: case EXCP_INSN_ABORT: case EXCP_DATA_ABORT_L: case EXCP_DATA_ABORT: break; default: return (rv); } /* Data and insn aborts use same encoding for FSC field. */ switch (esr & ISS_DATA_DFSC_MASK) { case ISS_DATA_DFSC_AFF_L1: case ISS_DATA_DFSC_AFF_L2: case ISS_DATA_DFSC_AFF_L3: PMAP_LOCK(pmap); ptep = pmap_pte(pmap, far, &lvl); if (ptep != NULL) { pmap_set_bits(ptep, ATTR_AF); rv = KERN_SUCCESS; /* * XXXMJ as an optimization we could mark the entry * dirty if this is a write fault. */ } PMAP_UNLOCK(pmap); break; case ISS_DATA_DFSC_PF_L1: case ISS_DATA_DFSC_PF_L2: case ISS_DATA_DFSC_PF_L3: if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) || (esr & ISS_DATA_WnR) == 0) return (rv); PMAP_LOCK(pmap); ptep = pmap_pte(pmap, far, &lvl); if (ptep != NULL && ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) { if ((pte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RO)) { pmap_clear_bits(ptep, ATTR_AP_RW_BIT); pmap_invalidate_page(pmap, far); } rv = KERN_SUCCESS; } PMAP_UNLOCK(pmap); break; case ISS_DATA_DFSC_TF_L0: case ISS_DATA_DFSC_TF_L1: case ISS_DATA_DFSC_TF_L2: case ISS_DATA_DFSC_TF_L3: - PMAP_LOCK(pmap); - /* Ask the MMU to check the address */ - intr = intr_disable(); - if (pmap == kernel_pmap) - par = arm64_address_translate_s1e1r(far); - else - par = arm64_address_translate_s1e0r(far); - intr_restore(intr); - PMAP_UNLOCK(pmap); - /* - * If the translation was successful the address was invalid - * due to a break-before-make sequence. We can unlock and - * return success to the trap handler. + * Retry the translation. A break-before-make sequence can + * produce a transient fault. */ - if (PAR_SUCCESS(par)) - rv = KERN_SUCCESS; + if (pmap == kernel_pmap) { + /* + * The translation fault may have occurred within a + * critical section. Therefore, we must check the + * address without acquiring the kernel pmap's lock. + */ + if (pmap_kextract(far) != 0) + rv = KERN_SUCCESS; + } else { + PMAP_LOCK(pmap); + /* Ask the MMU to check the address. */ + intr = intr_disable(); + par = arm64_address_translate_s1e0r(far); + intr_restore(intr); + PMAP_UNLOCK(pmap); + + /* + * If the translation was successful, then we can + * return success to the trap handler. + */ + if (PAR_SUCCESS(par)) + rv = KERN_SUCCESS; + } break; } return (rv); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. 
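 * For example, if the object offset falls 1MB into a 2MB superpage and the mapping is at least 3MB long, the mapping still contains one fully aligned 2MB region, so *addr is adjusted to share that 1MB offset within its own superpage.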
*/ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < L2_SIZE) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & L2_OFFSET; if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || (*addr & L2_OFFSET) == superpage_offset) return; if ((*addr & L2_OFFSET) < superpage_offset) *addr = (*addr & ~L2_OFFSET) + superpage_offset; else *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; } /** * Get the kernel virtual address of a set of physical pages. If there are * physical addresses not covered by the DMAP perform a transient mapping * that will be removed when calling pmap_unmap_io_transient. * * \param page The pages the caller wishes to obtain the virtual * address on the kernel memory map. * \param vaddr On return contains the kernel virtual memory address * of the pages passed in the page parameter. * \param count Number of pages passed in. * \param can_fault TRUE if the thread using the mapped pages can take * page faults, FALSE otherwise. * * \returns TRUE if the caller must call pmap_unmap_io_transient when * finished or FALSE otherwise. * */ boolean_t pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; boolean_t needs_mapping; int error, i; /* * Allocate any KVA space that we need, this is done in a separate * loop to prevent calling vmem_alloc while pinned. */ needs_mapping = FALSE; for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (__predict_false(!PHYS_IN_DMAP(paddr))) { error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr[i]); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); needs_mapping = TRUE; } else { vaddr[i] = PHYS_TO_DMAP(paddr); } } /* Exit early if everything is covered by the DMAP */ if (!needs_mapping) return (FALSE); if (!can_fault) sched_pin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (!PHYS_IN_DMAP(paddr)) { panic( "pmap_map_io_transient: TODO: Map out of DMAP data"); } } return (needs_mapping); } void pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; int i; if (!can_fault) sched_unpin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (!PHYS_IN_DMAP(paddr)) { panic("ARM64TODO: pmap_unmap_io_transient: Unmap data"); } } } boolean_t pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) { return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_THROUGH); } Index: projects/clang900-import/sys/dev/ioat/ioat.c =================================================================== --- projects/clang900-import/sys/dev/ioat/ioat.c (revision 352586) +++ projects/clang900-import/sys/dev/ioat/ioat.c (revision 352587) @@ -1,2211 +1,2218 @@ /*- * Copyright (C) 2012 Intel Corporation * All rights reserved. * Copyright (C) 2018 Alexander Motin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include "ioat.h" #include "ioat_hw.h" #include "ioat_internal.h" #ifndef BUS_SPACE_MAXADDR_40BIT #define BUS_SPACE_MAXADDR_40BIT 0xFFFFFFFFFFULL #endif static int ioat_probe(device_t device); static int ioat_attach(device_t device); static int ioat_detach(device_t device); static int ioat_setup_intr(struct ioat_softc *ioat); static int ioat_teardown_intr(struct ioat_softc *ioat); static int ioat3_attach(device_t device); static int ioat_start_channel(struct ioat_softc *ioat); static int ioat_map_pci_bar(struct ioat_softc *ioat); static void ioat_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error); static void ioat_interrupt_handler(void *arg); static boolean_t ioat_model_resets_msix(struct ioat_softc *ioat); static int chanerr_to_errno(uint32_t); static void ioat_process_events(struct ioat_softc *ioat, boolean_t intr); static inline uint32_t ioat_get_active(struct ioat_softc *ioat); static inline uint32_t ioat_get_ring_space(struct ioat_softc *ioat); static void ioat_free_ring(struct ioat_softc *, uint32_t size, struct ioat_descriptor *); static int ioat_reserve_space(struct ioat_softc *, uint32_t, int mflags); static union ioat_hw_descriptor *ioat_get_descriptor(struct ioat_softc *, uint32_t index); static struct ioat_descriptor *ioat_get_ring_entry(struct ioat_softc *, uint32_t index); static void ioat_halted_debug(struct ioat_softc *, uint32_t); static void ioat_poll_timer_callback(void *arg); static void dump_descriptor(void *hw_desc); static void ioat_submit_single(struct ioat_softc *ioat); static void ioat_comp_update_map(void *arg, bus_dma_segment_t *seg, int nseg, int error); static int ioat_reset_hw(struct ioat_softc *ioat); static void ioat_reset_hw_task(void *, int); static void ioat_setup_sysctl(device_t device); static int sysctl_handle_reset(SYSCTL_HANDLER_ARGS); static void ioat_get(struct ioat_softc *); static void ioat_put(struct ioat_softc *); static void ioat_drain_locked(struct ioat_softc *); #define ioat_log_message(v, ...) 
do { \ if ((v) <= g_ioat_debug_level) { \ device_printf(ioat->device, __VA_ARGS__); \ } \ } while (0) MALLOC_DEFINE(M_IOAT, "ioat", "ioat driver memory allocations"); SYSCTL_NODE(_hw, OID_AUTO, ioat, CTLFLAG_RD, 0, "ioat node"); static int g_force_legacy_interrupts; SYSCTL_INT(_hw_ioat, OID_AUTO, force_legacy_interrupts, CTLFLAG_RDTUN, &g_force_legacy_interrupts, 0, "Set to non-zero to force MSI-X disabled"); int g_ioat_debug_level = 0; SYSCTL_INT(_hw_ioat, OID_AUTO, debug_level, CTLFLAG_RWTUN, &g_ioat_debug_level, 0, "Set log level (0-3) for ioat(4). Higher is more verbose."); unsigned g_ioat_ring_order = 13; SYSCTL_UINT(_hw_ioat, OID_AUTO, ring_order, CTLFLAG_RDTUN, &g_ioat_ring_order, 0, "Set IOAT ring order. (1 << this) == ring size."); /* * OS <-> Driver interface structures */ static device_method_t ioat_pci_methods[] = { /* Device interface */ DEVMETHOD(device_probe, ioat_probe), DEVMETHOD(device_attach, ioat_attach), DEVMETHOD(device_detach, ioat_detach), DEVMETHOD_END }; static driver_t ioat_pci_driver = { "ioat", ioat_pci_methods, sizeof(struct ioat_softc), }; static devclass_t ioat_devclass; DRIVER_MODULE(ioat, pci, ioat_pci_driver, ioat_devclass, 0, 0); MODULE_VERSION(ioat, 1); /* * Private data structures */ static struct ioat_softc *ioat_channel[IOAT_MAX_CHANNELS]; static unsigned ioat_channel_index = 0; SYSCTL_UINT(_hw_ioat, OID_AUTO, channels, CTLFLAG_RD, &ioat_channel_index, 0, "Number of IOAT channels attached"); static struct mtx ioat_list_mtx; MTX_SYSINIT(ioat_list_mtx, &ioat_list_mtx, "ioat list mtx", MTX_DEF); static struct _pcsid { u_int32_t type; const char *desc; } pci_ids[] = { { 0x34308086, "TBG IOAT Ch0" }, { 0x34318086, "TBG IOAT Ch1" }, { 0x34328086, "TBG IOAT Ch2" }, { 0x34338086, "TBG IOAT Ch3" }, { 0x34298086, "TBG IOAT Ch4" }, { 0x342a8086, "TBG IOAT Ch5" }, { 0x342b8086, "TBG IOAT Ch6" }, { 0x342c8086, "TBG IOAT Ch7" }, { 0x37108086, "JSF IOAT Ch0" }, { 0x37118086, "JSF IOAT Ch1" }, { 0x37128086, "JSF IOAT Ch2" }, { 0x37138086, "JSF IOAT Ch3" }, { 0x37148086, "JSF IOAT Ch4" }, { 0x37158086, "JSF IOAT Ch5" }, { 0x37168086, "JSF IOAT Ch6" }, { 0x37178086, "JSF IOAT Ch7" }, { 0x37188086, "JSF IOAT Ch0 (RAID)" }, { 0x37198086, "JSF IOAT Ch1 (RAID)" }, { 0x3c208086, "SNB IOAT Ch0" }, { 0x3c218086, "SNB IOAT Ch1" }, { 0x3c228086, "SNB IOAT Ch2" }, { 0x3c238086, "SNB IOAT Ch3" }, { 0x3c248086, "SNB IOAT Ch4" }, { 0x3c258086, "SNB IOAT Ch5" }, { 0x3c268086, "SNB IOAT Ch6" }, { 0x3c278086, "SNB IOAT Ch7" }, { 0x3c2e8086, "SNB IOAT Ch0 (RAID)" }, { 0x3c2f8086, "SNB IOAT Ch1 (RAID)" }, { 0x0e208086, "IVB IOAT Ch0" }, { 0x0e218086, "IVB IOAT Ch1" }, { 0x0e228086, "IVB IOAT Ch2" }, { 0x0e238086, "IVB IOAT Ch3" }, { 0x0e248086, "IVB IOAT Ch4" }, { 0x0e258086, "IVB IOAT Ch5" }, { 0x0e268086, "IVB IOAT Ch6" }, { 0x0e278086, "IVB IOAT Ch7" }, { 0x0e2e8086, "IVB IOAT Ch0 (RAID)" }, { 0x0e2f8086, "IVB IOAT Ch1 (RAID)" }, { 0x2f208086, "HSW IOAT Ch0" }, { 0x2f218086, "HSW IOAT Ch1" }, { 0x2f228086, "HSW IOAT Ch2" }, { 0x2f238086, "HSW IOAT Ch3" }, { 0x2f248086, "HSW IOAT Ch4" }, { 0x2f258086, "HSW IOAT Ch5" }, { 0x2f268086, "HSW IOAT Ch6" }, { 0x2f278086, "HSW IOAT Ch7" }, { 0x2f2e8086, "HSW IOAT Ch0 (RAID)" }, { 0x2f2f8086, "HSW IOAT Ch1 (RAID)" }, { 0x0c508086, "BWD IOAT Ch0" }, { 0x0c518086, "BWD IOAT Ch1" }, { 0x0c528086, "BWD IOAT Ch2" }, { 0x0c538086, "BWD IOAT Ch3" }, { 0x6f508086, "BDXDE IOAT Ch0" }, { 0x6f518086, "BDXDE IOAT Ch1" }, { 0x6f528086, "BDXDE IOAT Ch2" }, { 0x6f538086, "BDXDE IOAT Ch3" }, { 0x6f208086, "BDX IOAT Ch0" }, { 0x6f218086, "BDX IOAT Ch1" }, { 
0x6f228086, "BDX IOAT Ch2" }, { 0x6f238086, "BDX IOAT Ch3" }, { 0x6f248086, "BDX IOAT Ch4" }, { 0x6f258086, "BDX IOAT Ch5" }, { 0x6f268086, "BDX IOAT Ch6" }, { 0x6f278086, "BDX IOAT Ch7" }, { 0x6f2e8086, "BDX IOAT Ch0 (RAID)" }, { 0x6f2f8086, "BDX IOAT Ch1 (RAID)" }, { 0x20218086, "SKX IOAT" }, }; MODULE_PNP_INFO("W32:vendor/device;D:#", pci, ioat, pci_ids, nitems(pci_ids)); /* * OS <-> Driver linkage functions */ static int ioat_probe(device_t device) { struct _pcsid *ep; u_int32_t type; type = pci_get_devid(device); for (ep = pci_ids; ep < &pci_ids[nitems(pci_ids)]; ep++) { if (ep->type == type) { device_set_desc(device, ep->desc); return (0); } } return (ENXIO); } static int ioat_attach(device_t device) { struct ioat_softc *ioat; int error, i; ioat = DEVICE2SOFTC(device); ioat->device = device; + if (bus_get_domain(device, &ioat->domain) != 0) + ioat->domain = 0; + ioat->cpu = CPU_FFS(&cpuset_domain[ioat->domain]) - 1; + if (ioat->cpu < 0) + ioat->cpu = CPU_FIRST(); error = ioat_map_pci_bar(ioat); if (error != 0) goto err; ioat->version = ioat_read_cbver(ioat); if (ioat->version < IOAT_VER_3_0) { error = ENODEV; goto err; } error = ioat3_attach(device); if (error != 0) goto err; error = pci_enable_busmaster(device); if (error != 0) goto err; error = ioat_setup_intr(ioat); if (error != 0) goto err; error = ioat_reset_hw(ioat); if (error != 0) goto err; ioat_process_events(ioat, FALSE); ioat_setup_sysctl(device); mtx_lock(&ioat_list_mtx); for (i = 0; i < IOAT_MAX_CHANNELS; i++) { if (ioat_channel[i] == NULL) break; } if (i >= IOAT_MAX_CHANNELS) { mtx_unlock(&ioat_list_mtx); device_printf(device, "Too many I/OAT devices in system\n"); error = ENXIO; goto err; } ioat->chan_idx = i; ioat_channel[i] = ioat; if (i >= ioat_channel_index) ioat_channel_index = i + 1; mtx_unlock(&ioat_list_mtx); ioat_test_attach(); err: if (error != 0) ioat_detach(device); return (error); } static inline int ioat_bus_dmamap_destroy(struct ioat_softc *ioat, const char *func, bus_dma_tag_t dmat, bus_dmamap_t map) { int error; error = bus_dmamap_destroy(dmat, map); if (error != 0) { ioat_log_message(0, "%s: bus_dmamap_destroy failed %d\n", func, error); } return (error); } static int ioat_detach(device_t device) { struct ioat_softc *ioat; int i, error; ioat = DEVICE2SOFTC(device); mtx_lock(&ioat_list_mtx); ioat_channel[ioat->chan_idx] = NULL; while (ioat_channel_index > 0 && ioat_channel[ioat_channel_index - 1] == NULL) ioat_channel_index--; mtx_unlock(&ioat_list_mtx); ioat_test_detach(); taskqueue_drain(taskqueue_thread, &ioat->reset_task); mtx_lock(&ioat->submit_lock); ioat->quiescing = TRUE; ioat->destroying = TRUE; wakeup(&ioat->quiescing); wakeup(&ioat->resetting); ioat_drain_locked(ioat); mtx_unlock(&ioat->submit_lock); mtx_lock(&ioat->cleanup_lock); while (ioat_get_active(ioat) > 0) msleep(&ioat->tail, &ioat->cleanup_lock, 0, "ioat_drain", 1); mtx_unlock(&ioat->cleanup_lock); ioat_teardown_intr(ioat); callout_drain(&ioat->poll_timer); pci_disable_busmaster(device); if (ioat->pci_resource != NULL) bus_release_resource(device, SYS_RES_MEMORY, ioat->pci_resource_id, ioat->pci_resource); if (ioat->data_tag != NULL) { for (i = 0; i < 1 << ioat->ring_size_order; i++) { error = ioat_bus_dmamap_destroy(ioat, __func__, ioat->data_tag, ioat->ring[i].src_dmamap); if (error != 0) return (error); } for (i = 0; i < 1 << ioat->ring_size_order; i++) { error = ioat_bus_dmamap_destroy(ioat, __func__, ioat->data_tag, ioat->ring[i].dst_dmamap); if (error != 0) return (error); } for (i = 0; i < 1 << ioat->ring_size_order; i++) { 
error = ioat_bus_dmamap_destroy(ioat, __func__, ioat->data_tag, ioat->ring[i].src2_dmamap); if (error != 0) return (error); } for (i = 0; i < 1 << ioat->ring_size_order; i++) { error = ioat_bus_dmamap_destroy(ioat, __func__, ioat->data_tag, ioat->ring[i].dst2_dmamap); if (error != 0) return (error); } bus_dma_tag_destroy(ioat->data_tag); } if (ioat->data_crc_tag != NULL) { for (i = 0; i < 1 << ioat->ring_size_order; i++) { error = ioat_bus_dmamap_destroy(ioat, __func__, ioat->data_crc_tag, ioat->ring[i].crc_dmamap); if (error != 0) return (error); } bus_dma_tag_destroy(ioat->data_crc_tag); } if (ioat->ring != NULL) ioat_free_ring(ioat, 1 << ioat->ring_size_order, ioat->ring); if (ioat->comp_update != NULL) { bus_dmamap_unload(ioat->comp_update_tag, ioat->comp_update_map); bus_dmamem_free(ioat->comp_update_tag, ioat->comp_update, ioat->comp_update_map); bus_dma_tag_destroy(ioat->comp_update_tag); } if (ioat->hw_desc_ring != NULL) { bus_dmamap_unload(ioat->hw_desc_tag, ioat->hw_desc_map); bus_dmamem_free(ioat->hw_desc_tag, ioat->hw_desc_ring, ioat->hw_desc_map); bus_dma_tag_destroy(ioat->hw_desc_tag); } return (0); } static int ioat_teardown_intr(struct ioat_softc *ioat) { if (ioat->tag != NULL) bus_teardown_intr(ioat->device, ioat->res, ioat->tag); if (ioat->res != NULL) bus_release_resource(ioat->device, SYS_RES_IRQ, rman_get_rid(ioat->res), ioat->res); pci_release_msi(ioat->device); return (0); } static int ioat_start_channel(struct ioat_softc *ioat) { struct ioat_dma_hw_descriptor *hw_desc; struct ioat_descriptor *desc; struct bus_dmadesc *dmadesc; uint64_t status; uint32_t chanerr; int i; ioat_acquire(&ioat->dmaengine); /* Submit 'NULL' operation manually to avoid quiescing flag */ desc = ioat_get_ring_entry(ioat, ioat->head); hw_desc = &ioat_get_descriptor(ioat, ioat->head)->dma; dmadesc = &desc->bus_dmadesc; dmadesc->callback_fn = NULL; dmadesc->callback_arg = NULL; hw_desc->u.control_raw = 0; hw_desc->u.control_generic.op = IOAT_OP_COPY; hw_desc->u.control_generic.completion_update = 1; hw_desc->size = 8; hw_desc->src_addr = 0; hw_desc->dest_addr = 0; hw_desc->u.control.null = 1; ioat_submit_single(ioat); ioat_release(&ioat->dmaengine); for (i = 0; i < 100; i++) { DELAY(1); status = ioat_get_chansts(ioat); if (is_ioat_idle(status)) return (0); } chanerr = ioat_read_4(ioat, IOAT_CHANERR_OFFSET); ioat_log_message(0, "could not start channel: " "status = %#jx error = %b\n", (uintmax_t)status, (int)chanerr, IOAT_CHANERR_STR); return (ENXIO); } /* * Initialize Hardware */ static int ioat3_attach(device_t device) { struct ioat_softc *ioat; struct ioat_descriptor *ring; struct ioat_dma_hw_descriptor *dma_hw_desc; void *hw_desc; size_t ringsz; int i, num_descriptors; int error; uint8_t xfercap; error = 0; ioat = DEVICE2SOFTC(device); ioat->capabilities = ioat_read_dmacapability(ioat); ioat_log_message(0, "Capabilities: %b\n", (int)ioat->capabilities, IOAT_DMACAP_STR); xfercap = ioat_read_xfercap(ioat); ioat->max_xfer_size = 1 << xfercap; ioat->intrdelay_supported = (ioat_read_2(ioat, IOAT_INTRDELAY_OFFSET) & IOAT_INTRDELAY_SUPPORTED) != 0; if (ioat->intrdelay_supported) ioat->intrdelay_max = IOAT_INTRDELAY_US_MASK; /* TODO: need to check DCA here if we ever do XOR/PQ */ mtx_init(&ioat->submit_lock, "ioat_submit", NULL, MTX_DEF); mtx_init(&ioat->cleanup_lock, "ioat_cleanup", NULL, MTX_DEF); callout_init(&ioat->poll_timer, 1); TASK_INIT(&ioat->reset_task, 0, ioat_reset_hw_task, ioat); /* Establish lock order for Witness */ mtx_lock(&ioat->cleanup_lock); mtx_lock(&ioat->submit_lock); 
mtx_unlock(&ioat->submit_lock); mtx_unlock(&ioat->cleanup_lock); ioat->is_submitter_processing = FALSE; bus_dma_tag_create(bus_get_dma_tag(ioat->device), sizeof(uint64_t), 0x0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, sizeof(uint64_t), 1, sizeof(uint64_t), 0, NULL, NULL, &ioat->comp_update_tag); error = bus_dmamem_alloc(ioat->comp_update_tag, (void **)&ioat->comp_update, BUS_DMA_ZERO, &ioat->comp_update_map); if (ioat->comp_update == NULL) return (ENOMEM); error = bus_dmamap_load(ioat->comp_update_tag, ioat->comp_update_map, ioat->comp_update, sizeof(uint64_t), ioat_comp_update_map, ioat, 0); if (error != 0) return (error); ioat->ring_size_order = g_ioat_ring_order; num_descriptors = 1 << ioat->ring_size_order; ringsz = sizeof(struct ioat_dma_hw_descriptor) * num_descriptors; error = bus_dma_tag_create(bus_get_dma_tag(ioat->device), 2 * 1024 * 1024, 0x0, (bus_addr_t)BUS_SPACE_MAXADDR_40BIT, BUS_SPACE_MAXADDR, NULL, NULL, ringsz, 1, ringsz, 0, NULL, NULL, &ioat->hw_desc_tag); if (error != 0) return (error); error = bus_dmamem_alloc(ioat->hw_desc_tag, &hw_desc, BUS_DMA_ZERO | BUS_DMA_WAITOK, &ioat->hw_desc_map); if (error != 0) return (error); error = bus_dmamap_load(ioat->hw_desc_tag, ioat->hw_desc_map, hw_desc, ringsz, ioat_dmamap_cb, &ioat->hw_desc_bus_addr, BUS_DMA_WAITOK); if (error) return (error); ioat->hw_desc_ring = hw_desc; error = bus_dma_tag_create(bus_get_dma_tag(ioat->device), 1, 0, BUS_SPACE_MAXADDR_40BIT, BUS_SPACE_MAXADDR, NULL, NULL, ioat->max_xfer_size, 1, ioat->max_xfer_size, 0, NULL, NULL, &ioat->data_crc_tag); if (error != 0) { ioat_log_message(0, "%s: bus_dma_tag_create failed %d\n", __func__, error); return (error); } error = bus_dma_tag_create(bus_get_dma_tag(ioat->device), 1, 0, BUS_SPACE_MAXADDR_48BIT, BUS_SPACE_MAXADDR, NULL, NULL, ioat->max_xfer_size, 1, ioat->max_xfer_size, 0, NULL, NULL, &ioat->data_tag); if (error != 0) { ioat_log_message(0, "%s: bus_dma_tag_create failed %d\n", __func__, error); return (error); } - ioat->ring = malloc(num_descriptors * sizeof(*ring), M_IOAT, - M_ZERO | M_WAITOK); + ioat->ring = malloc_domainset(num_descriptors * sizeof(*ring), M_IOAT, + DOMAINSET_PREF(ioat->domain), M_ZERO | M_WAITOK); ring = ioat->ring; for (i = 0; i < num_descriptors; i++) { memset(&ring[i].bus_dmadesc, 0, sizeof(ring[i].bus_dmadesc)); ring[i].id = i; error = bus_dmamap_create(ioat->data_tag, 0, &ring[i].src_dmamap); if (error != 0) { ioat_log_message(0, "%s: bus_dmamap_create failed %d\n", __func__, error); return (error); } error = bus_dmamap_create(ioat->data_tag, 0, &ring[i].dst_dmamap); if (error != 0) { ioat_log_message(0, "%s: bus_dmamap_create failed %d\n", __func__, error); return (error); } error = bus_dmamap_create(ioat->data_tag, 0, &ring[i].src2_dmamap); if (error != 0) { ioat_log_message(0, "%s: bus_dmamap_create failed %d\n", __func__, error); return (error); } error = bus_dmamap_create(ioat->data_tag, 0, &ring[i].dst2_dmamap); if (error != 0) { ioat_log_message(0, "%s: bus_dmamap_create failed %d\n", __func__, error); return (error); } error = bus_dmamap_create(ioat->data_crc_tag, 0, &ring[i].crc_dmamap); if (error != 0) { ioat_log_message(0, "%s: bus_dmamap_create failed %d\n", __func__, error); return (error); } } for (i = 0; i < num_descriptors; i++) { dma_hw_desc = &ioat->hw_desc_ring[i].dma; dma_hw_desc->next = RING_PHYS_ADDR(ioat, i + 1); } ioat->head = 0; ioat->tail = 0; ioat->last_seen = 0; *ioat->comp_update = 0; return (0); } static int ioat_map_pci_bar(struct ioat_softc *ioat) { ioat->pci_resource_id = PCIR_BAR(0); 
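	/* The channel's MMIO registers are exposed through memory BAR 0. */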
ioat->pci_resource = bus_alloc_resource_any(ioat->device, SYS_RES_MEMORY, &ioat->pci_resource_id, RF_ACTIVE); if (ioat->pci_resource == NULL) { ioat_log_message(0, "unable to allocate pci resource\n"); return (ENODEV); } ioat->pci_bus_tag = rman_get_bustag(ioat->pci_resource); ioat->pci_bus_handle = rman_get_bushandle(ioat->pci_resource); return (0); } static void ioat_comp_update_map(void *arg, bus_dma_segment_t *seg, int nseg, int error) { struct ioat_softc *ioat = arg; KASSERT(error == 0, ("%s: error:%d", __func__, error)); ioat->comp_update_bus_addr = seg[0].ds_addr; } static void ioat_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error) { bus_addr_t *baddr; KASSERT(error == 0, ("%s: error:%d", __func__, error)); baddr = arg; *baddr = segs->ds_addr; } /* * Interrupt setup and handlers */ static int ioat_setup_intr(struct ioat_softc *ioat) { uint32_t num_vectors; int error; boolean_t use_msix; boolean_t force_legacy_interrupts; use_msix = FALSE; force_legacy_interrupts = FALSE; if (!g_force_legacy_interrupts && pci_msix_count(ioat->device) >= 1) { num_vectors = 1; pci_alloc_msix(ioat->device, &num_vectors); if (num_vectors == 1) use_msix = TRUE; } if (use_msix) { ioat->rid = 1; ioat->res = bus_alloc_resource_any(ioat->device, SYS_RES_IRQ, &ioat->rid, RF_ACTIVE); } else { ioat->rid = 0; ioat->res = bus_alloc_resource_any(ioat->device, SYS_RES_IRQ, &ioat->rid, RF_SHAREABLE | RF_ACTIVE); } if (ioat->res == NULL) { ioat_log_message(0, "bus_alloc_resource failed\n"); return (ENOMEM); } ioat->tag = NULL; error = bus_setup_intr(ioat->device, ioat->res, INTR_MPSAFE | INTR_TYPE_MISC, NULL, ioat_interrupt_handler, ioat, &ioat->tag); if (error != 0) { ioat_log_message(0, "bus_setup_intr failed\n"); return (error); } ioat_write_intrctrl(ioat, IOAT_INTRCTRL_MASTER_INT_EN); return (0); } static boolean_t ioat_model_resets_msix(struct ioat_softc *ioat) { u_int32_t pciid; pciid = pci_get_devid(ioat->device); switch (pciid) { /* BWD: */ case 0x0c508086: case 0x0c518086: case 0x0c528086: case 0x0c538086: /* BDXDE: */ case 0x6f508086: case 0x6f518086: case 0x6f528086: case 0x6f538086: return (TRUE); } return (FALSE); } static void ioat_interrupt_handler(void *arg) { struct ioat_softc *ioat = arg; ioat->stats.interrupts++; ioat_process_events(ioat, TRUE); } static int chanerr_to_errno(uint32_t chanerr) { if (chanerr == 0) return (0); if ((chanerr & (IOAT_CHANERR_XSADDERR | IOAT_CHANERR_XDADDERR)) != 0) return (EFAULT); if ((chanerr & (IOAT_CHANERR_RDERR | IOAT_CHANERR_WDERR)) != 0) return (EIO); /* This one is probably our fault: */ if ((chanerr & IOAT_CHANERR_NDADDERR) != 0) return (EIO); return (EIO); } static void ioat_process_events(struct ioat_softc *ioat, boolean_t intr) { struct ioat_descriptor *desc; struct bus_dmadesc *dmadesc; uint64_t comp_update, status; uint32_t completed, chanerr; int error; mtx_lock(&ioat->cleanup_lock); /* * Don't run while the hardware is being reset. Reset is responsible * for blocking new work and draining & completing existing work, so * there is nothing to do until new work is queued after reset anyway. 
*/ if (ioat->resetting_cleanup) { mtx_unlock(&ioat->cleanup_lock); return; } completed = 0; comp_update = *ioat->comp_update; status = comp_update & IOAT_CHANSTS_COMPLETED_DESCRIPTOR_MASK; if (status < ioat->hw_desc_bus_addr || status >= ioat->hw_desc_bus_addr + (1 << ioat->ring_size_order) * sizeof(struct ioat_generic_hw_descriptor)) panic("Bogus completion address %jx (channel %u)", (uintmax_t)status, ioat->chan_idx); if (status == ioat->last_seen) { /* * If we landed in process_events and nothing has been * completed, check for a timeout due to channel halt. */ goto out; } CTR4(KTR_IOAT, "%s channel=%u hw_status=0x%lx last_seen=0x%lx", __func__, ioat->chan_idx, comp_update, ioat->last_seen); while (RING_PHYS_ADDR(ioat, ioat->tail - 1) != status) { desc = ioat_get_ring_entry(ioat, ioat->tail); dmadesc = &desc->bus_dmadesc; CTR5(KTR_IOAT, "channel=%u completing desc idx %u (%p) ok cb %p(%p)", ioat->chan_idx, ioat->tail, dmadesc, dmadesc->callback_fn, dmadesc->callback_arg); bus_dmamap_unload(ioat->data_tag, desc->src_dmamap); bus_dmamap_unload(ioat->data_tag, desc->dst_dmamap); bus_dmamap_unload(ioat->data_tag, desc->src2_dmamap); bus_dmamap_unload(ioat->data_tag, desc->dst2_dmamap); bus_dmamap_unload(ioat->data_crc_tag, desc->crc_dmamap); if (dmadesc->callback_fn != NULL) dmadesc->callback_fn(dmadesc->callback_arg, 0); completed++; ioat->tail++; } CTR5(KTR_IOAT, "%s channel=%u head=%u tail=%u active=%u", __func__, ioat->chan_idx, ioat->head, ioat->tail, ioat_get_active(ioat)); if (completed != 0) { ioat->last_seen = RING_PHYS_ADDR(ioat, ioat->tail - 1); ioat->stats.descriptors_processed += completed; wakeup(&ioat->tail); } out: ioat_write_chanctrl(ioat, IOAT_CHANCTRL_RUN); mtx_unlock(&ioat->cleanup_lock); /* * The device doesn't seem to reliably push suspend/halt statuses to * the channel completion memory address, so poll the device register * here. For performance reasons skip it on interrupts, do it only * on much more rare polling events. */ if (!intr) comp_update = ioat_get_chansts(ioat) & IOAT_CHANSTS_STATUS; if (!is_ioat_halted(comp_update) && !is_ioat_suspended(comp_update)) return; ioat->stats.channel_halts++; /* * Fatal programming error on this DMA channel. Flush any outstanding * work with error status and restart the engine. */ mtx_lock(&ioat->submit_lock); ioat->quiescing = TRUE; mtx_unlock(&ioat->submit_lock); /* * This is safe to do here because the submit queue is quiesced. We * know that we will drain all outstanding events, so ioat_reset_hw * can't deadlock. It is necessary to protect other ioat_process_event * threads from racing ioat_reset_hw, reading an indeterminate hw * state, and attempting to continue issuing completions. 
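The cleanup loop above advances ioat->tail until the ring slot just behind it matches the completion address the hardware wrote into *comp_update. Read the other way, that write-back address can be turned back into a ring index; a hypothetical helper (not part of the driver) makes the arithmetic explicit, assuming the low status bits have already been masked off:

#include <stddef.h>
#include <stdint.h>

/*
 * Hypothetical: convert a completion write-back address into an index
 * in a power-of-two ring of fixed-size hardware descriptors.
 */
static unsigned
completed_ring_index(uint64_t completed_busaddr, uint64_t ring_busaddr,
    size_t desc_size, unsigned ring_order)
{
        uint64_t idx;

        idx = (completed_busaddr - ring_busaddr) / desc_size;
        return ((unsigned)(idx & ((1u << ring_order) - 1)));
}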
*/ mtx_lock(&ioat->cleanup_lock); ioat->resetting_cleanup = TRUE; chanerr = ioat_read_4(ioat, IOAT_CHANERR_OFFSET); if (1 <= g_ioat_debug_level) ioat_halted_debug(ioat, chanerr); ioat->stats.last_halt_chanerr = chanerr; while (ioat_get_active(ioat) > 0) { desc = ioat_get_ring_entry(ioat, ioat->tail); dmadesc = &desc->bus_dmadesc; CTR5(KTR_IOAT, "channel=%u completing desc idx %u (%p) err cb %p(%p)", ioat->chan_idx, ioat->tail, dmadesc, dmadesc->callback_fn, dmadesc->callback_arg); if (dmadesc->callback_fn != NULL) dmadesc->callback_fn(dmadesc->callback_arg, chanerr_to_errno(chanerr)); ioat->tail++; ioat->stats.descriptors_processed++; ioat->stats.descriptors_error++; } CTR5(KTR_IOAT, "%s channel=%u head=%u tail=%u active=%u", __func__, ioat->chan_idx, ioat->head, ioat->tail, ioat_get_active(ioat)); /* Clear error status */ ioat_write_4(ioat, IOAT_CHANERR_OFFSET, chanerr); mtx_unlock(&ioat->cleanup_lock); ioat_log_message(0, "Resetting channel to recover from error\n"); error = taskqueue_enqueue(taskqueue_thread, &ioat->reset_task); KASSERT(error == 0, ("%s: taskqueue_enqueue failed: %d", __func__, error)); } static void ioat_reset_hw_task(void *ctx, int pending __unused) { struct ioat_softc *ioat; int error; ioat = ctx; ioat_log_message(1, "%s: Resetting channel\n", __func__); error = ioat_reset_hw(ioat); KASSERT(error == 0, ("%s: reset failed: %d", __func__, error)); (void)error; } /* * User API functions */ unsigned ioat_get_nchannels(void) { return (ioat_channel_index); } bus_dmaengine_t ioat_get_dmaengine(uint32_t index, int flags) { struct ioat_softc *ioat; KASSERT((flags & ~(M_NOWAIT | M_WAITOK)) == 0, ("invalid flags: 0x%08x", flags)); KASSERT((flags & (M_NOWAIT | M_WAITOK)) != (M_NOWAIT | M_WAITOK), ("invalid wait | nowait")); mtx_lock(&ioat_list_mtx); if (index >= ioat_channel_index || (ioat = ioat_channel[index]) == NULL) { mtx_unlock(&ioat_list_mtx); return (NULL); } mtx_lock(&ioat->submit_lock); mtx_unlock(&ioat_list_mtx); if (ioat->destroying) { mtx_unlock(&ioat->submit_lock); return (NULL); } ioat_get(ioat); if (ioat->quiescing) { if ((flags & M_NOWAIT) != 0) { ioat_put(ioat); mtx_unlock(&ioat->submit_lock); return (NULL); } while (ioat->quiescing && !ioat->destroying) msleep(&ioat->quiescing, &ioat->submit_lock, 0, "getdma", 0); if (ioat->destroying) { ioat_put(ioat); mtx_unlock(&ioat->submit_lock); return (NULL); } } mtx_unlock(&ioat->submit_lock); return (&ioat->dmaengine); } void ioat_put_dmaengine(bus_dmaengine_t dmaengine) { struct ioat_softc *ioat; ioat = to_ioat_softc(dmaengine); mtx_lock(&ioat->submit_lock); ioat_put(ioat); mtx_unlock(&ioat->submit_lock); } int ioat_get_hwversion(bus_dmaengine_t dmaengine) { struct ioat_softc *ioat; ioat = to_ioat_softc(dmaengine); return (ioat->version); } size_t ioat_get_max_io_size(bus_dmaengine_t dmaengine) { struct ioat_softc *ioat; ioat = to_ioat_softc(dmaengine); return (ioat->max_xfer_size); } uint32_t ioat_get_capabilities(bus_dmaengine_t dmaengine) { struct ioat_softc *ioat; ioat = to_ioat_softc(dmaengine); return (ioat->capabilities); } int ioat_set_interrupt_coalesce(bus_dmaengine_t dmaengine, uint16_t delay) { struct ioat_softc *ioat; ioat = to_ioat_softc(dmaengine); if (!ioat->intrdelay_supported) return (ENODEV); if (delay > ioat->intrdelay_max) return (ERANGE); ioat_write_2(ioat, IOAT_INTRDELAY_OFFSET, delay); ioat->cached_intrdelay = ioat_read_2(ioat, IOAT_INTRDELAY_OFFSET) & IOAT_INTRDELAY_US_MASK; return (0); } uint16_t ioat_get_max_coalesce_period(bus_dmaengine_t dmaengine) { struct ioat_softc *ioat; ioat = 
to_ioat_softc(dmaengine); return (ioat->intrdelay_max); } void ioat_acquire(bus_dmaengine_t dmaengine) { struct ioat_softc *ioat; ioat = to_ioat_softc(dmaengine); mtx_lock(&ioat->submit_lock); CTR2(KTR_IOAT, "%s channel=%u", __func__, ioat->chan_idx); ioat->acq_head = ioat->head; } int ioat_acquire_reserve(bus_dmaengine_t dmaengine, unsigned n, int mflags) { struct ioat_softc *ioat; int error; ioat = to_ioat_softc(dmaengine); ioat_acquire(dmaengine); error = ioat_reserve_space(ioat, n, mflags); if (error != 0) ioat_release(dmaengine); return (error); } void ioat_release(bus_dmaengine_t dmaengine) { struct ioat_softc *ioat; ioat = to_ioat_softc(dmaengine); CTR3(KTR_IOAT, "%s channel=%u dispatch1 head=%u", __func__, ioat->chan_idx, ioat->head); KFAIL_POINT_CODE(DEBUG_FP, ioat_release, /* do nothing */); CTR3(KTR_IOAT, "%s channel=%u dispatch2 head=%u", __func__, ioat->chan_idx, ioat->head); if (ioat->acq_head != ioat->head) { ioat_write_2(ioat, IOAT_DMACOUNT_OFFSET, (uint16_t)ioat->head); if (!callout_pending(&ioat->poll_timer)) { - callout_reset(&ioat->poll_timer, 1, - ioat_poll_timer_callback, ioat); + callout_reset_on(&ioat->poll_timer, 1, + ioat_poll_timer_callback, ioat, ioat->cpu); } } mtx_unlock(&ioat->submit_lock); } static struct ioat_descriptor * ioat_op_generic(struct ioat_softc *ioat, uint8_t op, uint32_t size, uint64_t src, uint64_t dst, bus_dmaengine_callback_t callback_fn, void *callback_arg, uint32_t flags) { struct ioat_generic_hw_descriptor *hw_desc; struct ioat_descriptor *desc; bus_dma_segment_t seg; int mflags, nseg, error; mtx_assert(&ioat->submit_lock, MA_OWNED); KASSERT((flags & ~_DMA_GENERIC_FLAGS) == 0, ("Unrecognized flag(s): %#x", flags & ~_DMA_GENERIC_FLAGS)); if ((flags & DMA_NO_WAIT) != 0) mflags = M_NOWAIT; else mflags = M_WAITOK; if (size > ioat->max_xfer_size) { ioat_log_message(0, "%s: max_xfer_size = %d, requested = %u\n", __func__, ioat->max_xfer_size, (unsigned)size); return (NULL); } if (ioat_reserve_space(ioat, 1, mflags) != 0) return (NULL); desc = ioat_get_ring_entry(ioat, ioat->head); hw_desc = &ioat_get_descriptor(ioat, ioat->head)->generic; hw_desc->u.control_raw = 0; hw_desc->u.control_generic.op = op; hw_desc->u.control_generic.completion_update = 1; if ((flags & DMA_INT_EN) != 0) hw_desc->u.control_generic.int_enable = 1; if ((flags & DMA_FENCE) != 0) hw_desc->u.control_generic.fence = 1; hw_desc->size = size; if (src != 0) { nseg = -1; error = _bus_dmamap_load_phys(ioat->data_tag, desc->src_dmamap, src, size, 0, &seg, &nseg); if (error != 0) { ioat_log_message(0, "%s: _bus_dmamap_load_phys" " failed %d\n", __func__, error); return (NULL); } hw_desc->src_addr = seg.ds_addr; } if (dst != 0) { nseg = -1; error = _bus_dmamap_load_phys(ioat->data_tag, desc->dst_dmamap, dst, size, 0, &seg, &nseg); if (error != 0) { ioat_log_message(0, "%s: _bus_dmamap_load_phys" " failed %d\n", __func__, error); return (NULL); } hw_desc->dest_addr = seg.ds_addr; } desc->bus_dmadesc.callback_fn = callback_fn; desc->bus_dmadesc.callback_arg = callback_arg; return (desc); } struct bus_dmadesc * ioat_null(bus_dmaengine_t dmaengine, bus_dmaengine_callback_t callback_fn, void *callback_arg, uint32_t flags) { struct ioat_dma_hw_descriptor *hw_desc; struct ioat_descriptor *desc; struct ioat_softc *ioat; ioat = to_ioat_softc(dmaengine); CTR2(KTR_IOAT, "%s channel=%u", __func__, ioat->chan_idx); desc = ioat_op_generic(ioat, IOAT_OP_COPY, 8, 0, 0, callback_fn, callback_arg, flags); if (desc == NULL) return (NULL); hw_desc = &ioat_get_descriptor(ioat, desc->id)->dma; 
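The callout_reset_on() hunk above is the companion NUMA change to the allocation hunk: the one-tick poll timer is now scheduled on ioat->cpu, a CPU chosen near the device, instead of whichever CPU the callout subsystem would pick by default. The idiom in isolation, with placeholder callback, argument, and CPU choice:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>

/*
 * Sketch of pinning a one-tick poll timer to a particular CPU with
 * callout_reset_on(9); the callback, argument, and cpu value here are
 * placeholders.
 */
static void
example_arm_poll_timer(struct callout *c, void (*fn)(void *), void *arg,
    int cpu)
{
        callout_reset_on(c, 1, fn, arg, cpu);
}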
hw_desc->u.control.null = 1; ioat_submit_single(ioat); return (&desc->bus_dmadesc); } struct bus_dmadesc * ioat_copy(bus_dmaengine_t dmaengine, bus_addr_t dst, bus_addr_t src, bus_size_t len, bus_dmaengine_callback_t callback_fn, void *callback_arg, uint32_t flags) { struct ioat_dma_hw_descriptor *hw_desc; struct ioat_descriptor *desc; struct ioat_softc *ioat; ioat = to_ioat_softc(dmaengine); if (((src | dst) & (0xffffull << 48)) != 0) { ioat_log_message(0, "%s: High 16 bits of src/dst invalid\n", __func__); return (NULL); } desc = ioat_op_generic(ioat, IOAT_OP_COPY, len, src, dst, callback_fn, callback_arg, flags); if (desc == NULL) return (NULL); hw_desc = &ioat_get_descriptor(ioat, desc->id)->dma; if (g_ioat_debug_level >= 3) dump_descriptor(hw_desc); ioat_submit_single(ioat); CTR6(KTR_IOAT, "%s channel=%u desc=%p dest=%lx src=%lx len=%lx", __func__, ioat->chan_idx, &desc->bus_dmadesc, dst, src, len); return (&desc->bus_dmadesc); } struct bus_dmadesc * ioat_copy_8k_aligned(bus_dmaengine_t dmaengine, bus_addr_t dst1, bus_addr_t dst2, bus_addr_t src1, bus_addr_t src2, bus_dmaengine_callback_t callback_fn, void *callback_arg, uint32_t flags) { struct ioat_dma_hw_descriptor *hw_desc; struct ioat_descriptor *desc; struct ioat_softc *ioat; bus_size_t src1_len, dst1_len; bus_dma_segment_t seg; int nseg, error; ioat = to_ioat_softc(dmaengine); CTR2(KTR_IOAT, "%s channel=%u", __func__, ioat->chan_idx); if (((src1 | src2 | dst1 | dst2) & (0xffffull << 48)) != 0) { ioat_log_message(0, "%s: High 16 bits of src/dst invalid\n", __func__); return (NULL); } if (((src1 | src2 | dst1 | dst2) & PAGE_MASK) != 0) { ioat_log_message(0, "%s: Addresses must be page-aligned\n", __func__); return (NULL); } desc = ioat_op_generic(ioat, IOAT_OP_COPY, 2 * PAGE_SIZE, 0, 0, callback_fn, callback_arg, flags); if (desc == NULL) return (NULL); hw_desc = &ioat_get_descriptor(ioat, desc->id)->dma; src1_len = (src2 != src1 + PAGE_SIZE) ? PAGE_SIZE : 2 * PAGE_SIZE; nseg = -1; error = _bus_dmamap_load_phys(ioat->data_tag, desc->src_dmamap, src1, src1_len, 0, &seg, &nseg); if (error != 0) { ioat_log_message(0, "%s: _bus_dmamap_load_phys" " failed %d\n", __func__, error); return (NULL); } hw_desc->src_addr = seg.ds_addr; if (src1_len != 2 * PAGE_SIZE) { hw_desc->u.control.src_page_break = 1; nseg = -1; error = _bus_dmamap_load_phys(ioat->data_tag, desc->src2_dmamap, src2, PAGE_SIZE, 0, &seg, &nseg); if (error != 0) { ioat_log_message(0, "%s: _bus_dmamap_load_phys" " failed %d\n", __func__, error); return (NULL); } hw_desc->next_src_addr = seg.ds_addr; } dst1_len = (dst2 != dst1 + PAGE_SIZE) ? 
PAGE_SIZE : 2 * PAGE_SIZE; nseg = -1; error = _bus_dmamap_load_phys(ioat->data_tag, desc->dst_dmamap, dst1, dst1_len, 0, &seg, &nseg); if (error != 0) { ioat_log_message(0, "%s: _bus_dmamap_load_phys" " failed %d\n", __func__, error); return (NULL); } hw_desc->dest_addr = seg.ds_addr; if (dst1_len != 2 * PAGE_SIZE) { hw_desc->u.control.dest_page_break = 1; nseg = -1; error = _bus_dmamap_load_phys(ioat->data_tag, desc->dst2_dmamap, dst2, PAGE_SIZE, 0, &seg, &nseg); if (error != 0) { ioat_log_message(0, "%s: _bus_dmamap_load_phys" " failed %d\n", __func__, error); return (NULL); } hw_desc->next_dest_addr = seg.ds_addr; } if (g_ioat_debug_level >= 3) dump_descriptor(hw_desc); ioat_submit_single(ioat); return (&desc->bus_dmadesc); } struct bus_dmadesc * ioat_copy_crc(bus_dmaengine_t dmaengine, bus_addr_t dst, bus_addr_t src, bus_size_t len, uint32_t *initialseed, bus_addr_t crcptr, bus_dmaengine_callback_t callback_fn, void *callback_arg, uint32_t flags) { struct ioat_crc32_hw_descriptor *hw_desc; struct ioat_descriptor *desc; struct ioat_softc *ioat; uint32_t teststore; uint8_t op; bus_dma_segment_t seg; int nseg, error; ioat = to_ioat_softc(dmaengine); CTR2(KTR_IOAT, "%s channel=%u", __func__, ioat->chan_idx); if ((ioat->capabilities & IOAT_DMACAP_MOVECRC) == 0) { ioat_log_message(0, "%s: Device lacks MOVECRC capability\n", __func__); return (NULL); } if (((src | dst) & (0xffffffull << 40)) != 0) { ioat_log_message(0, "%s: High 24 bits of src/dst invalid\n", __func__); return (NULL); } teststore = (flags & _DMA_CRC_TESTSTORE); if (teststore == _DMA_CRC_TESTSTORE) { ioat_log_message(0, "%s: TEST and STORE invalid\n", __func__); return (NULL); } if (teststore == 0 && (flags & DMA_CRC_INLINE) != 0) { ioat_log_message(0, "%s: INLINE invalid without TEST or STORE\n", __func__); return (NULL); } switch (teststore) { case DMA_CRC_STORE: op = IOAT_OP_MOVECRC_STORE; break; case DMA_CRC_TEST: op = IOAT_OP_MOVECRC_TEST; break; default: KASSERT(teststore == 0, ("bogus")); op = IOAT_OP_MOVECRC; break; } if ((flags & DMA_CRC_INLINE) == 0 && (crcptr & (0xffffffull << 40)) != 0) { ioat_log_message(0, "%s: High 24 bits of crcptr invalid\n", __func__); return (NULL); } desc = ioat_op_generic(ioat, op, len, src, dst, callback_fn, callback_arg, flags & ~_DMA_CRC_FLAGS); if (desc == NULL) return (NULL); hw_desc = &ioat_get_descriptor(ioat, desc->id)->crc32; if ((flags & DMA_CRC_INLINE) == 0) { nseg = -1; error = _bus_dmamap_load_phys(ioat->data_crc_tag, desc->crc_dmamap, crcptr, sizeof(uint32_t), 0, &seg, &nseg); if (error != 0) { ioat_log_message(0, "%s: _bus_dmamap_load_phys" " failed %d\n", __func__, error); return (NULL); } hw_desc->crc_address = seg.ds_addr; } else hw_desc->u.control.crc_location = 1; if (initialseed != NULL) { hw_desc->u.control.use_seed = 1; hw_desc->seed = *initialseed; } if (g_ioat_debug_level >= 3) dump_descriptor(hw_desc); ioat_submit_single(ioat); return (&desc->bus_dmadesc); } struct bus_dmadesc * ioat_crc(bus_dmaengine_t dmaengine, bus_addr_t src, bus_size_t len, uint32_t *initialseed, bus_addr_t crcptr, bus_dmaengine_callback_t callback_fn, void *callback_arg, uint32_t flags) { struct ioat_crc32_hw_descriptor *hw_desc; struct ioat_descriptor *desc; struct ioat_softc *ioat; uint32_t teststore; uint8_t op; bus_dma_segment_t seg; int nseg, error; ioat = to_ioat_softc(dmaengine); CTR2(KTR_IOAT, "%s channel=%u", __func__, ioat->chan_idx); if ((ioat->capabilities & IOAT_DMACAP_CRC) == 0) { ioat_log_message(0, "%s: Device lacks CRC capability\n", __func__); return (NULL); } if ((src & 
(0xffffffull << 40)) != 0) { ioat_log_message(0, "%s: High 24 bits of src invalid\n", __func__); return (NULL); } teststore = (flags & _DMA_CRC_TESTSTORE); if (teststore == _DMA_CRC_TESTSTORE) { ioat_log_message(0, "%s: TEST and STORE invalid\n", __func__); return (NULL); } if (teststore == 0 && (flags & DMA_CRC_INLINE) != 0) { ioat_log_message(0, "%s: INLINE invalid without TEST or STORE\n", __func__); return (NULL); } switch (teststore) { case DMA_CRC_STORE: op = IOAT_OP_CRC_STORE; break; case DMA_CRC_TEST: op = IOAT_OP_CRC_TEST; break; default: KASSERT(teststore == 0, ("bogus")); op = IOAT_OP_CRC; break; } if ((flags & DMA_CRC_INLINE) == 0 && (crcptr & (0xffffffull << 40)) != 0) { ioat_log_message(0, "%s: High 24 bits of crcptr invalid\n", __func__); return (NULL); } desc = ioat_op_generic(ioat, op, len, src, 0, callback_fn, callback_arg, flags & ~_DMA_CRC_FLAGS); if (desc == NULL) return (NULL); hw_desc = &ioat_get_descriptor(ioat, desc->id)->crc32; if ((flags & DMA_CRC_INLINE) == 0) { nseg = -1; error = _bus_dmamap_load_phys(ioat->data_crc_tag, desc->crc_dmamap, crcptr, sizeof(uint32_t), 0, &seg, &nseg); if (error != 0) { ioat_log_message(0, "%s: _bus_dmamap_load_phys" " failed %d\n", __func__, error); return (NULL); } hw_desc->crc_address = seg.ds_addr; } else hw_desc->u.control.crc_location = 1; if (initialseed != NULL) { hw_desc->u.control.use_seed = 1; hw_desc->seed = *initialseed; } if (g_ioat_debug_level >= 3) dump_descriptor(hw_desc); ioat_submit_single(ioat); return (&desc->bus_dmadesc); } struct bus_dmadesc * ioat_blockfill(bus_dmaengine_t dmaengine, bus_addr_t dst, uint64_t fillpattern, bus_size_t len, bus_dmaengine_callback_t callback_fn, void *callback_arg, uint32_t flags) { struct ioat_fill_hw_descriptor *hw_desc; struct ioat_descriptor *desc; struct ioat_softc *ioat; ioat = to_ioat_softc(dmaengine); CTR2(KTR_IOAT, "%s channel=%u", __func__, ioat->chan_idx); if ((ioat->capabilities & IOAT_DMACAP_BFILL) == 0) { ioat_log_message(0, "%s: Device lacks BFILL capability\n", __func__); return (NULL); } if ((dst & (0xffffull << 48)) != 0) { ioat_log_message(0, "%s: High 16 bits of dst invalid\n", __func__); return (NULL); } desc = ioat_op_generic(ioat, IOAT_OP_FILL, len, 0, dst, callback_fn, callback_arg, flags); if (desc == NULL) return (NULL); hw_desc = &ioat_get_descriptor(ioat, desc->id)->fill; hw_desc->src_data = fillpattern; if (g_ioat_debug_level >= 3) dump_descriptor(hw_desc); ioat_submit_single(ioat); return (&desc->bus_dmadesc); } /* * Ring Management */ static inline uint32_t ioat_get_active(struct ioat_softc *ioat) { return ((ioat->head - ioat->tail) & ((1 << ioat->ring_size_order) - 1)); } static inline uint32_t ioat_get_ring_space(struct ioat_softc *ioat) { return ((1 << ioat->ring_size_order) - ioat_get_active(ioat) - 1); } /* * Reserves space in this IOAT descriptor ring by ensuring enough slots remain * for 'num_descs'. * * If mflags contains M_WAITOK, blocks until enough space is available. * * Returns zero on success, or an errno on error. If num_descs is beyond the * maximum ring size, returns EINVAl; if allocation would block and mflags * contains M_NOWAIT, returns EAGAIN. * * Must be called with the submit_lock held; returns with the lock held. The * lock may be dropped to allocate the ring. * * (The submit_lock is needed to add any entries to the ring, so callers are * assured enough room is available.) 
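ioat_get_active() and ioat_get_ring_space() above are the usual power-of-two ring bookkeeping: head and tail are free-running counters, and one slot is deliberately left unused so a full ring can be told apart from an empty one. The same arithmetic in a self-contained form:

#include <stdint.h>

/*
 * Standalone version of the ring occupancy math: 'order' is the
 * log2 of the ring size, exactly as in ring_size_order above.
 */
static inline uint32_t
example_ring_active(uint32_t head, uint32_t tail, unsigned order)
{
        return ((head - tail) & ((1u << order) - 1));
}

static inline uint32_t
example_ring_space(uint32_t head, uint32_t tail, unsigned order)
{
        return ((1u << order) - example_ring_active(head, tail, order) - 1);
}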
*/ static int ioat_reserve_space(struct ioat_softc *ioat, uint32_t num_descs, int mflags) { boolean_t dug; int error; mtx_assert(&ioat->submit_lock, MA_OWNED); error = 0; dug = FALSE; if (num_descs < 1 || num_descs >= (1 << ioat->ring_size_order)) { error = EINVAL; goto out; } for (;;) { if (ioat->quiescing) { error = ENXIO; goto out; } if (ioat_get_ring_space(ioat) >= num_descs) goto out; CTR3(KTR_IOAT, "%s channel=%u starved (%u)", __func__, ioat->chan_idx, num_descs); if (!dug && !ioat->is_submitter_processing) { ioat->is_submitter_processing = TRUE; mtx_unlock(&ioat->submit_lock); CTR2(KTR_IOAT, "%s channel=%u attempting to process events", __func__, ioat->chan_idx); ioat_process_events(ioat, FALSE); mtx_lock(&ioat->submit_lock); dug = TRUE; KASSERT(ioat->is_submitter_processing == TRUE, ("is_submitter_processing")); ioat->is_submitter_processing = FALSE; wakeup(&ioat->tail); continue; } if ((mflags & M_WAITOK) == 0) { error = EAGAIN; break; } CTR2(KTR_IOAT, "%s channel=%u blocking on completions", __func__, ioat->chan_idx); msleep(&ioat->tail, &ioat->submit_lock, 0, "ioat_full", 0); continue; } out: mtx_assert(&ioat->submit_lock, MA_OWNED); KASSERT(!ioat->quiescing || error == ENXIO, ("reserved during quiesce")); return (error); } static void ioat_free_ring(struct ioat_softc *ioat, uint32_t size, struct ioat_descriptor *ring) { - free(ring, M_IOAT); + free_domain(ring, M_IOAT); } static struct ioat_descriptor * ioat_get_ring_entry(struct ioat_softc *ioat, uint32_t index) { return (&ioat->ring[index % (1 << ioat->ring_size_order)]); } static union ioat_hw_descriptor * ioat_get_descriptor(struct ioat_softc *ioat, uint32_t index) { return (&ioat->hw_desc_ring[index % (1 << ioat->ring_size_order)]); } static void ioat_halted_debug(struct ioat_softc *ioat, uint32_t chanerr) { union ioat_hw_descriptor *desc; ioat_log_message(0, "Channel halted (%b)\n", (int)chanerr, IOAT_CHANERR_STR); if (chanerr == 0) return; mtx_assert(&ioat->cleanup_lock, MA_OWNED); desc = ioat_get_descriptor(ioat, ioat->tail + 0); dump_descriptor(desc); desc = ioat_get_descriptor(ioat, ioat->tail + 1); dump_descriptor(desc); } static void ioat_poll_timer_callback(void *arg) { struct ioat_softc *ioat; ioat = arg; ioat_log_message(3, "%s\n", __func__); ioat_process_events(ioat, FALSE); mtx_lock(&ioat->submit_lock); if (ioat_get_active(ioat) > 0) callout_schedule(&ioat->poll_timer, 1); mtx_unlock(&ioat->submit_lock); } /* * Support Functions */ static void ioat_submit_single(struct ioat_softc *ioat) { mtx_assert(&ioat->submit_lock, MA_OWNED); ioat->head++; CTR4(KTR_IOAT, "%s channel=%u head=%u tail=%u", __func__, ioat->chan_idx, ioat->head, ioat->tail); ioat->stats.descriptors_submitted++; } static int ioat_reset_hw(struct ioat_softc *ioat) { uint64_t status; uint32_t chanerr; unsigned timeout; int error; CTR2(KTR_IOAT, "%s channel=%u", __func__, ioat->chan_idx); mtx_lock(&ioat->submit_lock); while (ioat->resetting && !ioat->destroying) msleep(&ioat->resetting, &ioat->submit_lock, 0, "IRH_drain", 0); if (ioat->destroying) { mtx_unlock(&ioat->submit_lock); return (ENXIO); } ioat->resetting = TRUE; ioat->quiescing = TRUE; mtx_unlock(&ioat->submit_lock); mtx_lock(&ioat->cleanup_lock); while (ioat_get_active(ioat) > 0) msleep(&ioat->tail, &ioat->cleanup_lock, 0, "ioat_drain", 1); /* * Suspend ioat_process_events while the hardware and softc are in an * indeterminate state. 
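Flow control between submitters and the completion path rests on sleeping and waking on the address of the tail pointer: ioat_reserve_space() msleep(9)s on &ioat->tail when the ring is full, and the completion path wakeup(9)s that address after retiring descriptors. A reduced sketch of the pairing, with placeholder structure, lock, and field names:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>

struct example_ring {
        struct mtx      submit_lock;
        uint32_t        head;
        uint32_t        tail;
        unsigned        order;
};

static uint32_t
example_space(struct example_ring *r)
{
        uint32_t active;

        /* Same occupancy math as shown earlier. */
        active = (r->head - r->tail) & ((1u << r->order) - 1);
        return ((1u << r->order) - active - 1);
}

/* Producer: sleep on the tail pointer until the consumer makes room. */
static void
example_wait_for_space(struct example_ring *r, uint32_t needed)
{
        mtx_assert(&r->submit_lock, MA_OWNED);
        while (example_space(r) < needed)
                msleep(&r->tail, &r->submit_lock, 0, "ringfull", 0);
}

/* Consumer: after retiring 'n' descriptors, wake blocked producers. */
static void
example_retire(struct example_ring *r, uint32_t n)
{
        r->tail += n;
        wakeup(&r->tail);
}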
*/ ioat->resetting_cleanup = TRUE; mtx_unlock(&ioat->cleanup_lock); CTR2(KTR_IOAT, "%s channel=%u quiesced and drained", __func__, ioat->chan_idx); status = ioat_get_chansts(ioat); if (is_ioat_active(status) || is_ioat_idle(status)) ioat_suspend(ioat); /* Wait at most 20 ms */ for (timeout = 0; (is_ioat_active(status) || is_ioat_idle(status)) && timeout < 20; timeout++) { DELAY(1000); status = ioat_get_chansts(ioat); } if (timeout == 20) { error = ETIMEDOUT; goto out; } KASSERT(ioat_get_active(ioat) == 0, ("active after quiesce")); chanerr = ioat_read_4(ioat, IOAT_CHANERR_OFFSET); ioat_write_4(ioat, IOAT_CHANERR_OFFSET, chanerr); CTR2(KTR_IOAT, "%s channel=%u hardware suspended", __func__, ioat->chan_idx); /* * IOAT v3 workaround - CHANERRMSK_INT with 3E07h to masks out errors * that can cause stability issues for IOAT v3. */ pci_write_config(ioat->device, IOAT_CFG_CHANERRMASK_INT_OFFSET, 0x3e07, 4); chanerr = pci_read_config(ioat->device, IOAT_CFG_CHANERR_INT_OFFSET, 4); pci_write_config(ioat->device, IOAT_CFG_CHANERR_INT_OFFSET, chanerr, 4); /* * BDXDE and BWD models reset MSI-X registers on device reset. * Save/restore their contents manually. */ if (ioat_model_resets_msix(ioat)) { ioat_log_message(1, "device resets MSI-X registers; saving\n"); pci_save_state(ioat->device); } ioat_reset(ioat); CTR2(KTR_IOAT, "%s channel=%u hardware reset", __func__, ioat->chan_idx); /* Wait at most 20 ms */ for (timeout = 0; ioat_reset_pending(ioat) && timeout < 20; timeout++) DELAY(1000); if (timeout == 20) { error = ETIMEDOUT; goto out; } if (ioat_model_resets_msix(ioat)) { ioat_log_message(1, "device resets registers; restored\n"); pci_restore_state(ioat->device); } /* Reset attempts to return the hardware to "halted." */ status = ioat_get_chansts(ioat); if (is_ioat_active(status) || is_ioat_idle(status)) { /* So this really shouldn't happen... */ ioat_log_message(0, "Device is active after a reset?\n"); ioat_write_chanctrl(ioat, IOAT_CHANCTRL_RUN); error = 0; goto out; } chanerr = ioat_read_4(ioat, IOAT_CHANERR_OFFSET); if (chanerr != 0) { mtx_lock(&ioat->cleanup_lock); ioat_halted_debug(ioat, chanerr); mtx_unlock(&ioat->cleanup_lock); error = EIO; goto out; } /* * Bring device back online after reset. Writing CHAINADDR brings the * device back to active. * * The internal ring counter resets to zero, so we have to start over * at zero as well. */ ioat->tail = ioat->head = 0; ioat->last_seen = 0; *ioat->comp_update = 0; ioat_write_chanctrl(ioat, IOAT_CHANCTRL_RUN); ioat_write_chancmp(ioat, ioat->comp_update_bus_addr); ioat_write_chainaddr(ioat, RING_PHYS_ADDR(ioat, 0)); error = 0; CTR2(KTR_IOAT, "%s channel=%u configured channel", __func__, ioat->chan_idx); out: /* Enqueues a null operation and ensures it completes. */ if (error == 0) { error = ioat_start_channel(ioat); CTR2(KTR_IOAT, "%s channel=%u started channel", __func__, ioat->chan_idx); } /* * Resume completions now that ring state is consistent. 
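ioat_reset_hw() brackets the channel reset with pci_save_state()/pci_restore_state() on BWD and BDXDE parts because those models lose their MSI-X programming across a reset. The bracket in isolation, with the device-specific reset left as a placeholder:

#include <sys/param.h>
#include <sys/bus.h>
#include <dev/pci/pcivar.h>

/*
 * Snapshot PCI config state (including MSI-X programming) before a
 * reset that is known to clobber it, and reprogram it afterwards.
 * The do_reset callback stands in for the device-specific sequence.
 */
static void
example_reset_preserving_config(device_t dev, void (*do_reset)(device_t))
{
        pci_save_state(dev);
        do_reset(dev);
        pci_restore_state(dev);
}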
*/ mtx_lock(&ioat->cleanup_lock); ioat->resetting_cleanup = FALSE; mtx_unlock(&ioat->cleanup_lock); /* Unblock submission of new work */ mtx_lock(&ioat->submit_lock); ioat->quiescing = FALSE; wakeup(&ioat->quiescing); ioat->resetting = FALSE; wakeup(&ioat->resetting); CTR2(KTR_IOAT, "%s channel=%u reset done", __func__, ioat->chan_idx); mtx_unlock(&ioat->submit_lock); return (error); } static int sysctl_handle_chansts(SYSCTL_HANDLER_ARGS) { struct ioat_softc *ioat; struct sbuf sb; uint64_t status; int error; ioat = arg1; status = ioat_get_chansts(ioat) & IOAT_CHANSTS_STATUS; sbuf_new_for_sysctl(&sb, NULL, 256, req); switch (status) { case IOAT_CHANSTS_ACTIVE: sbuf_printf(&sb, "ACTIVE"); break; case IOAT_CHANSTS_IDLE: sbuf_printf(&sb, "IDLE"); break; case IOAT_CHANSTS_SUSPENDED: sbuf_printf(&sb, "SUSPENDED"); break; case IOAT_CHANSTS_HALTED: sbuf_printf(&sb, "HALTED"); break; case IOAT_CHANSTS_ARMED: sbuf_printf(&sb, "ARMED"); break; default: sbuf_printf(&sb, "UNKNOWN"); break; } error = sbuf_finish(&sb); sbuf_delete(&sb); if (error != 0 || req->newptr == NULL) return (error); return (EINVAL); } static int sysctl_handle_dpi(SYSCTL_HANDLER_ARGS) { struct ioat_softc *ioat; struct sbuf sb; #define PRECISION "1" const uintmax_t factor = 10; uintmax_t rate; int error; ioat = arg1; sbuf_new_for_sysctl(&sb, NULL, 16, req); if (ioat->stats.interrupts == 0) { sbuf_printf(&sb, "NaN"); goto out; } rate = ioat->stats.descriptors_processed * factor / ioat->stats.interrupts; sbuf_printf(&sb, "%ju.%." PRECISION "ju", rate / factor, rate % factor); #undef PRECISION out: error = sbuf_finish(&sb); sbuf_delete(&sb); if (error != 0 || req->newptr == NULL) return (error); return (EINVAL); } static int sysctl_handle_reset(SYSCTL_HANDLER_ARGS) { struct ioat_softc *ioat; int error, arg; ioat = arg1; arg = 0; error = SYSCTL_OUT(req, &arg, sizeof(arg)); if (error != 0 || req->newptr == NULL) return (error); error = SYSCTL_IN(req, &arg, sizeof(arg)); if (error != 0) return (error); if (arg != 0) error = ioat_reset_hw(ioat); return (error); } static void dump_descriptor(void *hw_desc) { int i, j; for (i = 0; i < 2; i++) { for (j = 0; j < 8; j++) printf("%08x ", ((uint32_t *)hw_desc)[i * 8 + j]); printf("\n"); } } static void ioat_setup_sysctl(device_t device) { struct sysctl_oid_list *par, *statpar, *state, *hammer; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree, *tmp; struct ioat_softc *ioat; ioat = DEVICE2SOFTC(device); ctx = device_get_sysctl_ctx(device); tree = device_get_sysctl_tree(device); par = SYSCTL_CHILDREN(tree); SYSCTL_ADD_INT(ctx, par, OID_AUTO, "version", CTLFLAG_RD, &ioat->version, 0, "HW version (0xMM form)"); SYSCTL_ADD_UINT(ctx, par, OID_AUTO, "max_xfer_size", CTLFLAG_RD, &ioat->max_xfer_size, 0, "HW maximum transfer size"); SYSCTL_ADD_INT(ctx, par, OID_AUTO, "intrdelay_supported", CTLFLAG_RD, &ioat->intrdelay_supported, 0, "Is INTRDELAY supported"); SYSCTL_ADD_U16(ctx, par, OID_AUTO, "intrdelay_max", CTLFLAG_RD, &ioat->intrdelay_max, 0, "Maximum configurable INTRDELAY on this channel (microseconds)"); tmp = SYSCTL_ADD_NODE(ctx, par, OID_AUTO, "state", CTLFLAG_RD, NULL, "IOAT channel internal state"); state = SYSCTL_CHILDREN(tmp); SYSCTL_ADD_UINT(ctx, state, OID_AUTO, "ring_size_order", CTLFLAG_RD, &ioat->ring_size_order, 0, "SW descriptor ring size order"); SYSCTL_ADD_UINT(ctx, state, OID_AUTO, "head", CTLFLAG_RD, &ioat->head, 0, "SW descriptor head pointer index"); SYSCTL_ADD_UINT(ctx, state, OID_AUTO, "tail", CTLFLAG_RD, &ioat->tail, 0, "SW descriptor tail pointer index"); 
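sysctl_handle_dpi() above reports descriptors-per-interrupt with one decimal digit using only integer arithmetic: scale by 10, then print quotient and remainder. The same trick as a tiny userland program, purely for illustration:

#include <stdint.h>
#include <stdio.h>

/*
 * Integer fixed-point formatting: one decimal digit, no floating point.
 */
static void
print_rate(uint64_t descriptors, uint64_t interrupts)
{
        const uint64_t factor = 10;
        uint64_t rate;

        if (interrupts == 0) {
                printf("NaN\n");
                return;
        }
        rate = descriptors * factor / interrupts;
        printf("%ju.%1ju\n", (uintmax_t)(rate / factor),
            (uintmax_t)(rate % factor));
}

int
main(void)
{
        print_rate(12345, 678);         /* prints "18.2" */
        return (0);
}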
SYSCTL_ADD_UQUAD(ctx, state, OID_AUTO, "last_completion", CTLFLAG_RD, ioat->comp_update, "HW addr of last completion"); SYSCTL_ADD_INT(ctx, state, OID_AUTO, "is_submitter_processing", CTLFLAG_RD, &ioat->is_submitter_processing, 0, "submitter processing"); SYSCTL_ADD_PROC(ctx, state, OID_AUTO, "chansts", CTLTYPE_STRING | CTLFLAG_RD, ioat, 0, sysctl_handle_chansts, "A", "String of the channel status"); SYSCTL_ADD_U16(ctx, state, OID_AUTO, "intrdelay", CTLFLAG_RD, &ioat->cached_intrdelay, 0, "Current INTRDELAY on this channel (cached, microseconds)"); tmp = SYSCTL_ADD_NODE(ctx, par, OID_AUTO, "hammer", CTLFLAG_RD, NULL, "Big hammers (mostly for testing)"); hammer = SYSCTL_CHILDREN(tmp); SYSCTL_ADD_PROC(ctx, hammer, OID_AUTO, "force_hw_reset", CTLTYPE_INT | CTLFLAG_RW, ioat, 0, sysctl_handle_reset, "I", "Set to non-zero to reset the hardware"); tmp = SYSCTL_ADD_NODE(ctx, par, OID_AUTO, "stats", CTLFLAG_RD, NULL, "IOAT channel statistics"); statpar = SYSCTL_CHILDREN(tmp); SYSCTL_ADD_UQUAD(ctx, statpar, OID_AUTO, "interrupts", CTLFLAG_RW, &ioat->stats.interrupts, "Number of interrupts processed on this channel"); SYSCTL_ADD_UQUAD(ctx, statpar, OID_AUTO, "descriptors", CTLFLAG_RW, &ioat->stats.descriptors_processed, "Number of descriptors processed on this channel"); SYSCTL_ADD_UQUAD(ctx, statpar, OID_AUTO, "submitted", CTLFLAG_RW, &ioat->stats.descriptors_submitted, "Number of descriptors submitted to this channel"); SYSCTL_ADD_UQUAD(ctx, statpar, OID_AUTO, "errored", CTLFLAG_RW, &ioat->stats.descriptors_error, "Number of descriptors failed by channel errors"); SYSCTL_ADD_U32(ctx, statpar, OID_AUTO, "halts", CTLFLAG_RW, &ioat->stats.channel_halts, 0, "Number of times the channel has halted"); SYSCTL_ADD_U32(ctx, statpar, OID_AUTO, "last_halt_chanerr", CTLFLAG_RW, &ioat->stats.last_halt_chanerr, 0, "The raw CHANERR when the channel was last halted"); SYSCTL_ADD_PROC(ctx, statpar, OID_AUTO, "desc_per_interrupt", CTLTYPE_STRING | CTLFLAG_RD, ioat, 0, sysctl_handle_dpi, "A", "Descriptors per interrupt"); } static void ioat_get(struct ioat_softc *ioat) { mtx_assert(&ioat->submit_lock, MA_OWNED); KASSERT(ioat->refcnt < UINT32_MAX, ("refcnt overflow")); ioat->refcnt++; } static void ioat_put(struct ioat_softc *ioat) { mtx_assert(&ioat->submit_lock, MA_OWNED); KASSERT(ioat->refcnt >= 1, ("refcnt error")); if (--ioat->refcnt == 0) wakeup(&ioat->refcnt); } static void ioat_drain_locked(struct ioat_softc *ioat) { mtx_assert(&ioat->submit_lock, MA_OWNED); while (ioat->refcnt > 0) msleep(&ioat->refcnt, &ioat->submit_lock, 0, "ioat_drain", 0); } #ifdef DDB #define _db_show_lock(lo) LOCK_CLASS(lo)->lc_ddb_show(lo) #define db_show_lock(lk) _db_show_lock(&(lk)->lock_object) DB_SHOW_COMMAND(ioat, db_show_ioat) { struct ioat_softc *sc; unsigned idx; if (!have_addr) goto usage; idx = (unsigned)addr; if (idx >= ioat_channel_index) goto usage; sc = ioat_channel[idx]; db_printf("ioat softc at %p\n", sc); if (sc == NULL) return; db_printf(" version: %d\n", sc->version); db_printf(" chan_idx: %u\n", sc->chan_idx); db_printf(" submit_lock: "); db_show_lock(&sc->submit_lock); db_printf(" capabilities: %b\n", (int)sc->capabilities, IOAT_DMACAP_STR); db_printf(" cached_intrdelay: %u\n", sc->cached_intrdelay); db_printf(" *comp_update: 0x%jx\n", (uintmax_t)*sc->comp_update); db_printf(" poll_timer:\n"); db_printf(" c_time: %ju\n", (uintmax_t)sc->poll_timer.c_time); db_printf(" c_arg: %p\n", sc->poll_timer.c_arg); db_printf(" c_func: %p\n", sc->poll_timer.c_func); db_printf(" c_lock: %p\n", sc->poll_timer.c_lock); db_printf(" 
c_flags: 0x%x\n", (unsigned)sc->poll_timer.c_flags); db_printf(" quiescing: %d\n", (int)sc->quiescing); db_printf(" destroying: %d\n", (int)sc->destroying); db_printf(" is_submitter_processing: %d\n", (int)sc->is_submitter_processing); db_printf(" intrdelay_supported: %d\n", (int)sc->intrdelay_supported); db_printf(" resetting: %d\n", (int)sc->resetting); db_printf(" head: %u\n", sc->head); db_printf(" tail: %u\n", sc->tail); db_printf(" ring_size_order: %u\n", sc->ring_size_order); db_printf(" last_seen: 0x%lx\n", sc->last_seen); db_printf(" ring: %p\n", sc->ring); db_printf(" descriptors: %p\n", sc->hw_desc_ring); db_printf(" descriptors (phys): 0x%jx\n", (uintmax_t)sc->hw_desc_bus_addr); db_printf(" ring[%u] (tail):\n", sc->tail % (1 << sc->ring_size_order)); db_printf(" id: %u\n", ioat_get_ring_entry(sc, sc->tail)->id); db_printf(" addr: 0x%lx\n", RING_PHYS_ADDR(sc, sc->tail)); db_printf(" next: 0x%lx\n", ioat_get_descriptor(sc, sc->tail)->generic.next); db_printf(" ring[%u] (head - 1):\n", (sc->head - 1) % (1 << sc->ring_size_order)); db_printf(" id: %u\n", ioat_get_ring_entry(sc, sc->head - 1)->id); db_printf(" addr: 0x%lx\n", RING_PHYS_ADDR(sc, sc->head - 1)); db_printf(" next: 0x%lx\n", ioat_get_descriptor(sc, sc->head - 1)->generic.next); db_printf(" ring[%u] (head):\n", (sc->head) % (1 << sc->ring_size_order)); db_printf(" id: %u\n", ioat_get_ring_entry(sc, sc->head)->id); db_printf(" addr: 0x%lx\n", RING_PHYS_ADDR(sc, sc->head)); db_printf(" next: 0x%lx\n", ioat_get_descriptor(sc, sc->head)->generic.next); for (idx = 0; idx < (1 << sc->ring_size_order); idx++) if ((*sc->comp_update & IOAT_CHANSTS_COMPLETED_DESCRIPTOR_MASK) == RING_PHYS_ADDR(sc, idx)) db_printf(" ring[%u] == hardware tail\n", idx); db_printf(" cleanup_lock: "); db_show_lock(&sc->cleanup_lock); db_printf(" refcnt: %u\n", sc->refcnt); db_printf(" stats:\n"); db_printf(" interrupts: %lu\n", sc->stats.interrupts); db_printf(" descriptors_processed: %lu\n", sc->stats.descriptors_processed); db_printf(" descriptors_error: %lu\n", sc->stats.descriptors_error); db_printf(" descriptors_submitted: %lu\n", sc->stats.descriptors_submitted); db_printf(" channel_halts: %u\n", sc->stats.channel_halts); db_printf(" last_halt_chanerr: %u\n", sc->stats.last_halt_chanerr); if (db_pager_quit) return; db_printf(" hw status:\n"); db_printf(" status: 0x%lx\n", ioat_get_chansts(sc)); db_printf(" chanctrl: 0x%x\n", (unsigned)ioat_read_2(sc, IOAT_CHANCTRL_OFFSET)); db_printf(" chancmd: 0x%x\n", (unsigned)ioat_read_1(sc, IOAT_CHANCMD_OFFSET)); db_printf(" dmacount: 0x%x\n", (unsigned)ioat_read_2(sc, IOAT_DMACOUNT_OFFSET)); db_printf(" chainaddr: 0x%lx\n", ioat_read_double_4(sc, IOAT_CHAINADDR_OFFSET_LOW)); db_printf(" chancmp: 0x%lx\n", ioat_read_double_4(sc, IOAT_CHANCMP_OFFSET_LOW)); db_printf(" chanerr: %b\n", (int)ioat_read_4(sc, IOAT_CHANERR_OFFSET), IOAT_CHANERR_STR); return; usage: db_printf("usage: show ioat <0-%u>\n", ioat_channel_index); return; } #endif /* DDB */ Index: projects/clang900-import/sys/dev/ioat/ioat_internal.h =================================================================== --- projects/clang900-import/sys/dev/ioat/ioat_internal.h (revision 352586) +++ projects/clang900-import/sys/dev/ioat/ioat_internal.h (revision 352587) @@ -1,609 +1,611 @@ /*- * Copyright (C) 2012 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ __FBSDID("$FreeBSD$"); #ifndef __IOAT_INTERNAL_H__ #define __IOAT_INTERNAL_H__ #include #define DEVICE2SOFTC(dev) ((struct ioat_softc *) device_get_softc(dev)) #define KTR_IOAT KTR_SPARE3 #define ioat_read_chancnt(ioat) \ ioat_read_1((ioat), IOAT_CHANCNT_OFFSET) #define ioat_read_xfercap(ioat) \ (ioat_read_1((ioat), IOAT_XFERCAP_OFFSET) & IOAT_XFERCAP_VALID_MASK) #define ioat_write_intrctrl(ioat, value) \ ioat_write_1((ioat), IOAT_INTRCTRL_OFFSET, (value)) #define ioat_read_cbver(ioat) \ (ioat_read_1((ioat), IOAT_CBVER_OFFSET) & 0xFF) #define ioat_read_dmacapability(ioat) \ ioat_read_4((ioat), IOAT_DMACAPABILITY_OFFSET) #define ioat_write_chanctrl(ioat, value) \ ioat_write_2((ioat), IOAT_CHANCTRL_OFFSET, (value)) static __inline uint64_t ioat_bus_space_read_8_lower_first(bus_space_tag_t tag, bus_space_handle_t handle, bus_size_t offset) { return (bus_space_read_4(tag, handle, offset) | ((uint64_t)bus_space_read_4(tag, handle, offset + 4)) << 32); } static __inline void ioat_bus_space_write_8_lower_first(bus_space_tag_t tag, bus_space_handle_t handle, bus_size_t offset, uint64_t val) { bus_space_write_4(tag, handle, offset, val); bus_space_write_4(tag, handle, offset + 4, val >> 32); } #ifdef __i386__ #define ioat_bus_space_read_8 ioat_bus_space_read_8_lower_first #define ioat_bus_space_write_8 ioat_bus_space_write_8_lower_first #else #define ioat_bus_space_read_8(tag, handle, offset) \ bus_space_read_8((tag), (handle), (offset)) #define ioat_bus_space_write_8(tag, handle, offset, val) \ bus_space_write_8((tag), (handle), (offset), (val)) #endif #define ioat_read_1(ioat, offset) \ bus_space_read_1((ioat)->pci_bus_tag, (ioat)->pci_bus_handle, \ (offset)) #define ioat_read_2(ioat, offset) \ bus_space_read_2((ioat)->pci_bus_tag, (ioat)->pci_bus_handle, \ (offset)) #define ioat_read_4(ioat, offset) \ bus_space_read_4((ioat)->pci_bus_tag, (ioat)->pci_bus_handle, \ (offset)) #define ioat_read_8(ioat, offset) \ ioat_bus_space_read_8((ioat)->pci_bus_tag, (ioat)->pci_bus_handle, \ (offset)) #define ioat_read_double_4(ioat, offset) \ ioat_bus_space_read_8_lower_first((ioat)->pci_bus_tag, \ (ioat)->pci_bus_handle, (offset)) #define ioat_write_1(ioat, offset, value) \ bus_space_write_1((ioat)->pci_bus_tag, (ioat)->pci_bus_handle, \ (offset), (value)) #define ioat_write_2(ioat, offset, value) \ bus_space_write_2((ioat)->pci_bus_tag, (ioat)->pci_bus_handle, \ (offset), (value)) #define ioat_write_4(ioat, offset, value) \ 
bus_space_write_4((ioat)->pci_bus_tag, (ioat)->pci_bus_handle, \ (offset), (value)) #define ioat_write_8(ioat, offset, value) \ ioat_bus_space_write_8((ioat)->pci_bus_tag, (ioat)->pci_bus_handle, \ (offset), (value)) #define ioat_write_double_4(ioat, offset, value) \ ioat_bus_space_write_8_lower_first((ioat)->pci_bus_tag, \ (ioat)->pci_bus_handle, (offset), (value)) MALLOC_DECLARE(M_IOAT); SYSCTL_DECL(_hw_ioat); extern int g_ioat_debug_level; struct generic_dma_control { uint32_t int_enable:1; uint32_t src_snoop_disable:1; uint32_t dest_snoop_disable:1; uint32_t completion_update:1; uint32_t fence:1; uint32_t reserved1:1; uint32_t src_page_break:1; uint32_t dest_page_break:1; uint32_t bundle:1; uint32_t dest_dca:1; uint32_t hint:1; uint32_t reserved2:13; uint32_t op:8; }; struct ioat_generic_hw_descriptor { uint32_t size; union { uint32_t control_raw; struct generic_dma_control control_generic; } u; uint64_t src_addr; uint64_t dest_addr; uint64_t next; uint64_t reserved[4]; }; struct ioat_dma_hw_descriptor { uint32_t size; union { uint32_t control_raw; struct generic_dma_control control_generic; struct { uint32_t int_enable:1; uint32_t src_snoop_disable:1; uint32_t dest_snoop_disable:1; uint32_t completion_update:1; uint32_t fence:1; uint32_t null:1; uint32_t src_page_break:1; uint32_t dest_page_break:1; uint32_t bundle:1; uint32_t dest_dca:1; uint32_t hint:1; uint32_t reserved:13; #define IOAT_OP_COPY 0x00 uint32_t op:8; } control; } u; uint64_t src_addr; uint64_t dest_addr; uint64_t next; uint64_t next_src_addr; uint64_t next_dest_addr; uint64_t user1; uint64_t user2; }; struct ioat_fill_hw_descriptor { uint32_t size; union { uint32_t control_raw; struct generic_dma_control control_generic; struct { uint32_t int_enable:1; uint32_t reserved:1; uint32_t dest_snoop_disable:1; uint32_t completion_update:1; uint32_t fence:1; uint32_t reserved2:2; uint32_t dest_page_break:1; uint32_t bundle:1; uint32_t reserved3:15; #define IOAT_OP_FILL 0x01 uint32_t op:8; } control; } u; uint64_t src_data; uint64_t dest_addr; uint64_t next; uint64_t reserved; uint64_t next_dest_addr; uint64_t user1; uint64_t user2; }; struct ioat_crc32_hw_descriptor { uint32_t size; union { uint32_t control_raw; struct generic_dma_control control_generic; struct { uint32_t int_enable:1; uint32_t src_snoop_disable:1; uint32_t dest_snoop_disable:1; uint32_t completion_update:1; uint32_t fence:1; uint32_t reserved1:3; uint32_t bundle:1; uint32_t dest_dca:1; uint32_t hint:1; uint32_t use_seed:1; /* * crc_location: * For IOAT_OP_MOVECRC_TEST and IOAT_OP_CRC_TEST: * 0: comparison value is pointed to by CRC Address * field. * 1: comparison value follows data in wire format * ("inverted reflected bit order") in the 4 bytes * following the source data. * * For IOAT_OP_CRC_STORE: * 0: Result will be stored at location pointed to by * CRC Address field (in wire format). * 1: Result will be stored directly following the * source data. * * For IOAT_OP_MOVECRC_STORE: * 0: Result will be stored at location pointed to by * CRC Address field (in wire format). * 1: Result will be stored directly following the * *destination* data. */ uint32_t crc_location:1; uint32_t reserved2:11; /* * MOVECRC - Move data in the same way as standard copy * operation, but also compute CRC32. * * CRC - Only compute CRC on source data. * * There is a CRC accumulator register in the hardware. * If 'initial' is set, it is initialized to the value * in 'seed.' * * In all modes, these operators accumulate size bytes * at src_addr into the running CRC32C. 
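The descriptor comment above says the CRC operations accumulate the source bytes into a running CRC32C. For comparison, a plain bitwise software CRC32C (Castagnoli polynomial, reflected form 0x82F63B78) is sketched below; whether the engine applies the conventional ~0 pre/post conditioning or starts from the raw seed value is not established here, so the conditioning shown is an assumption of the common convention:

#include <stddef.h>
#include <stdint.h>

/*
 * Reference software CRC32C, one bit at a time.  Slow but unambiguous;
 * the pre/post inversion is the usual convention, not necessarily what
 * the hardware seed field implies.
 */
static uint32_t
crc32c_sw(uint32_t crc, const void *buf, size_t len)
{
        const uint8_t *p = buf;
        int bit;

        crc = ~crc;
        while (len-- > 0) {
                crc ^= *p++;
                for (bit = 0; bit < 8; bit++)
                        crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
        }
        return (~crc);
}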
* * Store mode emits the accumulated CRC, in wire * format, as specified by the crc_location bit above. * * Test mode compares the accumulated CRC against the * reference CRC, as described in crc_location above. * On failure, halts the DMA engine with a CRC error * status. */ #define IOAT_OP_MOVECRC 0x41 #define IOAT_OP_MOVECRC_TEST 0x42 #define IOAT_OP_MOVECRC_STORE 0x43 #define IOAT_OP_CRC 0x81 #define IOAT_OP_CRC_TEST 0x82 #define IOAT_OP_CRC_STORE 0x83 uint32_t op:8; } control; } u; uint64_t src_addr; uint64_t dest_addr; uint64_t next; uint64_t next_src_addr; uint64_t next_dest_addr; uint32_t seed; uint32_t reserved; uint64_t crc_address; }; struct ioat_xor_hw_descriptor { uint32_t size; union { uint32_t control_raw; struct generic_dma_control control_generic; struct { uint32_t int_enable:1; uint32_t src_snoop_disable:1; uint32_t dest_snoop_disable:1; uint32_t completion_update:1; uint32_t fence:1; uint32_t src_count:3; uint32_t bundle:1; uint32_t dest_dca:1; uint32_t hint:1; uint32_t reserved:13; #define IOAT_OP_XOR 0x87 #define IOAT_OP_XOR_VAL 0x88 uint32_t op:8; } control; } u; uint64_t src_addr; uint64_t dest_addr; uint64_t next; uint64_t src_addr2; uint64_t src_addr3; uint64_t src_addr4; uint64_t src_addr5; }; struct ioat_xor_ext_hw_descriptor { uint64_t src_addr6; uint64_t src_addr7; uint64_t src_addr8; uint64_t next; uint64_t reserved[4]; }; struct ioat_pq_hw_descriptor { uint32_t size; union { uint32_t control_raw; struct generic_dma_control control_generic; struct { uint32_t int_enable:1; uint32_t src_snoop_disable:1; uint32_t dest_snoop_disable:1; uint32_t completion_update:1; uint32_t fence:1; uint32_t src_count:3; uint32_t bundle:1; uint32_t dest_dca:1; uint32_t hint:1; uint32_t p_disable:1; uint32_t q_disable:1; uint32_t reserved:11; #define IOAT_OP_PQ 0x89 #define IOAT_OP_PQ_VAL 0x8a uint32_t op:8; } control; } u; uint64_t src_addr; uint64_t p_addr; uint64_t next; uint64_t src_addr2; uint64_t src_addr3; uint8_t coef[8]; uint64_t q_addr; }; struct ioat_pq_ext_hw_descriptor { uint64_t src_addr4; uint64_t src_addr5; uint64_t src_addr6; uint64_t next; uint64_t src_addr7; uint64_t src_addr8; uint64_t reserved[2]; }; struct ioat_pq_update_hw_descriptor { uint32_t size; union { uint32_t control_raw; struct generic_dma_control control_generic; struct { uint32_t int_enable:1; uint32_t src_snoop_disable:1; uint32_t dest_snoop_disable:1; uint32_t completion_update:1; uint32_t fence:1; uint32_t src_cnt:3; uint32_t bundle:1; uint32_t dest_dca:1; uint32_t hint:1; uint32_t p_disable:1; uint32_t q_disable:1; uint32_t reserved:3; uint32_t coef:8; #define IOAT_OP_PQ_UP 0x8b uint32_t op:8; } control; } u; uint64_t src_addr; uint64_t p_addr; uint64_t next; uint64_t src_addr2; uint64_t p_src; uint64_t q_src; uint64_t q_addr; }; struct ioat_raw_hw_descriptor { uint64_t field[8]; }; struct bus_dmadesc { bus_dmaengine_callback_t callback_fn; void *callback_arg; }; struct ioat_descriptor { struct bus_dmadesc bus_dmadesc; uint32_t id; bus_dmamap_t src_dmamap; bus_dmamap_t dst_dmamap; bus_dmamap_t src2_dmamap; bus_dmamap_t dst2_dmamap; bus_dmamap_t crc_dmamap; }; /* Unused by this driver at this time. */ #define IOAT_OP_MARKER 0x84 /* * Deprecated OPs -- v3 DMA generates an abort if given these. And this driver * doesn't support anything older than v3. */ #define IOAT_OP_OLD_XOR 0x85 #define IOAT_OP_OLD_XOR_VAL 0x86 /* One of these per allocated PCI device. 
*/ struct ioat_softc { bus_dmaengine_t dmaengine; #define to_ioat_softc(_dmaeng) \ ({ \ bus_dmaengine_t *_p = (_dmaeng); \ (struct ioat_softc *)((char *)_p - \ offsetof(struct ioat_softc, dmaengine)); \ }) device_t device; + int domain; + int cpu; int version; unsigned chan_idx; bus_space_tag_t pci_bus_tag; bus_space_handle_t pci_bus_handle; struct resource *pci_resource; int pci_resource_id; uint32_t max_xfer_size; uint32_t capabilities; uint32_t ring_size_order; uint16_t intrdelay_max; uint16_t cached_intrdelay; int rid; struct resource *res; void *tag; bus_dma_tag_t hw_desc_tag; bus_dmamap_t hw_desc_map; bus_dma_tag_t data_tag; bus_dma_tag_t data_crc_tag; bus_dma_tag_t comp_update_tag; bus_dmamap_t comp_update_map; uint64_t *comp_update; bus_addr_t comp_update_bus_addr; boolean_t quiescing; boolean_t destroying; boolean_t is_submitter_processing; boolean_t intrdelay_supported; boolean_t resetting; /* submit_lock */ boolean_t resetting_cleanup; /* cleanup_lock */ struct ioat_descriptor *ring; union ioat_hw_descriptor { struct ioat_generic_hw_descriptor generic; struct ioat_dma_hw_descriptor dma; struct ioat_fill_hw_descriptor fill; struct ioat_crc32_hw_descriptor crc32; struct ioat_xor_hw_descriptor xor; struct ioat_xor_ext_hw_descriptor xor_ext; struct ioat_pq_hw_descriptor pq; struct ioat_pq_ext_hw_descriptor pq_ext; struct ioat_raw_hw_descriptor raw; } *hw_desc_ring; bus_addr_t hw_desc_bus_addr; #define RING_PHYS_ADDR(sc, i) (sc)->hw_desc_bus_addr + \ (((i) % (1 << (sc)->ring_size_order)) * sizeof(struct ioat_dma_hw_descriptor)) struct mtx_padalign submit_lock; struct callout poll_timer; struct task reset_task; struct mtx_padalign cleanup_lock; uint32_t refcnt; uint32_t head; uint32_t acq_head; uint32_t tail; bus_addr_t last_seen; struct { uint64_t interrupts; uint64_t descriptors_processed; uint64_t descriptors_error; uint64_t descriptors_submitted; uint32_t channel_halts; uint32_t last_halt_chanerr; } stats; }; void ioat_test_attach(void); void ioat_test_detach(void); /* * XXX DO NOT USE this routine for obtaining the current completed descriptor. * * The double_4 read on ioat<3.3 appears to result in torn reads. And v3.2 * hardware is still commonplace (Broadwell Xeon has it). Instead, use the * device-pushed *comp_update. * * It is safe to use ioat_get_chansts() for the low status bits. */ static inline uint64_t ioat_get_chansts(struct ioat_softc *ioat) { uint64_t status; if (ioat->version >= IOAT_VER_3_3) status = ioat_read_8(ioat, IOAT_CHANSTS_OFFSET); else /* Must read lower 4 bytes before upper 4 bytes. 
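to_ioat_softc() in the structure above is the classic container-of construction: given a pointer to the embedded dmaengine member, subtract the member's offset to recover the enclosing softc. A generic standalone version of the idiom, with placeholder names:

#include <stddef.h>

struct outer {
        int     other_state;
        int     member;
};

/* Recover the enclosing structure from a pointer to its member. */
#define MEMBER_TO_OUTER(p)                                              \
        ((struct outer *)((char *)(p) - offsetof(struct outer, member)))

static struct outer *
outer_from_member(int *memberp)
{
        return (MEMBER_TO_OUTER(memberp));
}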
*/ status = ioat_read_double_4(ioat, IOAT_CHANSTS_OFFSET); return (status); } static inline void ioat_write_chancmp(struct ioat_softc *ioat, uint64_t addr) { if (ioat->version >= IOAT_VER_3_3) ioat_write_8(ioat, IOAT_CHANCMP_OFFSET_LOW, addr); else ioat_write_double_4(ioat, IOAT_CHANCMP_OFFSET_LOW, addr); } static inline void ioat_write_chainaddr(struct ioat_softc *ioat, uint64_t addr) { if (ioat->version >= IOAT_VER_3_3) ioat_write_8(ioat, IOAT_CHAINADDR_OFFSET_LOW, addr); else ioat_write_double_4(ioat, IOAT_CHAINADDR_OFFSET_LOW, addr); } static inline boolean_t is_ioat_active(uint64_t status) { return ((status & IOAT_CHANSTS_STATUS) == IOAT_CHANSTS_ACTIVE); } static inline boolean_t is_ioat_idle(uint64_t status) { return ((status & IOAT_CHANSTS_STATUS) == IOAT_CHANSTS_IDLE); } static inline boolean_t is_ioat_halted(uint64_t status) { return ((status & IOAT_CHANSTS_STATUS) == IOAT_CHANSTS_HALTED); } static inline boolean_t is_ioat_suspended(uint64_t status) { return ((status & IOAT_CHANSTS_STATUS) == IOAT_CHANSTS_SUSPENDED); } static inline void ioat_suspend(struct ioat_softc *ioat) { ioat_write_1(ioat, IOAT_CHANCMD_OFFSET, IOAT_CHANCMD_SUSPEND); } static inline void ioat_reset(struct ioat_softc *ioat) { ioat_write_1(ioat, IOAT_CHANCMD_OFFSET, IOAT_CHANCMD_RESET); } static inline boolean_t ioat_reset_pending(struct ioat_softc *ioat) { uint8_t cmd; cmd = ioat_read_1(ioat, IOAT_CHANCMD_OFFSET); return ((cmd & IOAT_CHANCMD_RESET) != 0); } #endif /* __IOAT_INTERNAL_H__ */ Index: projects/clang900-import/sys/dev/usb/controller/xhci.c =================================================================== --- projects/clang900-import/sys/dev/usb/controller/xhci.c (revision 352586) +++ projects/clang900-import/sys/dev/usb/controller/xhci.c (revision 352587) @@ -1,4369 +1,4372 @@ /* $FreeBSD$ */ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 Hans Petter Selasky. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * USB eXtensible Host Controller Interface, a.k.a. USB 3.0 controller. 
* * The XHCI 1.0 spec can be found at * http://www.intel.com/technology/usb/download/xHCI_Specification_for_USB.pdf * and the USB 3.0 spec at * http://www.usb.org/developers/docs/usb_30_spec_060910.zip */ /* * A few words about the design implementation: This driver emulates * the concept about TDs which is found in EHCI specification. This * way we achieve that the USB controller drivers look similar to * eachother which makes it easier to understand the code. */ #ifdef USB_GLOBAL_INCLUDE_FILE #include USB_GLOBAL_INCLUDE_FILE #else #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define USB_DEBUG_VAR xhcidebug #include #include #include #include #include #include #include #include #include #include #endif /* USB_GLOBAL_INCLUDE_FILE */ #include #include #define XHCI_BUS2SC(bus) \ ((struct xhci_softc *)(((uint8_t *)(bus)) - \ ((uint8_t *)&(((struct xhci_softc *)0)->sc_bus)))) static SYSCTL_NODE(_hw_usb, OID_AUTO, xhci, CTLFLAG_RW, 0, "USB XHCI"); static int xhcistreams; SYSCTL_INT(_hw_usb_xhci, OID_AUTO, streams, CTLFLAG_RWTUN, &xhcistreams, 0, "Set to enable streams mode support"); #ifdef USB_DEBUG static int xhcidebug; static int xhciroute; static int xhcipolling; static int xhcidma32; static int xhcictlstep; SYSCTL_INT(_hw_usb_xhci, OID_AUTO, debug, CTLFLAG_RWTUN, &xhcidebug, 0, "Debug level"); SYSCTL_INT(_hw_usb_xhci, OID_AUTO, xhci_port_route, CTLFLAG_RWTUN, &xhciroute, 0, "Routing bitmap for switching EHCI ports to the XHCI controller"); SYSCTL_INT(_hw_usb_xhci, OID_AUTO, use_polling, CTLFLAG_RWTUN, &xhcipolling, 0, "Set to enable software interrupt polling for the XHCI controller"); SYSCTL_INT(_hw_usb_xhci, OID_AUTO, dma32, CTLFLAG_RWTUN, &xhcidma32, 0, "Set to only use 32-bit DMA for the XHCI controller"); SYSCTL_INT(_hw_usb_xhci, OID_AUTO, ctlstep, CTLFLAG_RWTUN, &xhcictlstep, 0, "Set to enable control endpoint status stage stepping"); #else #define xhciroute 0 #define xhcidma32 0 #define xhcictlstep 0 #endif #define XHCI_INTR_ENDPT 1 struct xhci_std_temp { struct xhci_softc *sc; struct usb_page_cache *pc; struct xhci_td *td; struct xhci_td *td_next; uint32_t len; uint32_t offset; uint32_t max_packet_size; uint32_t average; uint16_t isoc_delta; uint16_t isoc_frame; uint8_t shortpkt; uint8_t multishort; uint8_t last_frame; uint8_t trb_type; uint8_t direction; uint8_t tbc; uint8_t tlbpc; uint8_t step_td; uint8_t do_isoc_sync; }; static void xhci_do_poll(struct usb_bus *); static void xhci_device_done(struct usb_xfer *, usb_error_t); static void xhci_root_intr(struct xhci_softc *); static void xhci_free_device_ext(struct usb_device *); static struct xhci_endpoint_ext *xhci_get_endpoint_ext(struct usb_device *, struct usb_endpoint_descriptor *); static usb_proc_callback_t xhci_configure_msg; static usb_error_t xhci_configure_device(struct usb_device *); static usb_error_t xhci_configure_endpoint(struct usb_device *, struct usb_endpoint_descriptor *, struct xhci_endpoint_ext *, uint16_t, uint8_t, uint8_t, uint8_t, uint16_t, uint16_t, uint8_t); static usb_error_t xhci_configure_mask(struct usb_device *, uint32_t, uint8_t); static usb_error_t xhci_cmd_evaluate_ctx(struct xhci_softc *, uint64_t, uint8_t); static void xhci_endpoint_doorbell(struct usb_xfer *); static void xhci_ctx_set_le32(struct xhci_softc *sc, volatile uint32_t *ptr, uint32_t val); static uint32_t xhci_ctx_get_le32(struct xhci_softc *sc, volatile uint32_t *ptr); static void 
xhci_ctx_set_le64(struct xhci_softc *sc, volatile uint64_t *ptr, uint64_t val); #ifdef USB_DEBUG static uint64_t xhci_ctx_get_le64(struct xhci_softc *sc, volatile uint64_t *ptr); #endif static const struct usb_bus_methods xhci_bus_methods; #ifdef USB_DEBUG static void xhci_dump_trb(struct xhci_trb *trb) { DPRINTFN(5, "trb = %p\n", trb); DPRINTFN(5, "qwTrb0 = 0x%016llx\n", (long long)le64toh(trb->qwTrb0)); DPRINTFN(5, "dwTrb2 = 0x%08x\n", le32toh(trb->dwTrb2)); DPRINTFN(5, "dwTrb3 = 0x%08x\n", le32toh(trb->dwTrb3)); } static void xhci_dump_endpoint(struct xhci_softc *sc, struct xhci_endp_ctx *pep) { DPRINTFN(5, "pep = %p\n", pep); DPRINTFN(5, "dwEpCtx0=0x%08x\n", xhci_ctx_get_le32(sc, &pep->dwEpCtx0)); DPRINTFN(5, "dwEpCtx1=0x%08x\n", xhci_ctx_get_le32(sc, &pep->dwEpCtx1)); DPRINTFN(5, "qwEpCtx2=0x%016llx\n", (long long)xhci_ctx_get_le64(sc, &pep->qwEpCtx2)); DPRINTFN(5, "dwEpCtx4=0x%08x\n", xhci_ctx_get_le32(sc, &pep->dwEpCtx4)); DPRINTFN(5, "dwEpCtx5=0x%08x\n", xhci_ctx_get_le32(sc, &pep->dwEpCtx5)); DPRINTFN(5, "dwEpCtx6=0x%08x\n", xhci_ctx_get_le32(sc, &pep->dwEpCtx6)); DPRINTFN(5, "dwEpCtx7=0x%08x\n", xhci_ctx_get_le32(sc, &pep->dwEpCtx7)); } static void xhci_dump_device(struct xhci_softc *sc, struct xhci_slot_ctx *psl) { DPRINTFN(5, "psl = %p\n", psl); DPRINTFN(5, "dwSctx0=0x%08x\n", xhci_ctx_get_le32(sc, &psl->dwSctx0)); DPRINTFN(5, "dwSctx1=0x%08x\n", xhci_ctx_get_le32(sc, &psl->dwSctx1)); DPRINTFN(5, "dwSctx2=0x%08x\n", xhci_ctx_get_le32(sc, &psl->dwSctx2)); DPRINTFN(5, "dwSctx3=0x%08x\n", xhci_ctx_get_le32(sc, &psl->dwSctx3)); } #endif uint8_t xhci_use_polling(void) { #ifdef USB_DEBUG return (xhcipolling != 0); #else return (0); #endif } static void xhci_iterate_hw_softc(struct usb_bus *bus, usb_bus_mem_sub_cb_t *cb) { struct xhci_softc *sc = XHCI_BUS2SC(bus); uint16_t i; cb(bus, &sc->sc_hw.root_pc, &sc->sc_hw.root_pg, sizeof(struct xhci_hw_root), XHCI_PAGE_SIZE); cb(bus, &sc->sc_hw.ctx_pc, &sc->sc_hw.ctx_pg, sizeof(struct xhci_dev_ctx_addr), XHCI_PAGE_SIZE); for (i = 0; i != sc->sc_noscratch; i++) { cb(bus, &sc->sc_hw.scratch_pc[i], &sc->sc_hw.scratch_pg[i], XHCI_PAGE_SIZE, XHCI_PAGE_SIZE); } } static void xhci_ctx_set_le32(struct xhci_softc *sc, volatile uint32_t *ptr, uint32_t val) { if (sc->sc_ctx_is_64_byte) { uint32_t offset; /* exploit the fact that our structures are XHCI_PAGE_SIZE aligned */ /* all contexts are initially 32-bytes */ offset = ((uintptr_t)ptr) & ((XHCI_PAGE_SIZE - 1) & ~(31U)); ptr = (volatile uint32_t *)(((volatile uint8_t *)ptr) + offset); } *ptr = htole32(val); } static uint32_t xhci_ctx_get_le32(struct xhci_softc *sc, volatile uint32_t *ptr) { if (sc->sc_ctx_is_64_byte) { uint32_t offset; /* exploit the fact that our structures are XHCI_PAGE_SIZE aligned */ /* all contexts are initially 32-bytes */ offset = ((uintptr_t)ptr) & ((XHCI_PAGE_SIZE - 1) & ~(31U)); ptr = (volatile uint32_t *)(((volatile uint8_t *)ptr) + offset); } return (le32toh(*ptr)); } static void xhci_ctx_set_le64(struct xhci_softc *sc, volatile uint64_t *ptr, uint64_t val) { if (sc->sc_ctx_is_64_byte) { uint32_t offset; /* exploit the fact that our structures are XHCI_PAGE_SIZE aligned */ /* all contexts are initially 32-bytes */ offset = ((uintptr_t)ptr) & ((XHCI_PAGE_SIZE - 1) & ~(31U)); ptr = (volatile uint64_t *)(((volatile uint8_t *)ptr) + offset); } *ptr = htole64(val); } #ifdef USB_DEBUG static uint64_t xhci_ctx_get_le64(struct xhci_softc *sc, volatile uint64_t *ptr) { if (sc->sc_ctx_is_64_byte) { uint32_t offset; /* exploit the fact that our structures are XHCI_PAGE_SIZE aligned 
*/ /* all contexts are initially 32-bytes */ offset = ((uintptr_t)ptr) & ((XHCI_PAGE_SIZE - 1) & ~(31U)); ptr = (volatile uint64_t *)(((volatile uint8_t *)ptr) + offset); } return (le64toh(*ptr)); } #endif static int xhci_reset_command_queue_locked(struct xhci_softc *sc) { struct usb_page_search buf_res; struct xhci_hw_root *phwr; uint64_t addr; uint32_t temp; DPRINTF("\n"); temp = XREAD4(sc, oper, XHCI_CRCR_LO); if (temp & XHCI_CRCR_LO_CRR) { DPRINTF("Command ring running\n"); temp &= ~(XHCI_CRCR_LO_CS | XHCI_CRCR_LO_CA); /* * Try to abort the last command as per section * 4.6.1.2 "Aborting a Command" of the XHCI * specification: */ /* stop and cancel */ XWRITE4(sc, oper, XHCI_CRCR_LO, temp | XHCI_CRCR_LO_CS); XWRITE4(sc, oper, XHCI_CRCR_HI, 0); XWRITE4(sc, oper, XHCI_CRCR_LO, temp | XHCI_CRCR_LO_CA); XWRITE4(sc, oper, XHCI_CRCR_HI, 0); /* wait 250ms */ usb_pause_mtx(&sc->sc_bus.bus_mtx, hz / 4); /* check if command ring is still running */ temp = XREAD4(sc, oper, XHCI_CRCR_LO); if (temp & XHCI_CRCR_LO_CRR) { DPRINTF("Comand ring still running\n"); return (USB_ERR_IOERROR); } } /* reset command ring */ sc->sc_command_ccs = 1; sc->sc_command_idx = 0; usbd_get_page(&sc->sc_hw.root_pc, 0, &buf_res); /* set up command ring control base address */ addr = buf_res.physaddr; phwr = buf_res.buffer; addr += (uintptr_t)&((struct xhci_hw_root *)0)->hwr_commands[0]; DPRINTF("CRCR=0x%016llx\n", (unsigned long long)addr); memset(phwr->hwr_commands, 0, sizeof(phwr->hwr_commands)); phwr->hwr_commands[XHCI_MAX_COMMANDS - 1].qwTrb0 = htole64(addr); usb_pc_cpu_flush(&sc->sc_hw.root_pc); XWRITE4(sc, oper, XHCI_CRCR_LO, ((uint32_t)addr) | XHCI_CRCR_LO_RCS); XWRITE4(sc, oper, XHCI_CRCR_HI, (uint32_t)(addr >> 32)); return (0); } usb_error_t xhci_start_controller(struct xhci_softc *sc) { struct usb_page_search buf_res; struct xhci_hw_root *phwr; struct xhci_dev_ctx_addr *pdctxa; usb_error_t err; uint64_t addr; uint32_t temp; uint16_t i; DPRINTF("\n"); sc->sc_event_ccs = 1; sc->sc_event_idx = 0; sc->sc_command_ccs = 1; sc->sc_command_idx = 0; err = xhci_reset_controller(sc); if (err) return (err); /* set up number of device slots */ DPRINTF("CONFIG=0x%08x -> 0x%08x\n", XREAD4(sc, oper, XHCI_CONFIG), sc->sc_noslot); XWRITE4(sc, oper, XHCI_CONFIG, sc->sc_noslot); temp = XREAD4(sc, oper, XHCI_USBSTS); /* clear interrupts */ XWRITE4(sc, oper, XHCI_USBSTS, temp); /* disable all device notifications */ XWRITE4(sc, oper, XHCI_DNCTRL, 0); /* set up device context base address */ usbd_get_page(&sc->sc_hw.ctx_pc, 0, &buf_res); pdctxa = buf_res.buffer; memset(pdctxa, 0, sizeof(*pdctxa)); addr = buf_res.physaddr; addr += (uintptr_t)&((struct xhci_dev_ctx_addr *)0)->qwSpBufPtr[0]; /* slot 0 points to the table of scratchpad pointers */ pdctxa->qwBaaDevCtxAddr[0] = htole64(addr); for (i = 0; i != sc->sc_noscratch; i++) { struct usb_page_search buf_scp; usbd_get_page(&sc->sc_hw.scratch_pc[i], 0, &buf_scp); pdctxa->qwSpBufPtr[i] = htole64((uint64_t)buf_scp.physaddr); } addr = buf_res.physaddr; XWRITE4(sc, oper, XHCI_DCBAAP_LO, (uint32_t)addr); XWRITE4(sc, oper, XHCI_DCBAAP_HI, (uint32_t)(addr >> 32)); XWRITE4(sc, oper, XHCI_DCBAAP_LO, (uint32_t)addr); XWRITE4(sc, oper, XHCI_DCBAAP_HI, (uint32_t)(addr >> 32)); /* set up event table size */ DPRINTF("ERSTSZ=0x%08x -> 0x%08x\n", XREAD4(sc, runt, XHCI_ERSTSZ(0)), sc->sc_erst_max); XWRITE4(sc, runt, XHCI_ERSTSZ(0), XHCI_ERSTS_SET(sc->sc_erst_max)); /* set up interrupt rate */ XWRITE4(sc, runt, XHCI_IMOD(0), sc->sc_imod_default); usbd_get_page(&sc->sc_hw.root_pc, 0, &buf_res); phwr = 
buf_res.buffer; addr = buf_res.physaddr; addr += (uintptr_t)&((struct xhci_hw_root *)0)->hwr_events[0]; /* reset hardware root structure */ memset(phwr, 0, sizeof(*phwr)); phwr->hwr_ring_seg[0].qwEvrsTablePtr = htole64(addr); phwr->hwr_ring_seg[0].dwEvrsTableSize = htole32(XHCI_MAX_EVENTS); DPRINTF("ERDP(0)=0x%016llx\n", (unsigned long long)addr); XWRITE4(sc, runt, XHCI_ERDP_LO(0), (uint32_t)addr); XWRITE4(sc, runt, XHCI_ERDP_HI(0), (uint32_t)(addr >> 32)); addr = buf_res.physaddr; DPRINTF("ERSTBA(0)=0x%016llx\n", (unsigned long long)addr); XWRITE4(sc, runt, XHCI_ERSTBA_LO(0), (uint32_t)addr); XWRITE4(sc, runt, XHCI_ERSTBA_HI(0), (uint32_t)(addr >> 32)); /* set up interrupter registers */ temp = XREAD4(sc, runt, XHCI_IMAN(0)); temp |= XHCI_IMAN_INTR_ENA; XWRITE4(sc, runt, XHCI_IMAN(0), temp); /* set up command ring control base address */ addr = buf_res.physaddr; addr += (uintptr_t)&((struct xhci_hw_root *)0)->hwr_commands[0]; DPRINTF("CRCR=0x%016llx\n", (unsigned long long)addr); XWRITE4(sc, oper, XHCI_CRCR_LO, ((uint32_t)addr) | XHCI_CRCR_LO_RCS); XWRITE4(sc, oper, XHCI_CRCR_HI, (uint32_t)(addr >> 32)); phwr->hwr_commands[XHCI_MAX_COMMANDS - 1].qwTrb0 = htole64(addr); usb_bus_mem_flush_all(&sc->sc_bus, &xhci_iterate_hw_softc); /* Go! */ XWRITE4(sc, oper, XHCI_USBCMD, XHCI_CMD_RS | XHCI_CMD_INTE | XHCI_CMD_HSEE); for (i = 0; i != 100; i++) { usb_pause_mtx(NULL, hz / 100); temp = XREAD4(sc, oper, XHCI_USBSTS) & XHCI_STS_HCH; if (!temp) break; } if (temp) { XWRITE4(sc, oper, XHCI_USBCMD, 0); device_printf(sc->sc_bus.parent, "Run timeout.\n"); return (USB_ERR_IOERROR); } /* catch any lost interrupts */ xhci_do_poll(&sc->sc_bus); if (sc->sc_port_route != NULL) { /* Route all ports to the XHCI by default */ sc->sc_port_route(sc->sc_bus.parent, ~xhciroute, xhciroute); } return (0); } usb_error_t xhci_halt_controller(struct xhci_softc *sc) { uint32_t temp; uint16_t i; DPRINTF("\n"); sc->sc_capa_off = 0; sc->sc_oper_off = XREAD1(sc, capa, XHCI_CAPLENGTH); sc->sc_runt_off = XREAD4(sc, capa, XHCI_RTSOFF) & ~0xF; sc->sc_door_off = XREAD4(sc, capa, XHCI_DBOFF) & ~0x3; /* Halt controller */ XWRITE4(sc, oper, XHCI_USBCMD, 0); for (i = 0; i != 100; i++) { usb_pause_mtx(NULL, hz / 100); temp = XREAD4(sc, oper, XHCI_USBSTS) & XHCI_STS_HCH; if (temp) break; } if (!temp) { device_printf(sc->sc_bus.parent, "Controller halt timeout.\n"); return (USB_ERR_IOERROR); } return (0); } usb_error_t xhci_reset_controller(struct xhci_softc *sc) { uint32_t temp = 0; uint16_t i; DPRINTF("\n"); /* Reset controller */ XWRITE4(sc, oper, XHCI_USBCMD, XHCI_CMD_HCRST); for (i = 0; i != 100; i++) { usb_pause_mtx(NULL, hz / 100); temp = (XREAD4(sc, oper, XHCI_USBCMD) & XHCI_CMD_HCRST) | (XREAD4(sc, oper, XHCI_USBSTS) & XHCI_STS_CNR); if (!temp) break; } if (temp) { device_printf(sc->sc_bus.parent, "Controller " "reset timeout.\n"); return (USB_ERR_IOERROR); } return (0); } usb_error_t xhci_init(struct xhci_softc *sc, device_t self, uint8_t dma32) { uint32_t temp; DPRINTF("\n"); /* initialize some bus fields */ sc->sc_bus.parent = self; /* set the bus revision */ sc->sc_bus.usbrev = USB_REV_3_0; /* set up the bus struct */ sc->sc_bus.methods = &xhci_bus_methods; /* set up devices array */ sc->sc_bus.devices = sc->sc_devices; sc->sc_bus.devices_max = XHCI_MAX_DEVICES; /* set default cycle state in case of early interrupts */ sc->sc_event_ccs = 1; sc->sc_command_ccs = 1; /* set up bus space offsets */ sc->sc_capa_off = 0; sc->sc_oper_off = XREAD1(sc, capa, XHCI_CAPLENGTH); sc->sc_runt_off = XREAD4(sc, capa, XHCI_RTSOFF) & ~0x1F; 
	sc->sc_door_off = XREAD4(sc, capa, XHCI_DBOFF) & ~0x3;

	DPRINTF("CAPLENGTH=0x%x\n", sc->sc_oper_off);
	DPRINTF("RUNTIMEOFFSET=0x%x\n", sc->sc_runt_off);
	DPRINTF("DOOROFFSET=0x%x\n", sc->sc_door_off);

	DPRINTF("xHCI version = 0x%04x\n", XREAD2(sc, capa, XHCI_HCIVERSION));

	if (!(XREAD4(sc, oper, XHCI_PAGESIZE) & XHCI_PAGESIZE_4K)) {
		device_printf(sc->sc_bus.parent, "Controller does "
		    "not support 4K page size.\n");
		return (ENXIO);
	}

	temp = XREAD4(sc, capa, XHCI_HCSPARAMS0);

	DPRINTF("HCS0 = 0x%08x\n", temp);

	/* set up context size */
	if (XHCI_HCS0_CSZ(temp)) {
		sc->sc_ctx_is_64_byte = 1;
	} else {
		sc->sc_ctx_is_64_byte = 0;
	}

	/* get DMA bits */
	sc->sc_bus.dma_bits = (XHCI_HCS0_AC64(temp) &&
	    xhcidma32 == 0 && dma32 == 0) ? 64 : 32;

	device_printf(self, "%d bytes context size, %d-bit DMA\n",
	    sc->sc_ctx_is_64_byte ? 64 : 32, (int)sc->sc_bus.dma_bits);

+	/* enable 64Kbyte control endpoint quirk */
+	sc->sc_bus.control_ep_quirk = 1;
+
	temp = XREAD4(sc, capa, XHCI_HCSPARAMS1);

	/* get number of device slots */
	sc->sc_noport = XHCI_HCS1_N_PORTS(temp);

	if (sc->sc_noport == 0) {
		device_printf(sc->sc_bus.parent, "Invalid number "
		    "of ports: %u\n", sc->sc_noport);
		return (ENXIO);
	}

	sc->sc_noport = sc->sc_noport;
	sc->sc_noslot = XHCI_HCS1_DEVSLOT_MAX(temp);

	DPRINTF("Max slots: %u\n", sc->sc_noslot);

	if (sc->sc_noslot > XHCI_MAX_DEVICES)
		sc->sc_noslot = XHCI_MAX_DEVICES;

	temp = XREAD4(sc, capa, XHCI_HCSPARAMS2);

	DPRINTF("HCS2=0x%08x\n", temp);

	/* get number of scratchpads */
	sc->sc_noscratch = XHCI_HCS2_SPB_MAX(temp);

	if (sc->sc_noscratch > XHCI_MAX_SCRATCHPADS) {
		device_printf(sc->sc_bus.parent, "XHCI request "
		    "too many scratchpads\n");
		return (ENOMEM);
	}

	DPRINTF("Max scratch: %u\n", sc->sc_noscratch);

	/* get event table size */
	sc->sc_erst_max = 1U << XHCI_HCS2_ERST_MAX(temp);
	if (sc->sc_erst_max > XHCI_MAX_RSEG)
		sc->sc_erst_max = XHCI_MAX_RSEG;

	temp = XREAD4(sc, capa, XHCI_HCSPARAMS3);

	/* get maximum exit latency */
	sc->sc_exit_lat_max = XHCI_HCS3_U1_DEL(temp) +
	    XHCI_HCS3_U2_DEL(temp) + 250 /* us */;

	/* Check if we should use the default IMOD value. */
	if (sc->sc_imod_default == 0)
		sc->sc_imod_default = XHCI_IMOD_DEFAULT;

	/* get all DMA memory */
	if (usb_bus_mem_alloc_all(&sc->sc_bus, USB_GET_DMA_TAG(self),
	    &xhci_iterate_hw_softc)) {
		return (ENOMEM);
	}

	/* set up command queue mutex and condition variable */
	cv_init(&sc->sc_cmd_cv, "CMDQ");
	sx_init(&sc->sc_cmd_sx, "CMDQ lock");

	sc->sc_config_msg[0].hdr.pm_callback = &xhci_configure_msg;
	sc->sc_config_msg[0].bus = &sc->sc_bus;
	sc->sc_config_msg[1].hdr.pm_callback = &xhci_configure_msg;
	sc->sc_config_msg[1].bus = &sc->sc_bus;

	return (0);
}

void
xhci_uninit(struct xhci_softc *sc)
{
	/*
	 * NOTE: At this point the control transfer process is gone
	 * and "xhci_configure_msg" is no longer called. Consequently
	 * waiting for the configuration messages to complete is not
	 * needed.
*/ usb_bus_mem_free_all(&sc->sc_bus, &xhci_iterate_hw_softc); cv_destroy(&sc->sc_cmd_cv); sx_destroy(&sc->sc_cmd_sx); } static void xhci_set_hw_power_sleep(struct usb_bus *bus, uint32_t state) { struct xhci_softc *sc = XHCI_BUS2SC(bus); switch (state) { case USB_HW_POWER_SUSPEND: DPRINTF("Stopping the XHCI\n"); xhci_halt_controller(sc); xhci_reset_controller(sc); break; case USB_HW_POWER_SHUTDOWN: DPRINTF("Stopping the XHCI\n"); xhci_halt_controller(sc); xhci_reset_controller(sc); break; case USB_HW_POWER_RESUME: DPRINTF("Starting the XHCI\n"); xhci_start_controller(sc); break; default: break; } } static usb_error_t xhci_generic_done_sub(struct usb_xfer *xfer) { struct xhci_td *td; struct xhci_td *td_alt_next; uint32_t len; uint8_t status; td = xfer->td_transfer_cache; td_alt_next = td->alt_next; if (xfer->aframes != xfer->nframes) usbd_xfer_set_frame_len(xfer, xfer->aframes, 0); while (1) { usb_pc_cpu_invalidate(td->page_cache); status = td->status; len = td->remainder; DPRINTFN(4, "xfer=%p[%u/%u] rem=%u/%u status=%u\n", xfer, (unsigned int)xfer->aframes, (unsigned int)xfer->nframes, (unsigned int)len, (unsigned int)td->len, (unsigned int)status); /* * Verify the status length and * add the length to "frlengths[]": */ if (len > td->len) { /* should not happen */ DPRINTF("Invalid status length, " "0x%04x/0x%04x bytes\n", len, td->len); status = XHCI_TRB_ERROR_LENGTH; } else if (xfer->aframes != xfer->nframes) { xfer->frlengths[xfer->aframes] += td->len - len; } /* Check for last transfer */ if (((void *)td) == xfer->td_transfer_last) { td = NULL; break; } /* Check for transfer error */ if (status != XHCI_TRB_ERROR_SHORT_PKT && status != XHCI_TRB_ERROR_SUCCESS) { /* the transfer is finished */ td = NULL; break; } /* Check for short transfer */ if (len > 0) { if (xfer->flags_int.short_frames_ok || xfer->flags_int.isochronous_xfr || xfer->flags_int.control_xfr) { /* follow alt next */ td = td->alt_next; } else { /* the transfer is finished */ td = NULL; } break; } td = td->obj_next; if (td->alt_next != td_alt_next) { /* this USB frame is complete */ break; } } /* update transfer cache */ xfer->td_transfer_cache = td; return ((status == XHCI_TRB_ERROR_STALL) ? USB_ERR_STALLED : (status != XHCI_TRB_ERROR_SHORT_PKT && status != XHCI_TRB_ERROR_SUCCESS) ? 
USB_ERR_IOERROR : USB_ERR_NORMAL_COMPLETION); } static void xhci_generic_done(struct usb_xfer *xfer) { usb_error_t err = 0; DPRINTFN(13, "xfer=%p endpoint=%p transfer done\n", xfer, xfer->endpoint); /* reset scanner */ xfer->td_transfer_cache = xfer->td_transfer_first; if (xfer->flags_int.control_xfr) { if (xfer->flags_int.control_hdr) err = xhci_generic_done_sub(xfer); xfer->aframes = 1; if (xfer->td_transfer_cache == NULL) goto done; } while (xfer->aframes != xfer->nframes) { err = xhci_generic_done_sub(xfer); xfer->aframes++; if (xfer->td_transfer_cache == NULL) goto done; } if (xfer->flags_int.control_xfr && !xfer->flags_int.control_act) err = xhci_generic_done_sub(xfer); done: /* transfer is complete */ xhci_device_done(xfer, err); } static void xhci_activate_transfer(struct usb_xfer *xfer) { struct xhci_td *td; td = xfer->td_transfer_cache; usb_pc_cpu_invalidate(td->page_cache); if (!(td->td_trb[0].dwTrb3 & htole32(XHCI_TRB_3_CYCLE_BIT))) { /* activate the transfer */ td->td_trb[0].dwTrb3 |= htole32(XHCI_TRB_3_CYCLE_BIT); usb_pc_cpu_flush(td->page_cache); xhci_endpoint_doorbell(xfer); } } static void xhci_skip_transfer(struct usb_xfer *xfer) { struct xhci_td *td; struct xhci_td *td_last; td = xfer->td_transfer_cache; td_last = xfer->td_transfer_last; td = td->alt_next; usb_pc_cpu_invalidate(td->page_cache); if (!(td->td_trb[0].dwTrb3 & htole32(XHCI_TRB_3_CYCLE_BIT))) { usb_pc_cpu_invalidate(td_last->page_cache); /* copy LINK TRB to current waiting location */ td->td_trb[0].qwTrb0 = td_last->td_trb[td_last->ntrb].qwTrb0; td->td_trb[0].dwTrb2 = td_last->td_trb[td_last->ntrb].dwTrb2; usb_pc_cpu_flush(td->page_cache); td->td_trb[0].dwTrb3 = td_last->td_trb[td_last->ntrb].dwTrb3; usb_pc_cpu_flush(td->page_cache); xhci_endpoint_doorbell(xfer); } } /*------------------------------------------------------------------------* * xhci_check_transfer *------------------------------------------------------------------------*/ static void xhci_check_transfer(struct xhci_softc *sc, struct xhci_trb *trb) { struct xhci_endpoint_ext *pepext; int64_t offset; uint64_t td_event; uint32_t temp; uint32_t remainder; uint16_t stream_id = 0; uint16_t i; uint8_t status; uint8_t halted; uint8_t epno; uint8_t index; /* decode TRB */ td_event = le64toh(trb->qwTrb0); temp = le32toh(trb->dwTrb2); remainder = XHCI_TRB_2_REM_GET(temp); status = XHCI_TRB_2_ERROR_GET(temp); temp = le32toh(trb->dwTrb3); epno = XHCI_TRB_3_EP_GET(temp); index = XHCI_TRB_3_SLOT_GET(temp); /* check if error means halted */ halted = (status != XHCI_TRB_ERROR_SHORT_PKT && status != XHCI_TRB_ERROR_SUCCESS); DPRINTF("slot=%u epno=%u remainder=%u status=%u\n", index, epno, remainder, status); if (index > sc->sc_noslot) { DPRINTF("Invalid slot.\n"); return; } if ((epno == 0) || (epno >= XHCI_MAX_ENDPOINTS)) { DPRINTF("Invalid endpoint.\n"); return; } pepext = &sc->sc_hw.devs[index].endp[epno]; /* try to find the USB transfer that generated the event */ for (i = 0;; i++) { struct usb_xfer *xfer; struct xhci_td *td; if (i == (XHCI_MAX_TRANSFERS - 1)) { if (pepext->trb_ep_mode != USB_EP_MODE_STREAMS || stream_id == (XHCI_MAX_STREAMS - 1)) break; stream_id++; i = 0; DPRINTFN(5, "stream_id=%u\n", stream_id); } xfer = pepext->xfer[i + (XHCI_MAX_TRANSFERS * stream_id)]; if (xfer == NULL) continue; td = xfer->td_transfer_cache; DPRINTFN(5, "Checking if 0x%016llx == (0x%016llx .. 
0x%016llx)\n", (long long)td_event, (long long)td->td_self, (long long)td->td_self + sizeof(td->td_trb)); /* * NOTE: Some XHCI implementations might not trigger * an event on the last LINK TRB so we need to * consider both the last and second last event * address as conditions for a successful transfer. * * NOTE: We assume that the XHCI will only trigger one * event per chain of TRBs. */ offset = td_event - td->td_self; if (offset >= 0 && offset < (int64_t)sizeof(td->td_trb)) { usb_pc_cpu_invalidate(td->page_cache); /* compute rest of remainder, if any */ for (i = (offset / 16) + 1; i < td->ntrb; i++) { temp = le32toh(td->td_trb[i].dwTrb2); remainder += XHCI_TRB_2_BYTES_GET(temp); } DPRINTFN(5, "New remainder: %u\n", remainder); /* clear isochronous transfer errors */ if (xfer->flags_int.isochronous_xfr) { if (halted) { halted = 0; status = XHCI_TRB_ERROR_SUCCESS; remainder = td->len; } } /* "td->remainder" is verified later */ td->remainder = remainder; td->status = status; usb_pc_cpu_flush(td->page_cache); /* * 1) Last transfer descriptor makes the * transfer done */ if (((void *)td) == xfer->td_transfer_last) { DPRINTF("TD is last\n"); xhci_generic_done(xfer); break; } /* * 2) Any kind of error makes the transfer * done */ if (halted) { DPRINTF("TD has I/O error\n"); xhci_generic_done(xfer); break; } /* * 3) If there is no alternate next transfer, * a short packet also makes the transfer done */ if (td->remainder > 0) { if (td->alt_next == NULL) { DPRINTF( "short TD has no alternate next\n"); xhci_generic_done(xfer); break; } DPRINTF("TD has short pkt\n"); if (xfer->flags_int.short_frames_ok || xfer->flags_int.isochronous_xfr || xfer->flags_int.control_xfr) { /* follow the alt next */ xfer->td_transfer_cache = td->alt_next; xhci_activate_transfer(xfer); break; } xhci_skip_transfer(xfer); xhci_generic_done(xfer); break; } /* * 4) Transfer complete - go to next TD */ DPRINTF("Following next TD\n"); xfer->td_transfer_cache = td->obj_next; xhci_activate_transfer(xfer); break; /* there should only be one match */ } } } static int xhci_check_command(struct xhci_softc *sc, struct xhci_trb *trb) { if (sc->sc_cmd_addr == trb->qwTrb0) { DPRINTF("Received command event\n"); sc->sc_cmd_result[0] = trb->dwTrb2; sc->sc_cmd_result[1] = trb->dwTrb3; cv_signal(&sc->sc_cmd_cv); return (1); /* command match */ } return (0); } static int xhci_interrupt_poll(struct xhci_softc *sc) { struct usb_page_search buf_res; struct xhci_hw_root *phwr; uint64_t addr; uint32_t temp; int retval = 0; uint16_t i; uint8_t event; uint8_t j; uint8_t k; uint8_t t; usbd_get_page(&sc->sc_hw.root_pc, 0, &buf_res); phwr = buf_res.buffer; /* Receive any events */ usb_pc_cpu_invalidate(&sc->sc_hw.root_pc); i = sc->sc_event_idx; j = sc->sc_event_ccs; t = 2; while (1) { temp = le32toh(phwr->hwr_events[i].dwTrb3); k = (temp & XHCI_TRB_3_CYCLE_BIT) ? 
1 : 0; if (j != k) break; event = XHCI_TRB_3_TYPE_GET(temp); DPRINTFN(10, "event[%u] = %u (0x%016llx 0x%08lx 0x%08lx)\n", i, event, (long long)le64toh(phwr->hwr_events[i].qwTrb0), (long)le32toh(phwr->hwr_events[i].dwTrb2), (long)le32toh(phwr->hwr_events[i].dwTrb3)); switch (event) { case XHCI_TRB_EVENT_TRANSFER: xhci_check_transfer(sc, &phwr->hwr_events[i]); break; case XHCI_TRB_EVENT_CMD_COMPLETE: retval |= xhci_check_command(sc, &phwr->hwr_events[i]); break; default: DPRINTF("Unhandled event = %u\n", event); break; } i++; if (i == XHCI_MAX_EVENTS) { i = 0; j ^= 1; /* check for timeout */ if (!--t) break; } } sc->sc_event_idx = i; sc->sc_event_ccs = j; /* * NOTE: The Event Ring Dequeue Pointer Register is 64-bit * latched. That means to activate the register we need to * write both the low and high double word of the 64-bit * register. */ addr = buf_res.physaddr; addr += (uintptr_t)&((struct xhci_hw_root *)0)->hwr_events[i]; /* try to clear busy bit */ addr |= XHCI_ERDP_LO_BUSY; XWRITE4(sc, runt, XHCI_ERDP_LO(0), (uint32_t)addr); XWRITE4(sc, runt, XHCI_ERDP_HI(0), (uint32_t)(addr >> 32)); return (retval); } static usb_error_t xhci_do_command(struct xhci_softc *sc, struct xhci_trb *trb, uint16_t timeout_ms) { struct usb_page_search buf_res; struct xhci_hw_root *phwr; uint64_t addr; uint32_t temp; uint8_t i; uint8_t j; uint8_t timeout = 0; int err; XHCI_CMD_ASSERT_LOCKED(sc); /* get hardware root structure */ usbd_get_page(&sc->sc_hw.root_pc, 0, &buf_res); phwr = buf_res.buffer; /* Queue command */ USB_BUS_LOCK(&sc->sc_bus); retry: i = sc->sc_command_idx; j = sc->sc_command_ccs; DPRINTFN(10, "command[%u] = %u (0x%016llx, 0x%08lx, 0x%08lx)\n", i, XHCI_TRB_3_TYPE_GET(le32toh(trb->dwTrb3)), (long long)le64toh(trb->qwTrb0), (long)le32toh(trb->dwTrb2), (long)le32toh(trb->dwTrb3)); phwr->hwr_commands[i].qwTrb0 = trb->qwTrb0; phwr->hwr_commands[i].dwTrb2 = trb->dwTrb2; usb_pc_cpu_flush(&sc->sc_hw.root_pc); temp = trb->dwTrb3; if (j) temp |= htole32(XHCI_TRB_3_CYCLE_BIT); else temp &= ~htole32(XHCI_TRB_3_CYCLE_BIT); temp &= ~htole32(XHCI_TRB_3_TC_BIT); phwr->hwr_commands[i].dwTrb3 = temp; usb_pc_cpu_flush(&sc->sc_hw.root_pc); addr = buf_res.physaddr; addr += (uintptr_t)&((struct xhci_hw_root *)0)->hwr_commands[i]; sc->sc_cmd_addr = htole64(addr); i++; if (i == (XHCI_MAX_COMMANDS - 1)) { if (j) { temp = htole32(XHCI_TRB_3_TC_BIT | XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_LINK) | XHCI_TRB_3_CYCLE_BIT); } else { temp = htole32(XHCI_TRB_3_TC_BIT | XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_LINK)); } phwr->hwr_commands[i].dwTrb3 = temp; usb_pc_cpu_flush(&sc->sc_hw.root_pc); i = 0; j ^= 1; } sc->sc_command_idx = i; sc->sc_command_ccs = j; XWRITE4(sc, door, XHCI_DOORBELL(0), 0); err = cv_timedwait(&sc->sc_cmd_cv, &sc->sc_bus.bus_mtx, USB_MS_TO_TICKS(timeout_ms)); /* * In some error cases event interrupts are not generated. * Poll one time to see if the command has completed. */ if (err != 0 && xhci_interrupt_poll(sc) != 0) { DPRINTF("Command was completed when polling\n"); err = 0; } if (err != 0) { DPRINTF("Command timeout!\n"); /* * After some weeks of continuous operation, it has * been observed that the ASMedia Technology, ASM1042 * SuperSpeed USB Host Controller can suddenly stop * accepting commands via the command queue. Try to * first reset the command queue. If that fails do a * host controller reset. 
*/ if (timeout == 0 && xhci_reset_command_queue_locked(sc) == 0) { temp = le32toh(trb->dwTrb3); /* * Avoid infinite XHCI reset loops if the set * address command fails to respond due to a * non-enumerating device: */ if (XHCI_TRB_3_TYPE_GET(temp) == XHCI_TRB_TYPE_ADDRESS_DEVICE && (temp & XHCI_TRB_3_BSR_BIT) == 0) { DPRINTF("Set address timeout\n"); } else { timeout = 1; goto retry; } } else { DPRINTF("Controller reset!\n"); usb_bus_reset_async_locked(&sc->sc_bus); } err = USB_ERR_TIMEOUT; trb->dwTrb2 = 0; trb->dwTrb3 = 0; } else { temp = le32toh(sc->sc_cmd_result[0]); if (XHCI_TRB_2_ERROR_GET(temp) != XHCI_TRB_ERROR_SUCCESS) err = USB_ERR_IOERROR; trb->dwTrb2 = sc->sc_cmd_result[0]; trb->dwTrb3 = sc->sc_cmd_result[1]; } USB_BUS_UNLOCK(&sc->sc_bus); return (err); } #if 0 static usb_error_t xhci_cmd_nop(struct xhci_softc *sc) { struct xhci_trb trb; uint32_t temp; DPRINTF("\n"); trb.qwTrb0 = 0; trb.dwTrb2 = 0; temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_NOOP); trb.dwTrb3 = htole32(temp); return (xhci_do_command(sc, &trb, 100 /* ms */)); } #endif static usb_error_t xhci_cmd_enable_slot(struct xhci_softc *sc, uint8_t *pslot) { struct xhci_trb trb; uint32_t temp; usb_error_t err; DPRINTF("\n"); trb.qwTrb0 = 0; trb.dwTrb2 = 0; trb.dwTrb3 = htole32(XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_ENABLE_SLOT)); err = xhci_do_command(sc, &trb, 100 /* ms */); if (err) goto done; temp = le32toh(trb.dwTrb3); *pslot = XHCI_TRB_3_SLOT_GET(temp); done: return (err); } static usb_error_t xhci_cmd_disable_slot(struct xhci_softc *sc, uint8_t slot_id) { struct xhci_trb trb; uint32_t temp; DPRINTF("\n"); trb.qwTrb0 = 0; trb.dwTrb2 = 0; temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_DISABLE_SLOT) | XHCI_TRB_3_SLOT_SET(slot_id); trb.dwTrb3 = htole32(temp); return (xhci_do_command(sc, &trb, 100 /* ms */)); } static usb_error_t xhci_cmd_set_address(struct xhci_softc *sc, uint64_t input_ctx, uint8_t bsr, uint8_t slot_id) { struct xhci_trb trb; uint32_t temp; DPRINTF("\n"); trb.qwTrb0 = htole64(input_ctx); trb.dwTrb2 = 0; temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_ADDRESS_DEVICE) | XHCI_TRB_3_SLOT_SET(slot_id); if (bsr) temp |= XHCI_TRB_3_BSR_BIT; trb.dwTrb3 = htole32(temp); return (xhci_do_command(sc, &trb, 500 /* ms */)); } static usb_error_t xhci_set_address(struct usb_device *udev, struct mtx *mtx, uint16_t address) { struct usb_page_search buf_inp; struct usb_page_search buf_dev; struct xhci_softc *sc = XHCI_BUS2SC(udev->bus); struct xhci_hw_dev *hdev; struct xhci_dev_ctx *pdev; struct xhci_endpoint_ext *pepext; uint32_t temp; uint16_t mps; usb_error_t err; uint8_t index; /* the root HUB case is not handled here */ if (udev->parent_hub == NULL) return (USB_ERR_INVAL); index = udev->controller_slot_id; hdev = &sc->sc_hw.devs[index]; if (mtx != NULL) mtx_unlock(mtx); XHCI_CMD_LOCK(sc); switch (hdev->state) { case XHCI_ST_DEFAULT: case XHCI_ST_ENABLED: hdev->state = XHCI_ST_ENABLED; /* set configure mask to slot and EP0 */ xhci_configure_mask(udev, 3, 0); /* configure input slot context structure */ err = xhci_configure_device(udev); if (err != 0) { DPRINTF("Could not configure device\n"); break; } /* configure input endpoint context structure */ switch (udev->speed) { case USB_SPEED_LOW: case USB_SPEED_FULL: mps = 8; break; case USB_SPEED_HIGH: mps = 64; break; default: mps = 512; break; } pepext = xhci_get_endpoint_ext(udev, &udev->ctrl_ep_desc); /* ensure the control endpoint is setup again */ USB_BUS_LOCK(udev->bus); pepext->trb_halted = 1; pepext->trb_running = 0; USB_BUS_UNLOCK(udev->bus); err = xhci_configure_endpoint(udev, 
&udev->ctrl_ep_desc, pepext, 0, 1, 1, 0, mps, mps, USB_EP_MODE_DEFAULT); if (err != 0) { DPRINTF("Could not configure default endpoint\n"); break; } /* execute set address command */ usbd_get_page(&hdev->input_pc, 0, &buf_inp); err = xhci_cmd_set_address(sc, buf_inp.physaddr, (address == 0), index); if (err != 0) { temp = le32toh(sc->sc_cmd_result[0]); if (address == 0 && sc->sc_port_route != NULL && XHCI_TRB_2_ERROR_GET(temp) == XHCI_TRB_ERROR_PARAMETER) { /* LynxPoint XHCI - ports are not switchable */ /* Un-route all ports from the XHCI */ sc->sc_port_route(sc->sc_bus.parent, 0, ~0); } DPRINTF("Could not set address " "for slot %u.\n", index); if (address != 0) break; } /* update device address to new value */ usbd_get_page(&hdev->device_pc, 0, &buf_dev); pdev = buf_dev.buffer; usb_pc_cpu_invalidate(&hdev->device_pc); temp = xhci_ctx_get_le32(sc, &pdev->ctx_slot.dwSctx3); udev->address = XHCI_SCTX_3_DEV_ADDR_GET(temp); /* update device state to new value */ if (address != 0) hdev->state = XHCI_ST_ADDRESSED; else hdev->state = XHCI_ST_DEFAULT; break; default: DPRINTF("Wrong state for set address.\n"); err = USB_ERR_IOERROR; break; } XHCI_CMD_UNLOCK(sc); if (mtx != NULL) mtx_lock(mtx); return (err); } static usb_error_t xhci_cmd_configure_ep(struct xhci_softc *sc, uint64_t input_ctx, uint8_t deconfigure, uint8_t slot_id) { struct xhci_trb trb; uint32_t temp; DPRINTF("\n"); trb.qwTrb0 = htole64(input_ctx); trb.dwTrb2 = 0; temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_CONFIGURE_EP) | XHCI_TRB_3_SLOT_SET(slot_id); if (deconfigure) temp |= XHCI_TRB_3_DCEP_BIT; trb.dwTrb3 = htole32(temp); return (xhci_do_command(sc, &trb, 100 /* ms */)); } static usb_error_t xhci_cmd_evaluate_ctx(struct xhci_softc *sc, uint64_t input_ctx, uint8_t slot_id) { struct xhci_trb trb; uint32_t temp; DPRINTF("\n"); trb.qwTrb0 = htole64(input_ctx); trb.dwTrb2 = 0; temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_EVALUATE_CTX) | XHCI_TRB_3_SLOT_SET(slot_id); trb.dwTrb3 = htole32(temp); return (xhci_do_command(sc, &trb, 100 /* ms */)); } static usb_error_t xhci_cmd_reset_ep(struct xhci_softc *sc, uint8_t preserve, uint8_t ep_id, uint8_t slot_id) { struct xhci_trb trb; uint32_t temp; DPRINTF("\n"); trb.qwTrb0 = 0; trb.dwTrb2 = 0; temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_RESET_EP) | XHCI_TRB_3_SLOT_SET(slot_id) | XHCI_TRB_3_EP_SET(ep_id); if (preserve) temp |= XHCI_TRB_3_PRSV_BIT; trb.dwTrb3 = htole32(temp); return (xhci_do_command(sc, &trb, 100 /* ms */)); } static usb_error_t xhci_cmd_set_tr_dequeue_ptr(struct xhci_softc *sc, uint64_t dequeue_ptr, uint16_t stream_id, uint8_t ep_id, uint8_t slot_id) { struct xhci_trb trb; uint32_t temp; DPRINTF("\n"); trb.qwTrb0 = htole64(dequeue_ptr); temp = XHCI_TRB_2_STREAM_SET(stream_id); trb.dwTrb2 = htole32(temp); temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_SET_TR_DEQUEUE) | XHCI_TRB_3_SLOT_SET(slot_id) | XHCI_TRB_3_EP_SET(ep_id); trb.dwTrb3 = htole32(temp); return (xhci_do_command(sc, &trb, 100 /* ms */)); } static usb_error_t xhci_cmd_stop_ep(struct xhci_softc *sc, uint8_t suspend, uint8_t ep_id, uint8_t slot_id) { struct xhci_trb trb; uint32_t temp; DPRINTF("\n"); trb.qwTrb0 = 0; trb.dwTrb2 = 0; temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_STOP_EP) | XHCI_TRB_3_SLOT_SET(slot_id) | XHCI_TRB_3_EP_SET(ep_id); if (suspend) temp |= XHCI_TRB_3_SUSP_EP_BIT; trb.dwTrb3 = htole32(temp); return (xhci_do_command(sc, &trb, 100 /* ms */)); } static usb_error_t xhci_cmd_reset_dev(struct xhci_softc *sc, uint8_t slot_id) { struct xhci_trb trb; uint32_t temp; DPRINTF("\n"); trb.qwTrb0 = 0; trb.dwTrb2 = 0; temp = 
XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_RESET_DEVICE) | XHCI_TRB_3_SLOT_SET(slot_id); trb.dwTrb3 = htole32(temp); return (xhci_do_command(sc, &trb, 100 /* ms */)); } /*------------------------------------------------------------------------* * xhci_interrupt - XHCI interrupt handler *------------------------------------------------------------------------*/ void xhci_interrupt(struct xhci_softc *sc) { uint32_t status; uint32_t temp; USB_BUS_LOCK(&sc->sc_bus); status = XREAD4(sc, oper, XHCI_USBSTS); /* acknowledge interrupts, if any */ if (status != 0) { XWRITE4(sc, oper, XHCI_USBSTS, status); DPRINTFN(16, "real interrupt (status=0x%08x)\n", status); } temp = XREAD4(sc, runt, XHCI_IMAN(0)); /* force clearing of pending interrupts */ if (temp & XHCI_IMAN_INTR_PEND) XWRITE4(sc, runt, XHCI_IMAN(0), temp); /* check for event(s) */ xhci_interrupt_poll(sc); if (status & (XHCI_STS_PCD | XHCI_STS_HCH | XHCI_STS_HSE | XHCI_STS_HCE)) { if (status & XHCI_STS_PCD) { xhci_root_intr(sc); } if (status & XHCI_STS_HCH) { printf("%s: host controller halted\n", __FUNCTION__); } if (status & XHCI_STS_HSE) { printf("%s: host system error\n", __FUNCTION__); } if (status & XHCI_STS_HCE) { printf("%s: host controller error\n", __FUNCTION__); } } USB_BUS_UNLOCK(&sc->sc_bus); } /*------------------------------------------------------------------------* * xhci_timeout - XHCI timeout handler *------------------------------------------------------------------------*/ static void xhci_timeout(void *arg) { struct usb_xfer *xfer = arg; DPRINTF("xfer=%p\n", xfer); USB_BUS_LOCK_ASSERT(xfer->xroot->bus, MA_OWNED); /* transfer is transferred */ xhci_device_done(xfer, USB_ERR_TIMEOUT); } static void xhci_do_poll(struct usb_bus *bus) { struct xhci_softc *sc = XHCI_BUS2SC(bus); USB_BUS_LOCK(&sc->sc_bus); xhci_interrupt_poll(sc); USB_BUS_UNLOCK(&sc->sc_bus); } static void xhci_setup_generic_chain_sub(struct xhci_std_temp *temp) { struct usb_page_search buf_res; struct xhci_td *td; struct xhci_td *td_next; struct xhci_td *td_alt_next; struct xhci_td *td_first; uint32_t buf_offset; uint32_t average; uint32_t len_old; uint32_t npkt_off; uint32_t dword; uint8_t shortpkt_old; uint8_t precompute; uint8_t x; td_alt_next = NULL; buf_offset = 0; shortpkt_old = temp->shortpkt; len_old = temp->len; npkt_off = 0; precompute = 1; restart: td = temp->td; td_next = td_first = temp->td_next; while (1) { if (temp->len == 0) { if (temp->shortpkt) break; /* send a Zero Length Packet, ZLP, last */ temp->shortpkt = 1; average = 0; } else { average = temp->average; if (temp->len < average) { if (temp->len % temp->max_packet_size) { temp->shortpkt = 1; } average = temp->len; } } if (td_next == NULL) panic("%s: out of XHCI transfer descriptors!", __FUNCTION__); /* get next TD */ td = td_next; td_next = td->obj_next; /* check if we are pre-computing */ if (precompute) { /* update remaining length */ temp->len -= average; continue; } /* fill out current TD */ td->len = average; td->remainder = 0; td->status = 0; /* update remaining length */ temp->len -= average; /* reset TRB index */ x = 0; if (temp->trb_type == XHCI_TRB_TYPE_SETUP_STAGE) { /* immediate data */ if (average > 8) average = 8; td->td_trb[0].qwTrb0 = 0; usbd_copy_out(temp->pc, temp->offset + buf_offset, (uint8_t *)(uintptr_t)&td->td_trb[0].qwTrb0, average); dword = XHCI_TRB_2_BYTES_SET(8) | XHCI_TRB_2_TDSZ_SET(0) | XHCI_TRB_2_IRQ_SET(0); td->td_trb[0].dwTrb2 = htole32(dword); dword = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_SETUP_STAGE) | XHCI_TRB_3_IDT_BIT | XHCI_TRB_3_CYCLE_BIT; /* check wLength */ if 
(td->td_trb[0].qwTrb0 & htole64(XHCI_TRB_0_WLENGTH_MASK)) { if (td->td_trb[0].qwTrb0 & htole64(XHCI_TRB_0_DIR_IN_MASK)) dword |= XHCI_TRB_3_TRT_IN; else dword |= XHCI_TRB_3_TRT_OUT; } td->td_trb[0].dwTrb3 = htole32(dword); #ifdef USB_DEBUG xhci_dump_trb(&td->td_trb[x]); #endif x++; } else do { uint32_t npkt; /* fill out buffer pointers */ if (average == 0) { memset(&buf_res, 0, sizeof(buf_res)); } else { usbd_get_page(temp->pc, temp->offset + buf_offset, &buf_res); /* get length to end of page */ if (buf_res.length > average) buf_res.length = average; /* check for maximum length */ if (buf_res.length > XHCI_TD_PAGE_SIZE) buf_res.length = XHCI_TD_PAGE_SIZE; npkt_off += buf_res.length; } /* set up npkt */ npkt = howmany(len_old - npkt_off, temp->max_packet_size); if (npkt == 0) npkt = 1; else if (npkt > 31) npkt = 31; /* fill out TRB's */ td->td_trb[x].qwTrb0 = htole64((uint64_t)buf_res.physaddr); dword = XHCI_TRB_2_BYTES_SET(buf_res.length) | XHCI_TRB_2_TDSZ_SET(npkt) | XHCI_TRB_2_IRQ_SET(0); td->td_trb[x].dwTrb2 = htole32(dword); switch (temp->trb_type) { case XHCI_TRB_TYPE_ISOCH: dword = XHCI_TRB_3_CHAIN_BIT | XHCI_TRB_3_CYCLE_BIT | XHCI_TRB_3_TBC_SET(temp->tbc) | XHCI_TRB_3_TLBPC_SET(temp->tlbpc); if (td != td_first) { dword |= XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_NORMAL); } else if (temp->do_isoc_sync != 0) { temp->do_isoc_sync = 0; /* wait until "isoc_frame" */ dword |= XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_ISOCH) | XHCI_TRB_3_FRID_SET(temp->isoc_frame / 8); } else { /* start data transfer at next interval */ dword |= XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_ISOCH) | XHCI_TRB_3_ISO_SIA_BIT; } if (temp->direction == UE_DIR_IN) dword |= XHCI_TRB_3_ISP_BIT; break; case XHCI_TRB_TYPE_DATA_STAGE: dword = XHCI_TRB_3_CHAIN_BIT | XHCI_TRB_3_CYCLE_BIT | XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_DATA_STAGE); if (temp->direction == UE_DIR_IN) dword |= XHCI_TRB_3_DIR_IN | XHCI_TRB_3_ISP_BIT; /* * Section 3.2.9 in the XHCI * specification about control * transfers says that we should use a * normal-TRB if there are more TRBs * extending the data-stage * TRB. Update the "trb_type". */ temp->trb_type = XHCI_TRB_TYPE_NORMAL; break; case XHCI_TRB_TYPE_STATUS_STAGE: dword = XHCI_TRB_3_CHAIN_BIT | XHCI_TRB_3_CYCLE_BIT | XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_STATUS_STAGE); if (temp->direction == UE_DIR_IN) dword |= XHCI_TRB_3_DIR_IN; break; default: /* XHCI_TRB_TYPE_NORMAL */ dword = XHCI_TRB_3_CHAIN_BIT | XHCI_TRB_3_CYCLE_BIT | XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_NORMAL); if (temp->direction == UE_DIR_IN) dword |= XHCI_TRB_3_ISP_BIT; break; } td->td_trb[x].dwTrb3 = htole32(dword); average -= buf_res.length; buf_offset += buf_res.length; #ifdef USB_DEBUG xhci_dump_trb(&td->td_trb[x]); #endif x++; } while (average != 0); td->td_trb[x-1].dwTrb3 |= htole32(XHCI_TRB_3_IOC_BIT); /* store number of data TRB's */ td->ntrb = x; DPRINTF("NTRB=%u\n", x); /* fill out link TRB */ if (td_next != NULL) { /* link the current TD with the next one */ td->td_trb[x].qwTrb0 = htole64((uint64_t)td_next->td_self); DPRINTF("LINK=0x%08llx\n", (long long)td_next->td_self); } else { /* this field will get updated later */ DPRINTF("NOLINK\n"); } dword = XHCI_TRB_2_IRQ_SET(0); td->td_trb[x].dwTrb2 = htole32(dword); dword = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_LINK) | XHCI_TRB_3_CYCLE_BIT | XHCI_TRB_3_IOC_BIT | /* * CHAIN-BIT: Ensure that a multi-TRB IN-endpoint * frame only receives a single short packet event * by setting the CHAIN bit in the LINK field. 
		     * In addition some XHCI controllers have problems
		     * sending a ZLP unless the CHAIN-BIT is set in
		     * the LINK TRB.
		     */
		    XHCI_TRB_3_CHAIN_BIT;

		td->td_trb[x].dwTrb3 = htole32(dword);

		td->alt_next = td_alt_next;
#ifdef USB_DEBUG
		xhci_dump_trb(&td->td_trb[x]);
#endif
		usb_pc_cpu_flush(td->page_cache);
	}

	if (precompute) {
		precompute = 0;

		/* set up alt next pointer, if any */
		if (temp->last_frame) {
			td_alt_next = NULL;
		} else {
			/* we use this field internally */
			td_alt_next = td_next;
		}

		/* restore */
		temp->shortpkt = shortpkt_old;
		temp->len = len_old;
		goto restart;
	}

	/*
	 * Remove cycle bit from the first TRB if we are
	 * stepping them:
	 */
	if (temp->step_td != 0) {
		td_first->td_trb[0].dwTrb3 &= ~htole32(XHCI_TRB_3_CYCLE_BIT);
		usb_pc_cpu_flush(td_first->page_cache);
	}

	/* clear TD SIZE to zero, hence this is the last TRB */
	/* remove chain bit because this is the last data TRB in the chain */
-	td->td_trb[td->ntrb - 1].dwTrb2 &= ~htole32(XHCI_TRB_2_TDSZ_SET(15));
+	td->td_trb[td->ntrb - 1].dwTrb2 &= ~htole32(XHCI_TRB_2_TDSZ_SET(31));
	td->td_trb[td->ntrb - 1].dwTrb3 &= ~htole32(XHCI_TRB_3_CHAIN_BIT);
	/* remove CHAIN-BIT from last LINK TRB */
	td->td_trb[td->ntrb].dwTrb3 &= ~htole32(XHCI_TRB_3_CHAIN_BIT);

	usb_pc_cpu_flush(td->page_cache);

	temp->td = td;
	temp->td_next = td_next;
}

static void
xhci_setup_generic_chain(struct usb_xfer *xfer)
{
	struct xhci_std_temp temp;
	struct xhci_td *td;
	uint32_t x;
	uint32_t y;
	uint8_t mult;

	temp.do_isoc_sync = 0;
	temp.step_td = 0;
	temp.tbc = 0;
	temp.tlbpc = 0;
	temp.average = xfer->max_hc_frame_size;
	temp.max_packet_size = xfer->max_packet_size;
	temp.sc = XHCI_BUS2SC(xfer->xroot->bus);
	temp.pc = NULL;
	temp.last_frame = 0;
	temp.offset = 0;
	temp.multishort = xfer->flags_int.isochronous_xfr ||
	    xfer->flags_int.control_xfr ||
	    xfer->flags_int.short_frames_ok;

	/* toggle the DMA set we are using */
	xfer->flags_int.curr_dma_set ^= 1;

	/* get next DMA set */
	td = xfer->td_start[xfer->flags_int.curr_dma_set];

	temp.td = NULL;
	temp.td_next = td;

	xfer->td_transfer_first = td;
	xfer->td_transfer_cache = td;

	if (xfer->flags_int.isochronous_xfr) {
		uint8_t shift;

		/* compute multiplier for ISOCHRONOUS transfers */
		mult = xfer->endpoint->ecomp ?
		    UE_GET_SS_ISO_MULT(xfer->endpoint->ecomp->bmAttributes) : 0;
		/* check for USB 2.0 multiplier */
		if (mult == 0) {
			mult = (xfer->endpoint->edesc->
			    wMaxPacketSize[1] >> 3) & 3;
		}
		/* range check */
		if (mult > 2)
			mult = 3;
		else
			mult++;

		x = XREAD4(temp.sc, runt, XHCI_MFINDEX);

		DPRINTF("MFINDEX=0x%08x\n", x);

		switch (usbd_get_speed(xfer->xroot->udev)) {
		case USB_SPEED_FULL:
			shift = 3;
			temp.isoc_delta = 8;	/* 1ms */
			x += temp.isoc_delta - 1;
			x &= ~(temp.isoc_delta - 1);
			break;
		default:
			shift = usbd_xfer_get_fps_shift(xfer);
			temp.isoc_delta = 1U << shift;
			x += temp.isoc_delta - 1;
			x &= ~(temp.isoc_delta - 1);
			/* simple frame load balancing */
			x += xfer->endpoint->usb_uframe;
			break;
		}

		y = XHCI_MFINDEX_GET(x - xfer->endpoint->isoc_next);

		if ((xfer->endpoint->is_synced == 0) ||
		    (y < (xfer->nframes << shift)) ||
		    (XHCI_MFINDEX_GET(-y) >= (128 * 8))) {
			/*
			 * If there is data underflow or the pipe
			 * queue is empty we schedule the transfer a
			 * few frames ahead of the current frame
			 * position. Else two isochronous transfers
			 * might overlap.
*/ xfer->endpoint->isoc_next = XHCI_MFINDEX_GET(x + (3 * 8)); xfer->endpoint->is_synced = 1; temp.do_isoc_sync = 1; DPRINTFN(3, "start next=%d\n", xfer->endpoint->isoc_next); } /* compute isochronous completion time */ y = XHCI_MFINDEX_GET(xfer->endpoint->isoc_next - (x & ~7)); xfer->isoc_time_complete = usb_isoc_time_expand(&temp.sc->sc_bus, x / 8) + (y / 8) + (((xfer->nframes << shift) + 7) / 8); x = 0; temp.isoc_frame = xfer->endpoint->isoc_next; temp.trb_type = XHCI_TRB_TYPE_ISOCH; xfer->endpoint->isoc_next += xfer->nframes << shift; } else if (xfer->flags_int.control_xfr) { /* check if we should prepend a setup message */ if (xfer->flags_int.control_hdr) { temp.len = xfer->frlengths[0]; temp.pc = xfer->frbuffers + 0; temp.shortpkt = temp.len ? 1 : 0; temp.trb_type = XHCI_TRB_TYPE_SETUP_STAGE; temp.direction = 0; /* check for last frame */ if (xfer->nframes == 1) { /* no STATUS stage yet, SETUP is last */ if (xfer->flags_int.control_act) temp.last_frame = 1; } xhci_setup_generic_chain_sub(&temp); } x = 1; mult = 1; temp.isoc_delta = 0; temp.isoc_frame = 0; temp.trb_type = xfer->flags_int.control_did_data ? XHCI_TRB_TYPE_NORMAL : XHCI_TRB_TYPE_DATA_STAGE; } else { x = 0; mult = 1; temp.isoc_delta = 0; temp.isoc_frame = 0; temp.trb_type = XHCI_TRB_TYPE_NORMAL; } if (x != xfer->nframes) { /* set up page_cache pointer */ temp.pc = xfer->frbuffers + x; /* set endpoint direction */ temp.direction = UE_GET_DIR(xfer->endpointno); } while (x != xfer->nframes) { /* DATA0 / DATA1 message */ temp.len = xfer->frlengths[x]; temp.step_td = ((xfer->endpointno & UE_DIR_IN) && x != 0 && temp.multishort == 0); x++; if (x == xfer->nframes) { if (xfer->flags_int.control_xfr) { /* no STATUS stage yet, DATA is last */ if (xfer->flags_int.control_act) temp.last_frame = 1; } else { temp.last_frame = 1; } } if (temp.len == 0) { /* make sure that we send an USB packet */ temp.shortpkt = 0; temp.tbc = 0; temp.tlbpc = mult - 1; } else if (xfer->flags_int.isochronous_xfr) { uint8_t tdpc; /* * Isochronous transfers don't have short * packet termination: */ temp.shortpkt = 1; /* isochronous transfers have a transfer limit */ if (temp.len > xfer->max_frame_size) temp.len = xfer->max_frame_size; /* compute TD packet count */ tdpc = howmany(temp.len, xfer->max_packet_size); temp.tbc = howmany(tdpc, mult) - 1; temp.tlbpc = (tdpc % mult); if (temp.tlbpc == 0) temp.tlbpc = mult - 1; else temp.tlbpc--; } else { /* regular data transfer */ temp.shortpkt = xfer->flags.force_short_xfer ? 0 : 1; } xhci_setup_generic_chain_sub(&temp); if (xfer->flags_int.isochronous_xfr) { temp.offset += xfer->frlengths[x - 1]; temp.isoc_frame += temp.isoc_delta; } else { /* get next Page Cache pointer */ temp.pc = xfer->frbuffers + x; } } /* check if we should append a status stage */ if (xfer->flags_int.control_xfr && !xfer->flags_int.control_act) { /* * Send a DATA1 message and invert the current * endpoint direction. */ if (xhcictlstep || temp.sc->sc_ctlstep) { /* * Some XHCI controllers will not delay the * status stage until the next SOF. Force this * behaviour to avoid failed control * transfers. */ temp.step_td = (xfer->nframes != 0); } else { temp.step_td = 0; } temp.direction = UE_GET_DIR(xfer->endpointno) ^ UE_DIR_IN; temp.len = 0; temp.pc = NULL; temp.shortpkt = 0; temp.last_frame = 1; temp.trb_type = XHCI_TRB_TYPE_STATUS_STAGE; xhci_setup_generic_chain_sub(&temp); } td = temp.td; /* must have at least one frame! 
*/ xfer->td_transfer_last = td; DPRINTF("first=%p last=%p\n", xfer->td_transfer_first, td); } static void xhci_set_slot_pointer(struct xhci_softc *sc, uint8_t index, uint64_t dev_addr) { struct usb_page_search buf_res; struct xhci_dev_ctx_addr *pdctxa; usbd_get_page(&sc->sc_hw.ctx_pc, 0, &buf_res); pdctxa = buf_res.buffer; DPRINTF("addr[%u]=0x%016llx\n", index, (long long)dev_addr); pdctxa->qwBaaDevCtxAddr[index] = htole64(dev_addr); usb_pc_cpu_flush(&sc->sc_hw.ctx_pc); } static usb_error_t xhci_configure_mask(struct usb_device *udev, uint32_t mask, uint8_t drop) { struct xhci_softc *sc = XHCI_BUS2SC(udev->bus); struct usb_page_search buf_inp; struct xhci_input_dev_ctx *pinp; uint32_t temp; uint8_t index; uint8_t x; index = udev->controller_slot_id; usbd_get_page(&sc->sc_hw.devs[index].input_pc, 0, &buf_inp); pinp = buf_inp.buffer; if (drop) { mask &= XHCI_INCTX_NON_CTRL_MASK; xhci_ctx_set_le32(sc, &pinp->ctx_input.dwInCtx0, mask); xhci_ctx_set_le32(sc, &pinp->ctx_input.dwInCtx1, 0); } else { /* * Some hardware requires that we drop the endpoint * context before adding it again: */ xhci_ctx_set_le32(sc, &pinp->ctx_input.dwInCtx0, mask & XHCI_INCTX_NON_CTRL_MASK); /* Add new endpoint context */ xhci_ctx_set_le32(sc, &pinp->ctx_input.dwInCtx1, mask); /* find most significant set bit */ for (x = 31; x != 1; x--) { if (mask & (1 << x)) break; } /* adjust */ x--; /* figure out the maximum number of contexts */ if (x > sc->sc_hw.devs[index].context_num) sc->sc_hw.devs[index].context_num = x; else x = sc->sc_hw.devs[index].context_num; /* update number of contexts */ temp = xhci_ctx_get_le32(sc, &pinp->ctx_slot.dwSctx0); temp &= ~XHCI_SCTX_0_CTX_NUM_SET(31); temp |= XHCI_SCTX_0_CTX_NUM_SET(x + 1); xhci_ctx_set_le32(sc, &pinp->ctx_slot.dwSctx0, temp); } usb_pc_cpu_flush(&sc->sc_hw.devs[index].input_pc); return (0); } static usb_error_t xhci_configure_endpoint(struct usb_device *udev, struct usb_endpoint_descriptor *edesc, struct xhci_endpoint_ext *pepext, uint16_t interval, uint8_t max_packet_count, uint8_t mult, uint8_t fps_shift, uint16_t max_packet_size, uint16_t max_frame_size, uint8_t ep_mode) { struct usb_page_search buf_inp; struct xhci_softc *sc = XHCI_BUS2SC(udev->bus); struct xhci_input_dev_ctx *pinp; uint64_t ring_addr = pepext->physaddr; uint32_t temp; uint8_t index; uint8_t epno; uint8_t type; index = udev->controller_slot_id; usbd_get_page(&sc->sc_hw.devs[index].input_pc, 0, &buf_inp); pinp = buf_inp.buffer; epno = edesc->bEndpointAddress; type = edesc->bmAttributes & UE_XFERTYPE; if (type == UE_CONTROL) epno |= UE_DIR_IN; epno = XHCI_EPNO2EPID(epno); if (epno == 0) return (USB_ERR_NO_PIPE); /* invalid */ if (max_packet_count == 0) return (USB_ERR_BAD_BUFSIZE); max_packet_count--; if (mult == 0) return (USB_ERR_BAD_BUFSIZE); /* store endpoint mode */ pepext->trb_ep_mode = ep_mode; /* store bMaxPacketSize for control endpoints */ pepext->trb_ep_maxp = edesc->wMaxPacketSize[0]; usb_pc_cpu_flush(pepext->page_cache); if (ep_mode == USB_EP_MODE_STREAMS) { temp = XHCI_EPCTX_0_EPSTATE_SET(0) | XHCI_EPCTX_0_MAXP_STREAMS_SET(XHCI_MAX_STREAMS_LOG - 1) | XHCI_EPCTX_0_LSA_SET(1); ring_addr += sizeof(struct xhci_trb) * XHCI_MAX_TRANSFERS * XHCI_MAX_STREAMS; } else { temp = XHCI_EPCTX_0_EPSTATE_SET(0) | XHCI_EPCTX_0_MAXP_STREAMS_SET(0) | XHCI_EPCTX_0_LSA_SET(0); ring_addr |= XHCI_EPCTX_2_DCS_SET(1); } switch (udev->speed) { case USB_SPEED_FULL: case USB_SPEED_LOW: /* 1ms -> 125us */ fps_shift += 3; break; default: break; } switch (type) { case UE_INTERRUPT: if (fps_shift > 3) fps_shift--; temp |= 
XHCI_EPCTX_0_IVAL_SET(fps_shift); break; case UE_ISOCHRONOUS: temp |= XHCI_EPCTX_0_IVAL_SET(fps_shift); switch (udev->speed) { case USB_SPEED_SUPER: if (mult > 3) mult = 3; temp |= XHCI_EPCTX_0_MULT_SET(mult - 1); max_packet_count /= mult; break; default: break; } break; default: break; } xhci_ctx_set_le32(sc, &pinp->ctx_ep[epno - 1].dwEpCtx0, temp); temp = XHCI_EPCTX_1_HID_SET(0) | XHCI_EPCTX_1_MAXB_SET(max_packet_count) | XHCI_EPCTX_1_MAXP_SIZE_SET(max_packet_size); /* * Always enable the "three strikes and you are gone" feature * except for ISOCHRONOUS endpoints. This is suggested by * section 4.3.3 in the XHCI specification about device slot * initialisation. */ if (type != UE_ISOCHRONOUS) temp |= XHCI_EPCTX_1_CERR_SET(3); switch (type) { case UE_CONTROL: temp |= XHCI_EPCTX_1_EPTYPE_SET(4); break; case UE_ISOCHRONOUS: temp |= XHCI_EPCTX_1_EPTYPE_SET(1); break; case UE_BULK: temp |= XHCI_EPCTX_1_EPTYPE_SET(2); break; default: temp |= XHCI_EPCTX_1_EPTYPE_SET(3); break; } /* check for IN direction */ if (epno & 1) temp |= XHCI_EPCTX_1_EPTYPE_SET(4); xhci_ctx_set_le32(sc, &pinp->ctx_ep[epno - 1].dwEpCtx1, temp); xhci_ctx_set_le64(sc, &pinp->ctx_ep[epno - 1].qwEpCtx2, ring_addr); switch (edesc->bmAttributes & UE_XFERTYPE) { case UE_INTERRUPT: case UE_ISOCHRONOUS: temp = XHCI_EPCTX_4_MAX_ESIT_PAYLOAD_SET(max_frame_size) | XHCI_EPCTX_4_AVG_TRB_LEN_SET(MIN(XHCI_PAGE_SIZE, max_frame_size)); break; case UE_CONTROL: temp = XHCI_EPCTX_4_AVG_TRB_LEN_SET(8); break; default: temp = XHCI_EPCTX_4_AVG_TRB_LEN_SET(XHCI_PAGE_SIZE); break; } xhci_ctx_set_le32(sc, &pinp->ctx_ep[epno - 1].dwEpCtx4, temp); #ifdef USB_DEBUG xhci_dump_endpoint(sc, &pinp->ctx_ep[epno - 1]); #endif usb_pc_cpu_flush(&sc->sc_hw.devs[index].input_pc); return (0); /* success */ } static usb_error_t xhci_configure_endpoint_by_xfer(struct usb_xfer *xfer) { struct xhci_endpoint_ext *pepext; struct usb_endpoint_ss_comp_descriptor *ecomp; usb_stream_t x; pepext = xhci_get_endpoint_ext(xfer->xroot->udev, xfer->endpoint->edesc); ecomp = xfer->endpoint->ecomp; for (x = 0; x != XHCI_MAX_STREAMS; x++) { uint64_t temp; /* halt any transfers */ pepext->trb[x * XHCI_MAX_TRANSFERS].dwTrb3 = 0; /* compute start of TRB ring for stream "x" */ temp = pepext->physaddr + (x * XHCI_MAX_TRANSFERS * sizeof(struct xhci_trb)) + XHCI_SCTX_0_SCT_SEC_TR_RING; /* make tree structure */ pepext->trb[(XHCI_MAX_TRANSFERS * XHCI_MAX_STREAMS) + x].qwTrb0 = htole64(temp); /* reserved fields */ pepext->trb[(XHCI_MAX_TRANSFERS * XHCI_MAX_STREAMS) + x].dwTrb2 = 0; pepext->trb[(XHCI_MAX_TRANSFERS * XHCI_MAX_STREAMS) + x].dwTrb3 = 0; } usb_pc_cpu_flush(pepext->page_cache); return (xhci_configure_endpoint(xfer->xroot->udev, xfer->endpoint->edesc, pepext, xfer->interval, xfer->max_packet_count, (ecomp != NULL) ? 
UE_GET_SS_ISO_MULT(ecomp->bmAttributes) + 1 : 1, usbd_xfer_get_fps_shift(xfer), xfer->max_packet_size, xfer->max_frame_size, xfer->endpoint->ep_mode)); } static usb_error_t xhci_configure_device(struct usb_device *udev) { struct xhci_softc *sc = XHCI_BUS2SC(udev->bus); struct usb_page_search buf_inp; struct usb_page_cache *pcinp; struct xhci_input_dev_ctx *pinp; struct usb_device *hubdev; uint32_t temp; uint32_t route; uint32_t rh_port; uint8_t is_hub; uint8_t index; uint8_t depth; index = udev->controller_slot_id; DPRINTF("index=%u\n", index); pcinp = &sc->sc_hw.devs[index].input_pc; usbd_get_page(pcinp, 0, &buf_inp); pinp = buf_inp.buffer; rh_port = 0; route = 0; /* figure out route string and root HUB port number */ for (hubdev = udev; hubdev != NULL; hubdev = hubdev->parent_hub) { if (hubdev->parent_hub == NULL) break; depth = hubdev->parent_hub->depth; /* * NOTE: HS/FS/LS devices and the SS root HUB can have * more than 15 ports */ rh_port = hubdev->port_no; if (depth == 0) break; if (rh_port > 15) rh_port = 15; if (depth < 6) route |= rh_port << (4 * (depth - 1)); } DPRINTF("Route=0x%08x\n", route); temp = XHCI_SCTX_0_ROUTE_SET(route) | XHCI_SCTX_0_CTX_NUM_SET( sc->sc_hw.devs[index].context_num + 1); switch (udev->speed) { case USB_SPEED_LOW: temp |= XHCI_SCTX_0_SPEED_SET(2); if (udev->parent_hs_hub != NULL && udev->parent_hs_hub->ddesc.bDeviceProtocol == UDPROTO_HSHUBMTT) { DPRINTF("Device inherits MTT\n"); temp |= XHCI_SCTX_0_MTT_SET(1); } break; case USB_SPEED_HIGH: temp |= XHCI_SCTX_0_SPEED_SET(3); if (sc->sc_hw.devs[index].nports != 0 && udev->ddesc.bDeviceProtocol == UDPROTO_HSHUBMTT) { DPRINTF("HUB supports MTT\n"); temp |= XHCI_SCTX_0_MTT_SET(1); } break; case USB_SPEED_FULL: temp |= XHCI_SCTX_0_SPEED_SET(1); if (udev->parent_hs_hub != NULL && udev->parent_hs_hub->ddesc.bDeviceProtocol == UDPROTO_HSHUBMTT) { DPRINTF("Device inherits MTT\n"); temp |= XHCI_SCTX_0_MTT_SET(1); } break; default: temp |= XHCI_SCTX_0_SPEED_SET(4); break; } is_hub = sc->sc_hw.devs[index].nports != 0 && (udev->speed == USB_SPEED_SUPER || udev->speed == USB_SPEED_HIGH); if (is_hub) temp |= XHCI_SCTX_0_HUB_SET(1); xhci_ctx_set_le32(sc, &pinp->ctx_slot.dwSctx0, temp); temp = XHCI_SCTX_1_RH_PORT_SET(rh_port); if (is_hub) { temp |= XHCI_SCTX_1_NUM_PORTS_SET( sc->sc_hw.devs[index].nports); } switch (udev->speed) { case USB_SPEED_SUPER: switch (sc->sc_hw.devs[index].state) { case XHCI_ST_ADDRESSED: case XHCI_ST_CONFIGURED: /* enable power save */ temp |= XHCI_SCTX_1_MAX_EL_SET(sc->sc_exit_lat_max); break; default: /* disable power save */ break; } break; default: break; } xhci_ctx_set_le32(sc, &pinp->ctx_slot.dwSctx1, temp); temp = XHCI_SCTX_2_IRQ_TARGET_SET(0); if (is_hub) { temp |= XHCI_SCTX_2_TT_THINK_TIME_SET( sc->sc_hw.devs[index].tt); } hubdev = udev->parent_hs_hub; /* check if we should activate the transaction translator */ switch (udev->speed) { case USB_SPEED_FULL: case USB_SPEED_LOW: if (hubdev != NULL) { temp |= XHCI_SCTX_2_TT_HUB_SID_SET( hubdev->controller_slot_id); temp |= XHCI_SCTX_2_TT_PORT_NUM_SET( udev->hs_port_no); } break; default: break; } xhci_ctx_set_le32(sc, &pinp->ctx_slot.dwSctx2, temp); /* * These fields should be initialized to zero, according to * XHCI section 6.2.2 - slot context: */ temp = XHCI_SCTX_3_DEV_ADDR_SET(0) | XHCI_SCTX_3_SLOT_STATE_SET(0); xhci_ctx_set_le32(sc, &pinp->ctx_slot.dwSctx3, temp); #ifdef USB_DEBUG xhci_dump_device(sc, &pinp->ctx_slot); #endif usb_pc_cpu_flush(pcinp); return (0); /* success */ } static usb_error_t xhci_alloc_device_ext(struct usb_device 
*udev) { struct xhci_softc *sc = XHCI_BUS2SC(udev->bus); struct usb_page_search buf_dev; struct usb_page_search buf_ep; struct xhci_trb *trb; struct usb_page_cache *pc; struct usb_page *pg; uint64_t addr; uint8_t index; uint8_t i; index = udev->controller_slot_id; pc = &sc->sc_hw.devs[index].device_pc; pg = &sc->sc_hw.devs[index].device_pg; /* need to initialize the page cache */ pc->tag_parent = sc->sc_bus.dma_parent_tag; if (usb_pc_alloc_mem(pc, pg, sc->sc_ctx_is_64_byte ? (2 * sizeof(struct xhci_dev_ctx)) : sizeof(struct xhci_dev_ctx), XHCI_PAGE_SIZE)) goto error; usbd_get_page(pc, 0, &buf_dev); pc = &sc->sc_hw.devs[index].input_pc; pg = &sc->sc_hw.devs[index].input_pg; /* need to initialize the page cache */ pc->tag_parent = sc->sc_bus.dma_parent_tag; if (usb_pc_alloc_mem(pc, pg, sc->sc_ctx_is_64_byte ? (2 * sizeof(struct xhci_input_dev_ctx)) : sizeof(struct xhci_input_dev_ctx), XHCI_PAGE_SIZE)) { goto error; } /* initialize all endpoint LINK TRBs */ for (i = 0; i != XHCI_MAX_ENDPOINTS; i++) { pc = &sc->sc_hw.devs[index].endpoint_pc[i]; pg = &sc->sc_hw.devs[index].endpoint_pg[i]; /* need to initialize the page cache */ pc->tag_parent = sc->sc_bus.dma_parent_tag; if (usb_pc_alloc_mem(pc, pg, sizeof(struct xhci_dev_endpoint_trbs), XHCI_TRB_ALIGN)) { goto error; } /* lookup endpoint TRB ring */ usbd_get_page(pc, 0, &buf_ep); /* get TRB pointer */ trb = buf_ep.buffer; trb += XHCI_MAX_TRANSFERS - 1; /* get TRB start address */ addr = buf_ep.physaddr; /* create LINK TRB */ trb->qwTrb0 = htole64(addr); trb->dwTrb2 = htole32(XHCI_TRB_2_IRQ_SET(0)); trb->dwTrb3 = htole32(XHCI_TRB_3_CYCLE_BIT | XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_LINK)); usb_pc_cpu_flush(pc); } xhci_set_slot_pointer(sc, index, buf_dev.physaddr); return (0); error: xhci_free_device_ext(udev); return (USB_ERR_NOMEM); } static void xhci_free_device_ext(struct usb_device *udev) { struct xhci_softc *sc = XHCI_BUS2SC(udev->bus); uint8_t index; uint8_t i; index = udev->controller_slot_id; xhci_set_slot_pointer(sc, index, 0); usb_pc_free_mem(&sc->sc_hw.devs[index].device_pc); usb_pc_free_mem(&sc->sc_hw.devs[index].input_pc); for (i = 0; i != XHCI_MAX_ENDPOINTS; i++) usb_pc_free_mem(&sc->sc_hw.devs[index].endpoint_pc[i]); } static struct xhci_endpoint_ext * xhci_get_endpoint_ext(struct usb_device *udev, struct usb_endpoint_descriptor *edesc) { struct xhci_softc *sc = XHCI_BUS2SC(udev->bus); struct xhci_endpoint_ext *pepext; struct usb_page_cache *pc; struct usb_page_search buf_ep; uint8_t epno; uint8_t index; epno = edesc->bEndpointAddress; if ((edesc->bmAttributes & UE_XFERTYPE) == UE_CONTROL) epno |= UE_DIR_IN; epno = XHCI_EPNO2EPID(epno); index = udev->controller_slot_id; pc = &sc->sc_hw.devs[index].endpoint_pc[epno]; usbd_get_page(pc, 0, &buf_ep); pepext = &sc->sc_hw.devs[index].endp[epno]; pepext->page_cache = pc; pepext->trb = buf_ep.buffer; pepext->physaddr = buf_ep.physaddr; return (pepext); } static void xhci_endpoint_doorbell(struct usb_xfer *xfer) { struct xhci_softc *sc = XHCI_BUS2SC(xfer->xroot->bus); uint8_t epno; uint8_t index; epno = xfer->endpointno; if (xfer->flags_int.control_xfr) epno |= UE_DIR_IN; epno = XHCI_EPNO2EPID(epno); index = xfer->xroot->udev->controller_slot_id; if (xfer->xroot->udev->flags.self_suspended == 0) { XWRITE4(sc, door, XHCI_DOORBELL(index), epno | XHCI_DB_SID_SET(xfer->stream_id)); } } static void xhci_transfer_remove(struct usb_xfer *xfer, usb_error_t error) { struct xhci_endpoint_ext *pepext; if (xfer->flags_int.bandwidth_reclaimed) { xfer->flags_int.bandwidth_reclaimed = 0; pepext = 
xhci_get_endpoint_ext(xfer->xroot->udev, xfer->endpoint->edesc); pepext->trb_used[xfer->stream_id]--; pepext->xfer[xfer->qh_pos] = NULL; if (error && pepext->trb_running != 0) { pepext->trb_halted = 1; pepext->trb_running = 0; } } } static usb_error_t xhci_transfer_insert(struct usb_xfer *xfer) { struct xhci_td *td_first; struct xhci_td *td_last; struct xhci_trb *trb_link; struct xhci_endpoint_ext *pepext; uint64_t addr; usb_stream_t id; uint8_t i; uint8_t inext; uint8_t trb_limit; DPRINTFN(8, "\n"); id = xfer->stream_id; /* check if already inserted */ if (xfer->flags_int.bandwidth_reclaimed) { DPRINTFN(8, "Already in schedule\n"); return (0); } pepext = xhci_get_endpoint_ext(xfer->xroot->udev, xfer->endpoint->edesc); td_first = xfer->td_transfer_first; td_last = xfer->td_transfer_last; addr = pepext->physaddr; switch (xfer->endpoint->edesc->bmAttributes & UE_XFERTYPE) { case UE_CONTROL: case UE_INTERRUPT: /* single buffered */ trb_limit = 1; break; default: /* multi buffered */ trb_limit = (XHCI_MAX_TRANSFERS - 2); break; } if (pepext->trb_used[id] >= trb_limit) { DPRINTFN(8, "Too many TDs queued.\n"); return (USB_ERR_NOMEM); } /* check if bMaxPacketSize changed */ if (xfer->flags_int.control_xfr != 0 && pepext->trb_ep_maxp != xfer->endpoint->edesc->wMaxPacketSize[0]) { DPRINTFN(8, "Reconfigure control endpoint\n"); /* force driver to reconfigure endpoint */ pepext->trb_halted = 1; pepext->trb_running = 0; } /* check for stopped condition, after putting transfer on interrupt queue */ if (pepext->trb_running == 0) { struct xhci_softc *sc = XHCI_BUS2SC(xfer->xroot->bus); DPRINTFN(8, "Not running\n"); /* start configuration */ (void)usb_proc_msignal(USB_BUS_CONTROL_XFER_PROC(&sc->sc_bus), &sc->sc_config_msg[0], &sc->sc_config_msg[1]); return (0); } pepext->trb_used[id]++; /* get current TRB index */ i = pepext->trb_index[id]; /* get next TRB index */ inext = (i + 1); /* the last entry of the ring is a hardcoded link TRB */ if (inext >= (XHCI_MAX_TRANSFERS - 1)) inext = 0; /* store next TRB index, before stream ID offset is added */ pepext->trb_index[id] = inext; /* offset for stream */ i += id * XHCI_MAX_TRANSFERS; inext += id * XHCI_MAX_TRANSFERS; /* compute terminating return address */ addr += (inext * sizeof(struct xhci_trb)); /* compute link TRB pointer */ trb_link = td_last->td_trb + td_last->ntrb; /* update next pointer of last link TRB */ trb_link->qwTrb0 = htole64(addr); trb_link->dwTrb2 = htole32(XHCI_TRB_2_IRQ_SET(0)); trb_link->dwTrb3 = htole32(XHCI_TRB_3_IOC_BIT | XHCI_TRB_3_CYCLE_BIT | XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_LINK)); #ifdef USB_DEBUG xhci_dump_trb(&td_last->td_trb[td_last->ntrb]); #endif usb_pc_cpu_flush(td_last->page_cache); /* write ahead chain end marker */ pepext->trb[inext].qwTrb0 = 0; pepext->trb[inext].dwTrb2 = 0; pepext->trb[inext].dwTrb3 = 0; /* update next pointer of link TRB */ pepext->trb[i].qwTrb0 = htole64((uint64_t)td_first->td_self); pepext->trb[i].dwTrb2 = htole32(XHCI_TRB_2_IRQ_SET(0)); #ifdef USB_DEBUG xhci_dump_trb(&pepext->trb[i]); #endif usb_pc_cpu_flush(pepext->page_cache); /* toggle cycle bit which activates the transfer chain */ pepext->trb[i].dwTrb3 = htole32(XHCI_TRB_3_CYCLE_BIT | XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_LINK)); usb_pc_cpu_flush(pepext->page_cache); DPRINTF("qh_pos = %u\n", i); pepext->xfer[i] = xfer; xfer->qh_pos = i; xfer->flags_int.bandwidth_reclaimed = 1; xhci_endpoint_doorbell(xfer); return (0); } static void xhci_root_intr(struct xhci_softc *sc) { uint16_t i; USB_BUS_LOCK_ASSERT(&sc->sc_bus, MA_OWNED); /* clear any old 
interrupt data */ memset(sc->sc_hub_idata, 0, sizeof(sc->sc_hub_idata)); for (i = 1; i <= sc->sc_noport; i++) { /* pick out CHANGE bits from the status register */ if (XREAD4(sc, oper, XHCI_PORTSC(i)) & ( XHCI_PS_CSC | XHCI_PS_PEC | XHCI_PS_OCC | XHCI_PS_WRC | XHCI_PS_PRC | XHCI_PS_PLC | XHCI_PS_CEC)) { sc->sc_hub_idata[i / 8] |= 1 << (i % 8); DPRINTF("port %d changed\n", i); } } uhub_root_intr(&sc->sc_bus, sc->sc_hub_idata, sizeof(sc->sc_hub_idata)); } /*------------------------------------------------------------------------* * xhci_device_done - XHCI done handler * * NOTE: This function can be called two times in a row on * the same USB transfer. From close and from interrupt. *------------------------------------------------------------------------*/ static void xhci_device_done(struct usb_xfer *xfer, usb_error_t error) { DPRINTFN(2, "xfer=%p, endpoint=%p, error=%d\n", xfer, xfer->endpoint, error); /* remove transfer from HW queue */ xhci_transfer_remove(xfer, error); /* dequeue transfer and start next transfer */ usbd_transfer_done(xfer, error); } /*------------------------------------------------------------------------* * XHCI data transfer support (generic type) *------------------------------------------------------------------------*/ static void xhci_device_generic_open(struct usb_xfer *xfer) { if (xfer->flags_int.isochronous_xfr) { switch (xfer->xroot->udev->speed) { case USB_SPEED_FULL: break; default: usb_hs_bandwidth_alloc(xfer); break; } } } static void xhci_device_generic_close(struct usb_xfer *xfer) { DPRINTF("\n"); xhci_device_done(xfer, USB_ERR_CANCELLED); if (xfer->flags_int.isochronous_xfr) { switch (xfer->xroot->udev->speed) { case USB_SPEED_FULL: break; default: usb_hs_bandwidth_free(xfer); break; } } } static void xhci_device_generic_multi_enter(struct usb_endpoint *ep, usb_stream_t stream_id, struct usb_xfer *enter_xfer) { struct usb_xfer *xfer; /* check if there is a current transfer */ xfer = ep->endpoint_q[stream_id].curr; if (xfer == NULL) return; /* * Check if the current transfer is started and then pickup * the next one, if any. Else wait for next start event due to * block on failure feature. */ if (!xfer->flags_int.bandwidth_reclaimed) return; xfer = TAILQ_FIRST(&ep->endpoint_q[stream_id].head); if (xfer == NULL) { /* * In case of enter we have to consider that the * transfer is queued by the USB core after the enter * method is called. 
*/ xfer = enter_xfer; if (xfer == NULL) return; } /* try to multi buffer */ xhci_transfer_insert(xfer); } static void xhci_device_generic_enter(struct usb_xfer *xfer) { DPRINTF("\n"); /* set up TD's and QH */ xhci_setup_generic_chain(xfer); xhci_device_generic_multi_enter(xfer->endpoint, xfer->stream_id, xfer); } static void xhci_device_generic_start(struct usb_xfer *xfer) { DPRINTF("\n"); /* try to insert xfer on HW queue */ xhci_transfer_insert(xfer); /* try to multi buffer */ xhci_device_generic_multi_enter(xfer->endpoint, xfer->stream_id, NULL); /* add transfer last on interrupt queue */ usbd_transfer_enqueue(&xfer->xroot->bus->intr_q, xfer); /* start timeout, if any */ if (xfer->timeout != 0) usbd_transfer_timeout_ms(xfer, &xhci_timeout, xfer->timeout); } static const struct usb_pipe_methods xhci_device_generic_methods = { .open = xhci_device_generic_open, .close = xhci_device_generic_close, .enter = xhci_device_generic_enter, .start = xhci_device_generic_start, }; /*------------------------------------------------------------------------* * xhci root HUB support *------------------------------------------------------------------------* * Simulate a hardware HUB by handling all the necessary requests. *------------------------------------------------------------------------*/ #define HSETW(ptr, val) ptr = { (uint8_t)(val), (uint8_t)((val) >> 8) } static const struct usb_device_descriptor xhci_devd = { .bLength = sizeof(xhci_devd), .bDescriptorType = UDESC_DEVICE, /* type */ HSETW(.bcdUSB, 0x0300), /* USB version */ .bDeviceClass = UDCLASS_HUB, /* class */ .bDeviceSubClass = UDSUBCLASS_HUB, /* subclass */ .bDeviceProtocol = UDPROTO_SSHUB, /* protocol */ .bMaxPacketSize = 9, /* max packet size */ HSETW(.idVendor, 0x0000), /* vendor */ HSETW(.idProduct, 0x0000), /* product */ HSETW(.bcdDevice, 0x0100), /* device version */ .iManufacturer = 1, .iProduct = 2, .iSerialNumber = 0, .bNumConfigurations = 1, /* # of configurations */ }; static const struct xhci_bos_desc xhci_bosd = { .bosd = { .bLength = sizeof(xhci_bosd.bosd), .bDescriptorType = UDESC_BOS, HSETW(.wTotalLength, sizeof(xhci_bosd)), .bNumDeviceCaps = 3, }, .usb2extd = { .bLength = sizeof(xhci_bosd.usb2extd), .bDescriptorType = 1, .bDevCapabilityType = 2, .bmAttributes[0] = 2, }, .usbdcd = { .bLength = sizeof(xhci_bosd.usbdcd), .bDescriptorType = UDESC_DEVICE_CAPABILITY, .bDevCapabilityType = 3, .bmAttributes = 0, /* XXX */ HSETW(.wSpeedsSupported, 0x000C), .bFunctionalitySupport = 8, .bU1DevExitLat = 255, /* dummy - not used */ .wU2DevExitLat = { 0x00, 0x08 }, }, .cidd = { .bLength = sizeof(xhci_bosd.cidd), .bDescriptorType = 1, .bDevCapabilityType = 4, .bReserved = 0, .bContainerID = 0, /* XXX */ }, }; static const struct xhci_config_desc xhci_confd = { .confd = { .bLength = sizeof(xhci_confd.confd), .bDescriptorType = UDESC_CONFIG, .wTotalLength[0] = sizeof(xhci_confd), .bNumInterface = 1, .bConfigurationValue = 1, .iConfiguration = 0, .bmAttributes = UC_SELF_POWERED, .bMaxPower = 0 /* max power */ }, .ifcd = { .bLength = sizeof(xhci_confd.ifcd), .bDescriptorType = UDESC_INTERFACE, .bNumEndpoints = 1, .bInterfaceClass = UICLASS_HUB, .bInterfaceSubClass = UISUBCLASS_HUB, .bInterfaceProtocol = 0, }, .endpd = { .bLength = sizeof(xhci_confd.endpd), .bDescriptorType = UDESC_ENDPOINT, .bEndpointAddress = UE_DIR_IN | XHCI_INTR_ENDPT, .bmAttributes = UE_INTERRUPT, .wMaxPacketSize[0] = 2, /* max 15 ports */ .bInterval = 255, }, .endpcd = { .bLength = sizeof(xhci_confd.endpcd), .bDescriptorType = UDESC_ENDPOINT_SS_COMP, .bMaxBurst = 0, 
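/*
 * bMaxBurst = 0 advertises a single packet per burst for the
 * simulated root HUB interrupt endpoint.
 */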
.bmAttributes = 0, }, }; static const struct usb_hub_ss_descriptor xhci_hubd = { .bLength = sizeof(xhci_hubd), .bDescriptorType = UDESC_SS_HUB, }; static usb_error_t xhci_roothub_exec(struct usb_device *udev, struct usb_device_request *req, const void **pptr, uint16_t *plength) { struct xhci_softc *sc = XHCI_BUS2SC(udev->bus); const char *str_ptr; const void *ptr; uint32_t port; uint32_t v; uint16_t len; uint16_t i; uint16_t value; uint16_t index; uint8_t j; usb_error_t err; USB_BUS_LOCK_ASSERT(&sc->sc_bus, MA_OWNED); /* buffer reset */ ptr = (const void *)&sc->sc_hub_desc; len = 0; err = 0; value = UGETW(req->wValue); index = UGETW(req->wIndex); DPRINTFN(3, "type=0x%02x request=0x%02x wLen=0x%04x " "wValue=0x%04x wIndex=0x%04x\n", req->bmRequestType, req->bRequest, UGETW(req->wLength), value, index); #define C(x,y) ((x) | ((y) << 8)) switch (C(req->bRequest, req->bmRequestType)) { case C(UR_CLEAR_FEATURE, UT_WRITE_DEVICE): case C(UR_CLEAR_FEATURE, UT_WRITE_INTERFACE): case C(UR_CLEAR_FEATURE, UT_WRITE_ENDPOINT): /* * DEVICE_REMOTE_WAKEUP and ENDPOINT_HALT are no-ops * for the integrated root hub. */ break; case C(UR_GET_CONFIG, UT_READ_DEVICE): len = 1; sc->sc_hub_desc.temp[0] = sc->sc_conf; break; case C(UR_GET_DESCRIPTOR, UT_READ_DEVICE): switch (value >> 8) { case UDESC_DEVICE: if ((value & 0xff) != 0) { err = USB_ERR_IOERROR; goto done; } len = sizeof(xhci_devd); ptr = (const void *)&xhci_devd; break; case UDESC_BOS: if ((value & 0xff) != 0) { err = USB_ERR_IOERROR; goto done; } len = sizeof(xhci_bosd); ptr = (const void *)&xhci_bosd; break; case UDESC_CONFIG: if ((value & 0xff) != 0) { err = USB_ERR_IOERROR; goto done; } len = sizeof(xhci_confd); ptr = (const void *)&xhci_confd; break; case UDESC_STRING: switch (value & 0xff) { case 0: /* Language table */ str_ptr = "\001"; break; case 1: /* Vendor */ str_ptr = sc->sc_vendor; break; case 2: /* Product */ str_ptr = "XHCI root HUB"; break; default: str_ptr = ""; break; } len = usb_make_str_desc( sc->sc_hub_desc.temp, sizeof(sc->sc_hub_desc.temp), str_ptr); break; default: err = USB_ERR_IOERROR; goto done; } break; case C(UR_GET_INTERFACE, UT_READ_INTERFACE): len = 1; sc->sc_hub_desc.temp[0] = 0; break; case C(UR_GET_STATUS, UT_READ_DEVICE): len = 2; USETW(sc->sc_hub_desc.stat.wStatus, UDS_SELF_POWERED); break; case C(UR_GET_STATUS, UT_READ_INTERFACE): case C(UR_GET_STATUS, UT_READ_ENDPOINT): len = 2; USETW(sc->sc_hub_desc.stat.wStatus, 0); break; case C(UR_SET_ADDRESS, UT_WRITE_DEVICE): if (value >= XHCI_MAX_DEVICES) { err = USB_ERR_IOERROR; goto done; } break; case C(UR_SET_CONFIG, UT_WRITE_DEVICE): if (value != 0 && value != 1) { err = USB_ERR_IOERROR; goto done; } sc->sc_conf = value; break; case C(UR_SET_DESCRIPTOR, UT_WRITE_DEVICE): break; case C(UR_SET_FEATURE, UT_WRITE_DEVICE): case C(UR_SET_FEATURE, UT_WRITE_INTERFACE): case C(UR_SET_FEATURE, UT_WRITE_ENDPOINT): err = USB_ERR_IOERROR; goto done; case C(UR_SET_INTERFACE, UT_WRITE_INTERFACE): break; case C(UR_SYNCH_FRAME, UT_WRITE_ENDPOINT): break; /* Hub requests */ case C(UR_CLEAR_FEATURE, UT_WRITE_CLASS_DEVICE): break; case C(UR_CLEAR_FEATURE, UT_WRITE_CLASS_OTHER): DPRINTFN(9, "UR_CLEAR_PORT_FEATURE\n"); if ((index < 1) || (index > sc->sc_noport)) { err = USB_ERR_IOERROR; goto done; } port = XHCI_PORTSC(index); v = XREAD4(sc, oper, port); i = XHCI_PS_PLS_GET(v); v &= ~XHCI_PS_CLEAR; switch (value) { case UHF_C_BH_PORT_RESET: XWRITE4(sc, oper, port, v | XHCI_PS_WRC); break; case UHF_C_PORT_CONFIG_ERROR: XWRITE4(sc, oper, port, v | XHCI_PS_CEC); break; case UHF_C_PORT_SUSPEND: 
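/*
 * UHF_C_PORT_SUSPEND falls through: both it and
 * UHF_C_PORT_LINK_STATE are acknowledged by writing the
 * XHCI_PS_PLC change bit back to PORTSC, which clears it.
 */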
case UHF_C_PORT_LINK_STATE: XWRITE4(sc, oper, port, v | XHCI_PS_PLC); break; case UHF_C_PORT_CONNECTION: XWRITE4(sc, oper, port, v | XHCI_PS_CSC); break; case UHF_C_PORT_ENABLE: XWRITE4(sc, oper, port, v | XHCI_PS_PEC); break; case UHF_C_PORT_OVER_CURRENT: XWRITE4(sc, oper, port, v | XHCI_PS_OCC); break; case UHF_C_PORT_RESET: XWRITE4(sc, oper, port, v | XHCI_PS_PRC); break; case UHF_PORT_ENABLE: XWRITE4(sc, oper, port, v | XHCI_PS_PED); break; case UHF_PORT_POWER: XWRITE4(sc, oper, port, v & ~XHCI_PS_PP); break; case UHF_PORT_INDICATOR: XWRITE4(sc, oper, port, v & ~XHCI_PS_PIC_SET(3)); break; case UHF_PORT_SUSPEND: /* U3 -> U15 */ if (i == 3) { XWRITE4(sc, oper, port, v | XHCI_PS_PLS_SET(0xF) | XHCI_PS_LWS); } /* wait 20ms for resume sequence to complete */ usb_pause_mtx(&sc->sc_bus.bus_mtx, hz / 50); /* U0 */ XWRITE4(sc, oper, port, v | XHCI_PS_PLS_SET(0) | XHCI_PS_LWS); break; default: err = USB_ERR_IOERROR; goto done; } break; case C(UR_GET_DESCRIPTOR, UT_READ_CLASS_DEVICE): if ((value & 0xff) != 0) { err = USB_ERR_IOERROR; goto done; } v = XREAD4(sc, capa, XHCI_HCSPARAMS0); sc->sc_hub_desc.hubd = xhci_hubd; sc->sc_hub_desc.hubd.bNbrPorts = sc->sc_noport; if (XHCI_HCS0_PPC(v)) i = UHD_PWR_INDIVIDUAL; else i = UHD_PWR_GANGED; if (XHCI_HCS0_PIND(v)) i |= UHD_PORT_IND; i |= UHD_OC_INDIVIDUAL; USETW(sc->sc_hub_desc.hubd.wHubCharacteristics, i); /* see XHCI section 5.4.9: */ sc->sc_hub_desc.hubd.bPwrOn2PwrGood = 10; for (j = 1; j <= sc->sc_noport; j++) { v = XREAD4(sc, oper, XHCI_PORTSC(j)); if (v & XHCI_PS_DR) { sc->sc_hub_desc.hubd. DeviceRemovable[j / 8] |= 1U << (j % 8); } } len = sc->sc_hub_desc.hubd.bLength; break; case C(UR_GET_STATUS, UT_READ_CLASS_DEVICE): len = 16; memset(sc->sc_hub_desc.temp, 0, 16); break; case C(UR_GET_STATUS, UT_READ_CLASS_OTHER): DPRINTFN(9, "UR_GET_STATUS i=%d\n", index); if ((index < 1) || (index > sc->sc_noport)) { err = USB_ERR_IOERROR; goto done; } v = XREAD4(sc, oper, XHCI_PORTSC(index)); DPRINTFN(9, "port status=0x%08x\n", v); i = UPS_PORT_LINK_STATE_SET(XHCI_PS_PLS_GET(v)); switch (XHCI_PS_SPEED_GET(v)) { case 3: i |= UPS_HIGH_SPEED; break; case 2: i |= UPS_LOW_SPEED; break; case 1: /* FULL speed */ break; default: i |= UPS_OTHER_SPEED; break; } if (v & XHCI_PS_CCS) i |= UPS_CURRENT_CONNECT_STATUS; if (v & XHCI_PS_PED) i |= UPS_PORT_ENABLED; if (v & XHCI_PS_OCA) i |= UPS_OVERCURRENT_INDICATOR; if (v & XHCI_PS_PR) i |= UPS_RESET; if (v & XHCI_PS_PP) { /* * The USB 3.0 RH is using the * USB 2.0's power bit */ i |= UPS_PORT_POWER; } USETW(sc->sc_hub_desc.ps.wPortStatus, i); i = 0; if (v & XHCI_PS_CSC) i |= UPS_C_CONNECT_STATUS; if (v & XHCI_PS_PEC) i |= UPS_C_PORT_ENABLED; if (v & XHCI_PS_OCC) i |= UPS_C_OVERCURRENT_INDICATOR; if (v & XHCI_PS_WRC) i |= UPS_C_BH_PORT_RESET; if (v & XHCI_PS_PRC) i |= UPS_C_PORT_RESET; if (v & XHCI_PS_PLC) i |= UPS_C_PORT_LINK_STATE; if (v & XHCI_PS_CEC) i |= UPS_C_PORT_CONFIG_ERROR; USETW(sc->sc_hub_desc.ps.wPortChange, i); len = sizeof(sc->sc_hub_desc.ps); break; case C(UR_SET_DESCRIPTOR, UT_WRITE_CLASS_DEVICE): err = USB_ERR_IOERROR; goto done; case C(UR_SET_FEATURE, UT_WRITE_CLASS_DEVICE): break; case C(UR_SET_FEATURE, UT_WRITE_CLASS_OTHER): i = index >> 8; index &= 0x00FF; if ((index < 1) || (index > sc->sc_noport)) { err = USB_ERR_IOERROR; goto done; } port = XHCI_PORTSC(index); v = XREAD4(sc, oper, port) & ~XHCI_PS_CLEAR; switch (value) { case UHF_PORT_U1_TIMEOUT: if (XHCI_PS_SPEED_GET(v) != 4) { err = USB_ERR_IOERROR; goto done; } port = XHCI_PORTPMSC(index); v = XREAD4(sc, oper, port); v &= 
~XHCI_PM3_U1TO_SET(0xFF); v |= XHCI_PM3_U1TO_SET(i); XWRITE4(sc, oper, port, v); break; case UHF_PORT_U2_TIMEOUT: if (XHCI_PS_SPEED_GET(v) != 4) { err = USB_ERR_IOERROR; goto done; } port = XHCI_PORTPMSC(index); v = XREAD4(sc, oper, port); v &= ~XHCI_PM3_U2TO_SET(0xFF); v |= XHCI_PM3_U2TO_SET(i); XWRITE4(sc, oper, port, v); break; case UHF_BH_PORT_RESET: XWRITE4(sc, oper, port, v | XHCI_PS_WPR); break; case UHF_PORT_LINK_STATE: XWRITE4(sc, oper, port, v | XHCI_PS_PLS_SET(i) | XHCI_PS_LWS); /* 4ms settle time */ usb_pause_mtx(&sc->sc_bus.bus_mtx, hz / 250); break; case UHF_PORT_ENABLE: DPRINTFN(3, "set port enable %d\n", index); break; case UHF_PORT_SUSPEND: DPRINTFN(6, "suspend port %u (LPM=%u)\n", index, i); j = XHCI_PS_SPEED_GET(v); if ((j < 1) || (j > 3)) { /* non-supported speed */ err = USB_ERR_IOERROR; goto done; } XWRITE4(sc, oper, port, v | XHCI_PS_PLS_SET(i ? 2 /* LPM */ : 3) | XHCI_PS_LWS); break; case UHF_PORT_RESET: DPRINTFN(6, "reset port %d\n", index); XWRITE4(sc, oper, port, v | XHCI_PS_PR); break; case UHF_PORT_POWER: DPRINTFN(3, "set port power %d\n", index); XWRITE4(sc, oper, port, v | XHCI_PS_PP); break; case UHF_PORT_TEST: DPRINTFN(3, "set port test %d\n", index); break; case UHF_PORT_INDICATOR: DPRINTFN(3, "set port indicator %d\n", index); v &= ~XHCI_PS_PIC_SET(3); v |= XHCI_PS_PIC_SET(1); XWRITE4(sc, oper, port, v); break; default: err = USB_ERR_IOERROR; goto done; } break; case C(UR_CLEAR_TT_BUFFER, UT_WRITE_CLASS_OTHER): case C(UR_RESET_TT, UT_WRITE_CLASS_OTHER): case C(UR_GET_TT_STATE, UT_READ_CLASS_OTHER): case C(UR_STOP_TT, UT_WRITE_CLASS_OTHER): break; default: err = USB_ERR_IOERROR; goto done; } done: *plength = len; *pptr = ptr; return (err); } static void xhci_xfer_setup(struct usb_setup_params *parm) { struct usb_page_search page_info; struct usb_page_cache *pc; struct usb_xfer *xfer; void *last_obj; uint32_t ntd; uint32_t n; xfer = parm->curr_xfer; /* * The proof for the "ntd" formula is illustrated like this: * * +------------------------------------+ * | | * | |remainder -> | * | +-----+---+ | * | | xxx | x | frm 0 | * | +-----+---++ | * | | xxx | xx | frm 1 | * | +-----+----+ | * | ... | * +------------------------------------+ * * "xxx" means a completely full USB transfer descriptor * * "x" and "xx" means a short USB packet * * For the remainder of an USB transfer modulo * "max_data_length" we need two USB transfer descriptors. * One to transfer the remaining data and one to finalise with * a zero length packet in case the "force_short_xfer" flag is * set. We only need two USB transfer descriptors in the case * where the transfer length of the first one is a factor of * "max_frame_size". The rest of the needed USB transfer * descriptors is given by the buffer size divided by the * maximum data payload. 
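 *
 * As a worked example of the formulas below: a bulk transfer with a
 * single frame and a buffer four times "max_hc_frame_size" long
 * needs ntd = (2 * 1) + 4 = 6 USB transfer descriptors.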
*/ parm->hc_max_packet_size = 0x400; parm->hc_max_packet_count = 16 * 3; parm->hc_max_frame_size = XHCI_TD_PAYLOAD_MAX; xfer->flags_int.bdma_enable = 1; usbd_transfer_setup_sub(parm); if (xfer->flags_int.isochronous_xfr) { ntd = ((1 * xfer->nframes) + (xfer->max_data_length / xfer->max_hc_frame_size)); } else if (xfer->flags_int.control_xfr) { ntd = ((2 * xfer->nframes) + 1 /* STATUS */ + (xfer->max_data_length / xfer->max_hc_frame_size)); } else { ntd = ((2 * xfer->nframes) + (xfer->max_data_length / xfer->max_hc_frame_size)); } alloc_dma_set: if (parm->err) return; /* * Allocate queue heads and transfer descriptors */ last_obj = NULL; if (usbd_transfer_setup_sub_malloc( parm, &pc, sizeof(struct xhci_td), XHCI_TD_ALIGN, ntd)) { parm->err = USB_ERR_NOMEM; return; } if (parm->buf) { for (n = 0; n != ntd; n++) { struct xhci_td *td; usbd_get_page(pc + n, 0, &page_info); td = page_info.buffer; /* init TD */ td->td_self = page_info.physaddr; td->obj_next = last_obj; td->page_cache = pc + n; last_obj = td; usb_pc_cpu_flush(pc + n); } } xfer->td_start[xfer->flags_int.curr_dma_set] = last_obj; if (!xfer->flags_int.curr_dma_set) { xfer->flags_int.curr_dma_set = 1; goto alloc_dma_set; } } static usb_error_t xhci_configure_reset_endpoint(struct usb_xfer *xfer) { struct xhci_softc *sc = XHCI_BUS2SC(xfer->xroot->bus); struct usb_page_search buf_inp; struct usb_device *udev; struct xhci_endpoint_ext *pepext; struct usb_endpoint_descriptor *edesc; struct usb_page_cache *pcinp; usb_error_t err; usb_stream_t stream_id; uint8_t index; uint8_t epno; pepext = xhci_get_endpoint_ext(xfer->xroot->udev, xfer->endpoint->edesc); udev = xfer->xroot->udev; index = udev->controller_slot_id; pcinp = &sc->sc_hw.devs[index].input_pc; usbd_get_page(pcinp, 0, &buf_inp); edesc = xfer->endpoint->edesc; epno = edesc->bEndpointAddress; stream_id = xfer->stream_id; if ((edesc->bmAttributes & UE_XFERTYPE) == UE_CONTROL) epno |= UE_DIR_IN; epno = XHCI_EPNO2EPID(epno); if (epno == 0) return (USB_ERR_NO_PIPE); /* invalid */ XHCI_CMD_LOCK(sc); /* configure endpoint */ err = xhci_configure_endpoint_by_xfer(xfer); if (err != 0) { XHCI_CMD_UNLOCK(sc); return (err); } /* * Get the endpoint into the stopped state according to the * endpoint context state diagram in the XHCI specification: */ err = xhci_cmd_stop_ep(sc, 0, epno, index); if (err != 0) DPRINTF("Could not stop endpoint %u\n", epno); err = xhci_cmd_reset_ep(sc, 0, epno, index); if (err != 0) DPRINTF("Could not reset endpoint %u\n", epno); err = xhci_cmd_set_tr_dequeue_ptr(sc, (pepext->physaddr + (stream_id * sizeof(struct xhci_trb) * XHCI_MAX_TRANSFERS)) | XHCI_EPCTX_2_DCS_SET(1), stream_id, epno, index); if (err != 0) DPRINTF("Could not set dequeue ptr for endpoint %u\n", epno); /* * Get the endpoint into the running state according to the * endpoint context state diagram in the XHCI specification: */ xhci_configure_mask(udev, (1U << epno) | 1U, 0); if (epno > 1) err = xhci_cmd_configure_ep(sc, buf_inp.physaddr, 0, index); else err = xhci_cmd_evaluate_ctx(sc, buf_inp.physaddr, index); if (err != 0) DPRINTF("Could not configure endpoint %u\n", epno); XHCI_CMD_UNLOCK(sc); return (0); } static void xhci_xfer_unsetup(struct usb_xfer *xfer) { return; } static void xhci_start_dma_delay(struct usb_xfer *xfer) { struct xhci_softc *sc = XHCI_BUS2SC(xfer->xroot->bus); /* put transfer on interrupt queue (again) */ usbd_transfer_enqueue(&sc->sc_bus.intr_q, xfer); (void)usb_proc_msignal(USB_BUS_CONTROL_XFER_PROC(&sc->sc_bus), &sc->sc_config_msg[0], &sc->sc_config_msg[1]); } static void 
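/*
 * xhci_configure_msg() is executed by the bus control transfer
 * process when sc_config_msg is signalled.  For every transfer on
 * the interrupt queue whose endpoint is halted or not running it
 * completes the buffered transfers with USB_ERR_TIMEOUT, drops the
 * bus lock to reconfigure and reset the endpoint, and then marks the
 * TRB ring as running again.  A second loop re-inserts the remaining
 * transfers into the hardware schedule.
 */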
xhci_configure_msg(struct usb_proc_msg *pm) { struct xhci_softc *sc; struct xhci_endpoint_ext *pepext; struct usb_xfer *xfer; sc = XHCI_BUS2SC(((struct usb_bus_msg *)pm)->bus); restart: TAILQ_FOREACH(xfer, &sc->sc_bus.intr_q.head, wait_entry) { pepext = xhci_get_endpoint_ext(xfer->xroot->udev, xfer->endpoint->edesc); if ((pepext->trb_halted != 0) || (pepext->trb_running == 0)) { uint16_t i; /* clear halted and running */ pepext->trb_halted = 0; pepext->trb_running = 0; /* nuke remaining buffered transfers */ for (i = 0; i != (XHCI_MAX_TRANSFERS * XHCI_MAX_STREAMS); i++) { /* * NOTE: We need to use the timeout * error code here else existing * isochronous clients can get * confused: */ if (pepext->xfer[i] != NULL) { xhci_device_done(pepext->xfer[i], USB_ERR_TIMEOUT); } } /* * NOTE: The USB transfer cannot vanish in * this state! */ USB_BUS_UNLOCK(&sc->sc_bus); xhci_configure_reset_endpoint(xfer); USB_BUS_LOCK(&sc->sc_bus); /* check if halted is still cleared */ if (pepext->trb_halted == 0) { pepext->trb_running = 1; memset(pepext->trb_index, 0, sizeof(pepext->trb_index)); } goto restart; } if (xfer->flags_int.did_dma_delay) { /* remove transfer from interrupt queue (again) */ usbd_transfer_dequeue(xfer); /* we are finally done */ usb_dma_delay_done_cb(xfer); /* queue changed - restart */ goto restart; } } TAILQ_FOREACH(xfer, &sc->sc_bus.intr_q.head, wait_entry) { /* try to insert xfer on HW queue */ xhci_transfer_insert(xfer); /* try to multi buffer */ xhci_device_generic_multi_enter(xfer->endpoint, xfer->stream_id, NULL); } } static void xhci_ep_init(struct usb_device *udev, struct usb_endpoint_descriptor *edesc, struct usb_endpoint *ep) { struct xhci_endpoint_ext *pepext; DPRINTFN(2, "endpoint=%p, addr=%d, endpt=%d, mode=%d\n", ep, udev->address, edesc->bEndpointAddress, udev->flags.usb_mode); if (udev->parent_hub == NULL) { /* root HUB has special endpoint handling */ return; } ep->methods = &xhci_device_generic_methods; pepext = xhci_get_endpoint_ext(udev, edesc); USB_BUS_LOCK(udev->bus); pepext->trb_halted = 1; pepext->trb_running = 0; USB_BUS_UNLOCK(udev->bus); } static void xhci_ep_uninit(struct usb_device *udev, struct usb_endpoint *ep) { } static void xhci_ep_clear_stall(struct usb_device *udev, struct usb_endpoint *ep) { struct xhci_endpoint_ext *pepext; DPRINTF("\n"); if (udev->flags.usb_mode != USB_MODE_HOST) { /* not supported */ return; } if (udev->parent_hub == NULL) { /* root HUB has special endpoint handling */ return; } pepext = xhci_get_endpoint_ext(udev, ep->edesc); USB_BUS_LOCK(udev->bus); pepext->trb_halted = 1; pepext->trb_running = 0; USB_BUS_UNLOCK(udev->bus); } static usb_error_t xhci_device_init(struct usb_device *udev) { struct xhci_softc *sc = XHCI_BUS2SC(udev->bus); usb_error_t err; uint8_t temp; /* no init for root HUB */ if (udev->parent_hub == NULL) return (0); XHCI_CMD_LOCK(sc); /* set invalid default */ udev->controller_slot_id = sc->sc_noslot + 1; /* try to get a new slot ID from the XHCI */ err = xhci_cmd_enable_slot(sc, &temp); if (err) { XHCI_CMD_UNLOCK(sc); return (err); } if (temp > sc->sc_noslot) { XHCI_CMD_UNLOCK(sc); return (USB_ERR_BAD_ADDRESS); } if (sc->sc_hw.devs[temp].state != XHCI_ST_DISABLED) { DPRINTF("slot %u already allocated.\n", temp); XHCI_CMD_UNLOCK(sc); return (USB_ERR_BAD_ADDRESS); } /* store slot ID for later reference */ udev->controller_slot_id = temp; /* reset data structure */ memset(&sc->sc_hw.devs[temp], 0, sizeof(sc->sc_hw.devs[0])); /* set mark slot allocated */ sc->sc_hw.devs[temp].state = XHCI_ST_ENABLED; err = 
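/*
 * Allocate the per-device contexts and endpoint TRB rings for the
 * newly enabled slot; if that succeeds, the device is moved into the
 * default state by the xhci_set_address(udev, NULL, 0) call below.
 */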
xhci_alloc_device_ext(udev); XHCI_CMD_UNLOCK(sc); /* get device into default state */ if (err == 0) err = xhci_set_address(udev, NULL, 0); return (err); } static void xhci_device_uninit(struct usb_device *udev) { struct xhci_softc *sc = XHCI_BUS2SC(udev->bus); uint8_t index; /* no init for root HUB */ if (udev->parent_hub == NULL) return; XHCI_CMD_LOCK(sc); index = udev->controller_slot_id; if (index <= sc->sc_noslot) { xhci_cmd_disable_slot(sc, index); sc->sc_hw.devs[index].state = XHCI_ST_DISABLED; /* free device extension */ xhci_free_device_ext(udev); } XHCI_CMD_UNLOCK(sc); } static void xhci_get_dma_delay(struct usb_device *udev, uint32_t *pus) { /* * Wait until the hardware has finished any possible use of * the transfer descriptor(s) */ *pus = 2048; /* microseconds */ } static void xhci_device_resume(struct usb_device *udev) { struct xhci_softc *sc = XHCI_BUS2SC(udev->bus); uint8_t index; uint8_t n; uint8_t p; DPRINTF("\n"); /* check for root HUB */ if (udev->parent_hub == NULL) return; index = udev->controller_slot_id; XHCI_CMD_LOCK(sc); /* blindly resume all endpoints */ USB_BUS_LOCK(udev->bus); for (n = 1; n != XHCI_MAX_ENDPOINTS; n++) { for (p = 0; p != XHCI_MAX_STREAMS; p++) { XWRITE4(sc, door, XHCI_DOORBELL(index), n | XHCI_DB_SID_SET(p)); } } USB_BUS_UNLOCK(udev->bus); XHCI_CMD_UNLOCK(sc); } static void xhci_device_suspend(struct usb_device *udev) { struct xhci_softc *sc = XHCI_BUS2SC(udev->bus); uint8_t index; uint8_t n; usb_error_t err; DPRINTF("\n"); /* check for root HUB */ if (udev->parent_hub == NULL) return; index = udev->controller_slot_id; XHCI_CMD_LOCK(sc); /* blindly suspend all endpoints */ for (n = 1; n != XHCI_MAX_ENDPOINTS; n++) { err = xhci_cmd_stop_ep(sc, 1, n, index); if (err != 0) { DPRINTF("Failed to suspend endpoint " "%u on slot %u (ignored).\n", n, index); } } XHCI_CMD_UNLOCK(sc); } static void xhci_set_hw_power(struct usb_bus *bus) { DPRINTF("\n"); } static void xhci_device_state_change(struct usb_device *udev) { struct xhci_softc *sc = XHCI_BUS2SC(udev->bus); struct usb_page_search buf_inp; usb_error_t err; uint8_t index; /* check for root HUB */ if (udev->parent_hub == NULL) return; index = udev->controller_slot_id; DPRINTF("\n"); if (usb_get_device_state(udev) == USB_STATE_CONFIGURED) { err = uhub_query_info(udev, &sc->sc_hw.devs[index].nports, &sc->sc_hw.devs[index].tt); if (err != 0) sc->sc_hw.devs[index].nports = 0; } XHCI_CMD_LOCK(sc); switch (usb_get_device_state(udev)) { case USB_STATE_POWERED: if (sc->sc_hw.devs[index].state == XHCI_ST_DEFAULT) break; /* set default state */ sc->sc_hw.devs[index].state = XHCI_ST_DEFAULT; /* reset number of contexts */ sc->sc_hw.devs[index].context_num = 0; err = xhci_cmd_reset_dev(sc, index); if (err != 0) { DPRINTF("Device reset failed " "for slot %u.\n", index); } break; case USB_STATE_ADDRESSED: if (sc->sc_hw.devs[index].state == XHCI_ST_ADDRESSED) break; sc->sc_hw.devs[index].state = XHCI_ST_ADDRESSED; /* set configure mask to slot only */ xhci_configure_mask(udev, 1, 0); /* deconfigure all endpoints, except EP0 */ err = xhci_cmd_configure_ep(sc, 0, 1, index); if (err) { DPRINTF("Failed to deconfigure " "slot %u.\n", index); } break; case USB_STATE_CONFIGURED: if (sc->sc_hw.devs[index].state == XHCI_ST_CONFIGURED) break; /* set configured state */ sc->sc_hw.devs[index].state = XHCI_ST_CONFIGURED; /* reset number of contexts */ sc->sc_hw.devs[index].context_num = 0; usbd_get_page(&sc->sc_hw.devs[index].input_pc, 0, &buf_inp); xhci_configure_mask(udev, 3, 0); err = xhci_configure_device(udev); if (err != 0) 
{ DPRINTF("Could not configure device " "at slot %u.\n", index); } err = xhci_cmd_evaluate_ctx(sc, buf_inp.physaddr, index); if (err != 0) { DPRINTF("Could not evaluate device " "context at slot %u.\n", index); } break; default: break; } XHCI_CMD_UNLOCK(sc); } static usb_error_t xhci_set_endpoint_mode(struct usb_device *udev, struct usb_endpoint *ep, uint8_t ep_mode) { switch (ep_mode) { case USB_EP_MODE_DEFAULT: return (0); case USB_EP_MODE_STREAMS: if (xhcistreams == 0 || (ep->edesc->bmAttributes & UE_XFERTYPE) != UE_BULK || udev->speed != USB_SPEED_SUPER) return (USB_ERR_INVAL); return (0); default: return (USB_ERR_INVAL); } } static const struct usb_bus_methods xhci_bus_methods = { .endpoint_init = xhci_ep_init, .endpoint_uninit = xhci_ep_uninit, .xfer_setup = xhci_xfer_setup, .xfer_unsetup = xhci_xfer_unsetup, .get_dma_delay = xhci_get_dma_delay, .device_init = xhci_device_init, .device_uninit = xhci_device_uninit, .device_resume = xhci_device_resume, .device_suspend = xhci_device_suspend, .set_hw_power = xhci_set_hw_power, .roothub_exec = xhci_roothub_exec, .xfer_poll = xhci_do_poll, .start_dma_delay = xhci_start_dma_delay, .set_address = xhci_set_address, .clear_stall = xhci_ep_clear_stall, .device_state_change = xhci_device_state_change, .set_hw_power_sleep = xhci_set_hw_power_sleep, .set_endpoint_mode = xhci_set_endpoint_mode, }; Index: projects/clang900-import/sys/dev/usb/usb_bus.h =================================================================== --- projects/clang900-import/sys/dev/usb/usb_bus.h (revision 352586) +++ projects/clang900-import/sys/dev/usb/usb_bus.h (revision 352587) @@ -1,136 +1,137 @@ /* $FreeBSD$ */ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008 Hans Petter Selasky. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _USB_BUS_H_ #define _USB_BUS_H_ struct usb_fs_privdata; /* * The following structure defines the USB explore message sent to the USB * explore process. */ struct usb_bus_msg { struct usb_proc_msg hdr; struct usb_bus *bus; }; /* * The following structure defines the USB statistics structure. */ struct usb_bus_stat { uint32_t uds_requests[4]; }; /* * The following structure defines an USB BUS. There is one USB BUS * for every Host or Device controller. 
*/ struct usb_bus { struct usb_bus_stat stats_err; struct usb_bus_stat stats_ok; #if USB_HAVE_ROOT_MOUNT_HOLD struct root_hold_token *bus_roothold; #endif /* convenience macros */ #define USB_BUS_TT_PROC(bus) USB_BUS_NON_GIANT_ISOC_PROC(bus) #define USB_BUS_CS_PROC(bus) USB_BUS_NON_GIANT_ISOC_PROC(bus) #if USB_HAVE_PER_BUS_PROCESS #define USB_BUS_GIANT_PROC(bus) (&(bus)->giant_callback_proc) #define USB_BUS_NON_GIANT_ISOC_PROC(bus) (&(bus)->non_giant_isoc_callback_proc) #define USB_BUS_NON_GIANT_BULK_PROC(bus) (&(bus)->non_giant_bulk_callback_proc) #define USB_BUS_EXPLORE_PROC(bus) (&(bus)->explore_proc) #define USB_BUS_CONTROL_XFER_PROC(bus) (&(bus)->control_xfer_proc) /* * There are three callback processes. One for Giant locked * callbacks. One for non-Giant locked non-periodic callbacks * and one for non-Giant locked periodic callbacks. This * should avoid congestion and reduce response time in most * cases. */ struct usb_process giant_callback_proc; struct usb_process non_giant_isoc_callback_proc; struct usb_process non_giant_bulk_callback_proc; /* Explore process */ struct usb_process explore_proc; /* Control request process */ struct usb_process control_xfer_proc; #endif struct usb_bus_msg explore_msg[2]; struct usb_bus_msg detach_msg[2]; struct usb_bus_msg attach_msg[2]; struct usb_bus_msg suspend_msg[2]; struct usb_bus_msg resume_msg[2]; struct usb_bus_msg reset_msg[2]; struct usb_bus_msg shutdown_msg[2]; #if USB_HAVE_UGEN struct usb_bus_msg cleanup_msg[2]; LIST_HEAD(,usb_fs_privdata) pd_cleanup_list; #endif /* * This mutex protects the USB hardware: */ struct mtx bus_mtx; struct mtx bus_spin_lock; struct usb_xfer_queue intr_q; struct usb_callout power_wdog; /* power management */ device_t parent; device_t bdev; /* filled by HC driver */ #if USB_HAVE_BUSDMA struct usb_dma_parent_tag dma_parent_tag[1]; struct usb_dma_tag dma_tags[USB_BUS_DMA_TAG_MAX]; #endif const struct usb_bus_methods *methods; /* filled by HC driver */ struct usb_device **devices; struct ifnet *ifp; /* only for USB Packet Filter */ usb_power_mask_t hw_power_state; /* see USB_HW_POWER_XXX */ usb_size_t uframe_usage[USB_HS_MICRO_FRAMES_MAX]; uint16_t isoc_time_last; /* in milliseconds */ uint8_t alloc_failed; /* Set if memory allocation failed. */ uint8_t driver_added_refcount; /* Current driver generation count */ enum usb_revision usbrev; /* USB revision. See "USB_REV_XXX". */ uint8_t devices_max; /* maximum number of USB devices */ uint8_t do_probe; /* set if USB should be re-probed */ uint8_t no_explore; /* don't explore USB ports */ uint8_t dma_bits; /* number of DMA address lines */ + uint8_t control_ep_quirk; /* need 64kByte buffer for data stage */ }; #endif /* _USB_BUS_H_ */ Index: projects/clang900-import/sys/dev/usb/usb_ioctl.h =================================================================== --- projects/clang900-import/sys/dev/usb/usb_ioctl.h (revision 352586) +++ projects/clang900-import/sys/dev/usb/usb_ioctl.h (revision 352587) @@ -1,349 +1,349 @@ /* $FreeBSD$ */ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008 Hans Petter Selasky. All rights reserved. * Copyright (c) 1998 The NetBSD Foundation, Inc. All rights reserved. * Copyright (c) 1998 Lennart Augustsson. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _USB_IOCTL_H_ #define _USB_IOCTL_H_ #ifndef USB_GLOBAL_INCLUDE_FILE #include #include /* Building "kdump" depends on these includes */ #include #include #endif #define USB_DEVICE_NAME "usbctl" #define USB_DEVICE_DIR "usb" #define USB_GENERIC_NAME "ugen" #define USB_TEMPLATE_SYSCTL "hw.usb.template" /* integer type */ /* * Align IOCTL structures to hide differences when running 32-bit * programs under 64-bit kernels: */ #ifdef COMPAT_32BIT #define USB_IOCTL_STRUCT_ALIGN(n) __aligned(n) #else #define USB_IOCTL_STRUCT_ALIGN(n) #endif /* Definition of valid template sysctl values */ enum { USB_TEMP_MSC, /* USB Mass Storage */ USB_TEMP_CDCE, /* USB CDC Ethernet */ USB_TEMP_MTP, /* Message Transfer Protocol */ USB_TEMP_MODEM, /* USB CDC Modem */ USB_TEMP_AUDIO, /* USB Audio */ USB_TEMP_KBD, /* USB Keyboard */ USB_TEMP_MOUSE, /* USB Mouse */ USB_TEMP_PHONE, /* USB Phone */ USB_TEMP_SERIALNET, /* USB CDC Ethernet and Modem */ USB_TEMP_MIDI, /* USB MIDI */ USB_TEMP_MULTI, /* USB Ethernet, serial, and storage */ USB_TEMP_CDCEEM, /* USB Ethernet Emulation Model */ USB_TEMP_MAX, }; struct usb_read_dir { #ifdef COMPAT_32BIT uint64_t urd_data; #else void *urd_data; #endif uint32_t urd_startentry; uint32_t urd_maxlen; } USB_IOCTL_STRUCT_ALIGN(8); struct usb_ctl_request { #ifdef COMPAT_32BIT uint64_t ucr_data; #else void *ucr_data; #endif uint16_t ucr_flags; uint16_t ucr_actlen; /* actual length transferred */ uint8_t ucr_addr; /* zero - currently not used */ struct usb_device_request ucr_request; } USB_IOCTL_STRUCT_ALIGN(8); struct usb_alt_interface { uint8_t uai_interface_index; uint8_t uai_alt_index; } USB_IOCTL_STRUCT_ALIGN(1); struct usb_gen_descriptor { #ifdef COMPAT_32BIT uint64_t ugd_data; #else void *ugd_data; #endif uint16_t ugd_lang_id; uint16_t ugd_maxlen; uint16_t ugd_actlen; uint16_t ugd_offset; uint8_t ugd_config_index; uint8_t ugd_string_index; uint8_t ugd_iface_index; uint8_t ugd_altif_index; uint8_t ugd_endpt_index; uint8_t ugd_report_type; uint8_t reserved[8]; } USB_IOCTL_STRUCT_ALIGN(8); struct usb_device_info { uint16_t udi_productNo; uint16_t udi_vendorNo; uint16_t udi_releaseNo; uint16_t udi_power; /* power consumption in mA, 0 if * selfpowered */ uint8_t udi_bus; uint8_t udi_addr; /* device address */ uint8_t udi_index; /* device index */ uint8_t udi_class; uint8_t udi_subclass; uint8_t udi_protocol; uint8_t udi_config_no; /* current config number */ uint8_t udi_config_index; /* current config index */ uint8_t udi_speed; /* see "USB_SPEED_XXX" */ uint8_t udi_mode; /* see "USB_MODE_XXX" */ uint8_t udi_nports; uint8_t 
udi_hubaddr; /* parent HUB address */ uint8_t udi_hubindex; /* parent HUB device index */ uint8_t udi_hubport; /* parent HUB port */ uint8_t udi_power_mode; /* see "USB_POWER_MODE_XXX" */ uint8_t udi_suspended; /* set if device is suspended */ uint8_t udi_reserved[16]; /* leave space for the future */ char udi_product[128]; char udi_vendor[128]; char udi_serial[64]; char udi_release[8]; } USB_IOCTL_STRUCT_ALIGN(2); #define USB_DEVICE_PORT_PATH_MAX 32 struct usb_device_port_path { uint8_t udp_bus; /* which bus we are on */ uint8_t udp_index; /* which device index */ uint8_t udp_port_level; /* how many levels: 0, 1, 2 ... */ uint8_t udp_port_no[USB_DEVICE_PORT_PATH_MAX]; } USB_IOCTL_STRUCT_ALIGN(1); struct usb_device_stats { uint32_t uds_requests_ok[4]; /* Indexed by transfer type UE_XXX */ uint32_t uds_requests_fail[4]; /* Indexed by transfer type UE_XXX */ } USB_IOCTL_STRUCT_ALIGN(4); struct usb_fs_start { uint8_t ep_index; } USB_IOCTL_STRUCT_ALIGN(1); struct usb_fs_stop { uint8_t ep_index; } USB_IOCTL_STRUCT_ALIGN(1); struct usb_fs_complete { uint8_t ep_index; } USB_IOCTL_STRUCT_ALIGN(1); /* This structure is used for all endpoint types */ struct usb_fs_endpoint { /* * NOTE: isochronous USB transfer only use one buffer, but can have * multiple frame lengths ! */ #ifdef COMPAT_32BIT uint64_t ppBuffer; uint64_t pLength; #else void **ppBuffer; /* pointer to userland buffers */ uint32_t *pLength; /* pointer to frame lengths, updated * to actual length */ #endif uint32_t nFrames; /* number of frames */ uint32_t aFrames; /* actual number of frames */ uint16_t flags; /* a single short frame will terminate */ #define USB_FS_FLAG_SINGLE_SHORT_OK 0x0001 /* multiple short frames are allowed */ #define USB_FS_FLAG_MULTI_SHORT_OK 0x0002 /* all frame(s) transmitted are short terminated */ #define USB_FS_FLAG_FORCE_SHORT 0x0004 /* will do a clear-stall before xfer */ #define USB_FS_FLAG_CLEAR_STALL 0x0008 uint16_t timeout; /* in milliseconds */ /* isocronous completion time in milliseconds - used for echo cancel */ uint16_t isoc_time_complete; /* timeout value for no timeout */ #define USB_FS_TIMEOUT_NONE 0 int status; /* see USB_ERR_XXX */ } USB_IOCTL_STRUCT_ALIGN(8); struct usb_fs_init { /* userland pointer to endpoints structure */ #ifdef COMPAT_32BIT uint64_t pEndpoints; #else struct usb_fs_endpoint *pEndpoints; #endif /* maximum number of endpoints */ uint8_t ep_index_max; } USB_IOCTL_STRUCT_ALIGN(8); struct usb_fs_uninit { uint8_t dummy; /* zero */ } USB_IOCTL_STRUCT_ALIGN(1); struct usb_fs_open { -#define USB_FS_MAX_BUFSIZE (1 << 18) +#define USB_FS_MAX_BUFSIZE (1 << 25) /* 32 MBytes */ uint32_t max_bufsize; #define USB_FS_MAX_FRAMES (1U << 12) #define USB_FS_MAX_FRAMES_PRE_SCALE (1U << 31) /* for ISOCHRONOUS transfers */ uint32_t max_frames; /* read and write */ uint16_t max_packet_length; /* read only */ uint8_t dev_index; /* currently unused */ uint8_t ep_index; uint8_t ep_no; /* bEndpointNumber */ } USB_IOCTL_STRUCT_ALIGN(4); struct usb_fs_open_stream { struct usb_fs_open fs_open; uint16_t stream_id; /* stream ID */ } USB_IOCTL_STRUCT_ALIGN(4); struct usb_fs_close { uint8_t ep_index; } USB_IOCTL_STRUCT_ALIGN(1); struct usb_fs_clear_stall_sync { uint8_t ep_index; } USB_IOCTL_STRUCT_ALIGN(1); struct usb_gen_quirk { uint16_t index; /* Quirk Index */ uint16_t vid; /* Vendor ID */ uint16_t pid; /* Product ID */ uint16_t bcdDeviceLow; /* Low Device Revision */ uint16_t bcdDeviceHigh; /* High Device Revision */ uint16_t reserved[2]; /* * String version of quirk including terminating zero. 
See * UQ_XXX in "usb_quirk.h". */ char quirkname[64 - 14]; } USB_IOCTL_STRUCT_ALIGN(2); /* USB controller */ #define USB_REQUEST _IOWR('U', 1, struct usb_ctl_request) #define USB_SETDEBUG _IOW ('U', 2, int) #define USB_DISCOVER _IO ('U', 3) #define USB_DEVICEINFO _IOWR('U', 4, struct usb_device_info) #define USB_DEVICESTATS _IOR ('U', 5, struct usb_device_stats) #define USB_DEVICEENUMERATE _IOW ('U', 6, int) /* Generic HID device */ #define USB_GET_REPORT_DESC _IOWR('U', 21, struct usb_gen_descriptor) #define USB_SET_IMMED _IOW ('U', 22, int) #define USB_GET_REPORT _IOWR('U', 23, struct usb_gen_descriptor) #define USB_SET_REPORT _IOW ('U', 24, struct usb_gen_descriptor) #define USB_GET_REPORT_ID _IOR ('U', 25, int) /* Generic USB device */ #define USB_GET_CONFIG _IOR ('U', 100, int) #define USB_SET_CONFIG _IOW ('U', 101, int) #define USB_GET_ALTINTERFACE _IOWR('U', 102, struct usb_alt_interface) #define USB_SET_ALTINTERFACE _IOWR('U', 103, struct usb_alt_interface) #define USB_GET_DEVICE_DESC _IOR ('U', 105, struct usb_device_descriptor) #define USB_GET_CONFIG_DESC _IOR ('U', 106, struct usb_config_descriptor) #define USB_GET_RX_INTERFACE_DESC _IOR ('U', 107, struct usb_interface_descriptor) #define USB_GET_RX_ENDPOINT_DESC _IOR ('U', 108, struct usb_endpoint_descriptor) #define USB_GET_FULL_DESC _IOWR('U', 109, struct usb_gen_descriptor) #define USB_GET_STRING_DESC _IOWR('U', 110, struct usb_gen_descriptor) #define USB_DO_REQUEST _IOWR('U', 111, struct usb_ctl_request) #define USB_GET_DEVICEINFO _IOR ('U', 112, struct usb_device_info) #define USB_SET_RX_SHORT_XFER _IOW ('U', 113, int) #define USB_SET_RX_TIMEOUT _IOW ('U', 114, int) #define USB_GET_RX_FRAME_SIZE _IOR ('U', 115, int) #define USB_GET_RX_BUFFER_SIZE _IOR ('U', 117, int) #define USB_SET_RX_BUFFER_SIZE _IOW ('U', 118, int) #define USB_SET_RX_STALL_FLAG _IOW ('U', 119, int) #define USB_SET_TX_STALL_FLAG _IOW ('U', 120, int) #define USB_GET_IFACE_DRIVER _IOWR('U', 121, struct usb_gen_descriptor) #define USB_CLAIM_INTERFACE _IOW ('U', 122, int) #define USB_RELEASE_INTERFACE _IOW ('U', 123, int) #define USB_IFACE_DRIVER_ACTIVE _IOW ('U', 124, int) #define USB_IFACE_DRIVER_DETACH _IOW ('U', 125, int) #define USB_GET_PLUGTIME _IOR ('U', 126, uint32_t) #define USB_READ_DIR _IOW ('U', 127, struct usb_read_dir) /* 128 - 133 unused */ #define USB_GET_DEV_PORT_PATH _IOR ('U', 134, struct usb_device_port_path) #define USB_GET_POWER_USAGE _IOR ('U', 135, int) #define USB_SET_TX_FORCE_SHORT _IOW ('U', 136, int) #define USB_SET_TX_TIMEOUT _IOW ('U', 137, int) #define USB_GET_TX_FRAME_SIZE _IOR ('U', 138, int) #define USB_GET_TX_BUFFER_SIZE _IOR ('U', 139, int) #define USB_SET_TX_BUFFER_SIZE _IOW ('U', 140, int) #define USB_GET_TX_INTERFACE_DESC _IOR ('U', 141, struct usb_interface_descriptor) #define USB_GET_TX_ENDPOINT_DESC _IOR ('U', 142, struct usb_endpoint_descriptor) #define USB_SET_PORT_ENABLE _IOW ('U', 143, int) #define USB_SET_PORT_DISABLE _IOW ('U', 144, int) #define USB_SET_POWER_MODE _IOW ('U', 145, int) #define USB_GET_POWER_MODE _IOR ('U', 146, int) #define USB_SET_TEMPLATE _IOW ('U', 147, int) #define USB_GET_TEMPLATE _IOR ('U', 148, int) /* Modem device */ #define USB_GET_CM_OVER_DATA _IOR ('U', 180, int) #define USB_SET_CM_OVER_DATA _IOW ('U', 181, int) /* GPIO control */ #define USB_GET_GPIO _IOR ('U', 182, int) #define USB_SET_GPIO _IOW ('U', 183, int) /* USB file system interface */ #define USB_FS_START _IOW ('U', 192, struct usb_fs_start) #define USB_FS_STOP _IOW ('U', 193, struct usb_fs_stop) #define USB_FS_COMPLETE _IOR 
('U', 194, struct usb_fs_complete) #define USB_FS_INIT _IOW ('U', 195, struct usb_fs_init) #define USB_FS_UNINIT _IOW ('U', 196, struct usb_fs_uninit) #define USB_FS_OPEN _IOWR('U', 197, struct usb_fs_open) #define USB_FS_CLOSE _IOW ('U', 198, struct usb_fs_close) #define USB_FS_CLEAR_STALL_SYNC _IOW ('U', 199, struct usb_fs_clear_stall_sync) #define USB_FS_OPEN_STREAM _IOWR('U', 200, struct usb_fs_open_stream) /* USB quirk system interface */ #define USB_DEV_QUIRK_GET _IOWR('Q', 0, struct usb_gen_quirk) #define USB_QUIRK_NAME_GET _IOWR('Q', 1, struct usb_gen_quirk) #define USB_DEV_QUIRK_ADD _IOW ('Q', 2, struct usb_gen_quirk) #define USB_DEV_QUIRK_REMOVE _IOW ('Q', 3, struct usb_gen_quirk) #endif /* _USB_IOCTL_H_ */ Index: projects/clang900-import/sys/dev/usb/usb_transfer.c =================================================================== --- projects/clang900-import/sys/dev/usb/usb_transfer.c (revision 352586) +++ projects/clang900-import/sys/dev/usb/usb_transfer.c (revision 352587) @@ -1,3556 +1,3585 @@ /* $FreeBSD$ */ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008 Hans Petter Selasky. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #ifdef USB_GLOBAL_INCLUDE_FILE #include USB_GLOBAL_INCLUDE_FILE #else #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define USB_DEBUG_VAR usb_debug #include #include #include #include #include #include #include #include #include #include #endif /* USB_GLOBAL_INCLUDE_FILE */ struct usb_std_packet_size { struct { uint16_t min; /* inclusive */ uint16_t max; /* inclusive */ } range; uint16_t fixed[4]; }; static usb_callback_t usb_request_callback; static const struct usb_config usb_control_ep_cfg[USB_CTRL_XFER_MAX] = { /* This transfer is used for generic control endpoint transfers */ [0] = { .type = UE_CONTROL, .endpoint = 0x00, /* Control endpoint */ .direction = UE_DIR_ANY, .bufsize = USB_EP0_BUFSIZE, /* bytes */ .flags = {.proxy_buffer = 1,}, .callback = &usb_request_callback, .usb_mode = USB_MODE_DUAL, /* both modes */ }, /* This transfer is used for generic clear stall only */ [1] = { .type = UE_CONTROL, .endpoint = 0x00, /* Control pipe */ .direction = UE_DIR_ANY, .bufsize = sizeof(struct usb_device_request), .callback = &usb_do_clear_stall_callback, .timeout = 1000, /* 1 second */ .interval = 50, /* 50ms */ .usb_mode = USB_MODE_HOST, }, }; +static const struct usb_config usb_control_ep_quirk_cfg[USB_CTRL_XFER_MAX] = { + + /* This transfer is used for generic control endpoint transfers */ + + [0] = { + .type = UE_CONTROL, + .endpoint = 0x00, /* Control endpoint */ + .direction = UE_DIR_ANY, + .bufsize = 65535, /* bytes */ + .callback = &usb_request_callback, + .usb_mode = USB_MODE_DUAL, /* both modes */ + }, + + /* This transfer is used for generic clear stall only */ + + [1] = { + .type = UE_CONTROL, + .endpoint = 0x00, /* Control pipe */ + .direction = UE_DIR_ANY, + .bufsize = sizeof(struct usb_device_request), + .callback = &usb_do_clear_stall_callback, + .timeout = 1000, /* 1 second */ + .interval = 50, /* 50ms */ + .usb_mode = USB_MODE_HOST, + }, +}; + /* function prototypes */ static void usbd_update_max_frame_size(struct usb_xfer *); static void usbd_transfer_unsetup_sub(struct usb_xfer_root *, uint8_t); static void usbd_control_transfer_init(struct usb_xfer *); static int usbd_setup_ctrl_transfer(struct usb_xfer *); static void usb_callback_proc(struct usb_proc_msg *); static void usbd_callback_ss_done_defer(struct usb_xfer *); static void usbd_callback_wrapper(struct usb_xfer_queue *); static void usbd_transfer_start_cb(void *); static uint8_t usbd_callback_wrapper_sub(struct usb_xfer *); static void usbd_get_std_packet_size(struct usb_std_packet_size *ptr, uint8_t type, enum usb_dev_speed speed); /*------------------------------------------------------------------------* * usb_request_callback *------------------------------------------------------------------------*/ static void usb_request_callback(struct usb_xfer *xfer, usb_error_t error) { if (xfer->flags_int.usb_mode == USB_MODE_DEVICE) usb_handle_request_callback(xfer, error); else usbd_do_request_callback(xfer, error); } /*------------------------------------------------------------------------* * usbd_update_max_frame_size * * This function updates the maximum frame size, hence high speed USB * can transfer multiple consecutive packets. 
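 *
 * The value computed below is simply "max_packet_size" multiplied by
 * "max_packet_count"; a high speed, high bandwidth endpoint can for
 * example transfer up to 3 * 1024 = 3072 bytes per microframe.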
*------------------------------------------------------------------------*/ static void usbd_update_max_frame_size(struct usb_xfer *xfer) { /* compute maximum frame size */ /* this computation should not overflow 16-bit */ /* max = 15 * 1024 */ xfer->max_frame_size = xfer->max_packet_size * xfer->max_packet_count; } /*------------------------------------------------------------------------* * usbd_get_dma_delay * * The following function is called when we need to * synchronize with DMA hardware. * * Returns: * 0: no DMA delay required * Else: milliseconds of DMA delay *------------------------------------------------------------------------*/ usb_timeout_t usbd_get_dma_delay(struct usb_device *udev) { const struct usb_bus_methods *mtod; uint32_t temp; mtod = udev->bus->methods; temp = 0; if (mtod->get_dma_delay) { (mtod->get_dma_delay) (udev, &temp); /* * Round up and convert to milliseconds. Note that we use * 1024 milliseconds per second. to save a division. */ temp += 0x3FF; temp /= 0x400; } return (temp); } /*------------------------------------------------------------------------* * usbd_transfer_setup_sub_malloc * * This function will allocate one or more DMA'able memory chunks * according to "size", "align" and "count" arguments. "ppc" is * pointed to a linear array of USB page caches afterwards. * * If the "align" argument is equal to "1" a non-contiguous allocation * can happen. Else if the "align" argument is greater than "1", the * allocation will always be contiguous in memory. * * Returns: * 0: Success * Else: Failure *------------------------------------------------------------------------*/ #if USB_HAVE_BUSDMA uint8_t usbd_transfer_setup_sub_malloc(struct usb_setup_params *parm, struct usb_page_cache **ppc, usb_size_t size, usb_size_t align, usb_size_t count) { struct usb_page_cache *pc; struct usb_page *pg; void *buf; usb_size_t n_dma_pc; usb_size_t n_dma_pg; usb_size_t n_obj; usb_size_t x; usb_size_t y; usb_size_t r; usb_size_t z; USB_ASSERT(align > 0, ("Invalid alignment, 0x%08x\n", align)); USB_ASSERT(size > 0, ("Invalid size = 0\n")); if (count == 0) { return (0); /* nothing to allocate */ } /* * Make sure that the size is aligned properly. */ size = -((-size) & (-align)); /* * Try multi-allocation chunks to reduce the number of DMA * allocations, hence DMA allocations are slow. */ if (align == 1) { /* special case - non-cached multi page DMA memory */ n_dma_pc = count; n_dma_pg = (2 + (size / USB_PAGE_SIZE)); n_obj = 1; } else if (size >= USB_PAGE_SIZE) { n_dma_pc = count; n_dma_pg = 1; n_obj = 1; } else { /* compute number of objects per page */ #ifdef USB_DMA_SINGLE_ALLOC n_obj = 1; #else n_obj = (USB_PAGE_SIZE / size); #endif /* * Compute number of DMA chunks, rounded up * to nearest one: */ n_dma_pc = howmany(count, n_obj); n_dma_pg = 1; } /* * DMA memory is allocated once, but mapped twice. That's why * there is one list for auto-free and another list for * non-auto-free which only holds the mapping and not the * allocation. 
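 *
 * As an illustration of the chunking above (assuming USB_PAGE_SIZE
 * is 4096 and USB_DMA_SINGLE_ALLOC is not defined): a request for
 * count = 32 objects of size = 512 with align = 512 yields
 * n_obj = 8 objects per chunk, hence n_dma_pc = 4 DMA chunks of one
 * page each (n_dma_pg = 1).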
*/ if (parm->buf == NULL) { /* reserve memory (auto-free) */ parm->dma_page_ptr += n_dma_pc * n_dma_pg; parm->dma_page_cache_ptr += n_dma_pc; /* reserve memory (no-auto-free) */ parm->dma_page_ptr += count * n_dma_pg; parm->xfer_page_cache_ptr += count; return (0); } for (x = 0; x != n_dma_pc; x++) { /* need to initialize the page cache */ parm->dma_page_cache_ptr[x].tag_parent = &parm->curr_xfer->xroot->dma_parent_tag; } for (x = 0; x != count; x++) { /* need to initialize the page cache */ parm->xfer_page_cache_ptr[x].tag_parent = &parm->curr_xfer->xroot->dma_parent_tag; } if (ppc != NULL) { if (n_obj != 1) *ppc = parm->xfer_page_cache_ptr; else *ppc = parm->dma_page_cache_ptr; } r = count; /* set remainder count */ z = n_obj * size; /* set allocation size */ pc = parm->xfer_page_cache_ptr; pg = parm->dma_page_ptr; if (n_obj == 1) { /* * Avoid mapping memory twice if only a single object * should be allocated per page cache: */ for (x = 0; x != n_dma_pc; x++) { if (usb_pc_alloc_mem(parm->dma_page_cache_ptr, pg, z, align)) { return (1); /* failure */ } /* Make room for one DMA page cache and "n_dma_pg" pages */ parm->dma_page_cache_ptr++; pg += n_dma_pg; } } else { for (x = 0; x != n_dma_pc; x++) { if (r < n_obj) { /* compute last remainder */ z = r * size; n_obj = r; } if (usb_pc_alloc_mem(parm->dma_page_cache_ptr, pg, z, align)) { return (1); /* failure */ } /* Set beginning of current buffer */ buf = parm->dma_page_cache_ptr->buffer; /* Make room for one DMA page cache and "n_dma_pg" pages */ parm->dma_page_cache_ptr++; pg += n_dma_pg; for (y = 0; (y != n_obj); y++, r--, pc++, pg += n_dma_pg) { /* Load sub-chunk into DMA */ if (usb_pc_dmamap_create(pc, size)) { return (1); /* failure */ } pc->buffer = USB_ADD_BYTES(buf, y * size); pc->page_start = pg; USB_MTX_LOCK(pc->tag_parent->mtx); if (usb_pc_load_mem(pc, size, 1 /* synchronous */ )) { USB_MTX_UNLOCK(pc->tag_parent->mtx); return (1); /* failure */ } USB_MTX_UNLOCK(pc->tag_parent->mtx); } } } parm->xfer_page_cache_ptr = pc; parm->dma_page_ptr = pg; return (0); } #endif /*------------------------------------------------------------------------* * usbd_transfer_setup_sub - transfer setup subroutine * * This function must be called from the "xfer_setup" callback of the * USB Host or Device controller driver when setting up an USB * transfer. This function will setup correct packet sizes, buffer * sizes, flags and more, that are stored in the "usb_xfer" * structure. *------------------------------------------------------------------------*/ void usbd_transfer_setup_sub(struct usb_setup_params *parm) { enum { REQ_SIZE = 8, MIN_PKT = 8, }; struct usb_xfer *xfer = parm->curr_xfer; const struct usb_config *setup = parm->curr_setup; struct usb_endpoint_ss_comp_descriptor *ecomp; struct usb_endpoint_descriptor *edesc; struct usb_std_packet_size std_size; usb_frcount_t n_frlengths; usb_frcount_t n_frbuffers; usb_frcount_t x; uint16_t maxp_old; uint8_t type; uint8_t zmps; /* * Sanity check. The following parameters must be initialized before * calling this function. 
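 * The required fields are "hc_max_packet_size",
 * "hc_max_packet_count" and "hc_max_frame_size", which the host
 * controller driver fills in from its "xfer_setup" method; all three
 * are checked for zero right below.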
*/ if ((parm->hc_max_packet_size == 0) || (parm->hc_max_packet_count == 0) || (parm->hc_max_frame_size == 0)) { parm->err = USB_ERR_INVAL; goto done; } edesc = xfer->endpoint->edesc; ecomp = xfer->endpoint->ecomp; type = (edesc->bmAttributes & UE_XFERTYPE); xfer->flags = setup->flags; xfer->nframes = setup->frames; xfer->timeout = setup->timeout; xfer->callback = setup->callback; xfer->interval = setup->interval; xfer->endpointno = edesc->bEndpointAddress; xfer->max_packet_size = UGETW(edesc->wMaxPacketSize); xfer->max_packet_count = 1; /* make a shadow copy: */ xfer->flags_int.usb_mode = parm->udev->flags.usb_mode; parm->bufsize = setup->bufsize; switch (parm->speed) { case USB_SPEED_HIGH: switch (type) { case UE_ISOCHRONOUS: case UE_INTERRUPT: xfer->max_packet_count += (xfer->max_packet_size >> 11) & 3; /* check for invalid max packet count */ if (xfer->max_packet_count > 3) xfer->max_packet_count = 3; break; default: break; } xfer->max_packet_size &= 0x7FF; break; case USB_SPEED_SUPER: xfer->max_packet_count += (xfer->max_packet_size >> 11) & 3; if (ecomp != NULL) xfer->max_packet_count += ecomp->bMaxBurst; if ((xfer->max_packet_count == 0) || (xfer->max_packet_count > 16)) xfer->max_packet_count = 16; switch (type) { case UE_CONTROL: xfer->max_packet_count = 1; break; case UE_ISOCHRONOUS: if (ecomp != NULL) { uint8_t mult; mult = UE_GET_SS_ISO_MULT( ecomp->bmAttributes) + 1; if (mult > 3) mult = 3; xfer->max_packet_count *= mult; } break; default: break; } xfer->max_packet_size &= 0x7FF; break; default: break; } /* range check "max_packet_count" */ if (xfer->max_packet_count > parm->hc_max_packet_count) { xfer->max_packet_count = parm->hc_max_packet_count; } /* store max packet size value before filtering */ maxp_old = xfer->max_packet_size; /* filter "wMaxPacketSize" according to HC capabilities */ if ((xfer->max_packet_size > parm->hc_max_packet_size) || (xfer->max_packet_size == 0)) { xfer->max_packet_size = parm->hc_max_packet_size; } /* filter "wMaxPacketSize" according to standard sizes */ usbd_get_std_packet_size(&std_size, type, parm->speed); if (std_size.range.min || std_size.range.max) { if (xfer->max_packet_size < std_size.range.min) { xfer->max_packet_size = std_size.range.min; } if (xfer->max_packet_size > std_size.range.max) { xfer->max_packet_size = std_size.range.max; } } else { if (xfer->max_packet_size >= std_size.fixed[3]) { xfer->max_packet_size = std_size.fixed[3]; } else if (xfer->max_packet_size >= std_size.fixed[2]) { xfer->max_packet_size = std_size.fixed[2]; } else if (xfer->max_packet_size >= std_size.fixed[1]) { xfer->max_packet_size = std_size.fixed[1]; } else { /* only one possibility left */ xfer->max_packet_size = std_size.fixed[0]; } } /* * Check if the max packet size was outside its allowed range * and clamped to a valid value: */ if (maxp_old != xfer->max_packet_size) xfer->flags_int.maxp_was_clamped = 1; /* compute "max_frame_size" */ usbd_update_max_frame_size(xfer); /* check interrupt interval and transfer pre-delay */ if (type == UE_ISOCHRONOUS) { uint16_t frame_limit; xfer->interval = 0; /* not used, must be zero */ xfer->flags_int.isochronous_xfr = 1; /* set flag */ if (xfer->timeout == 0) { /* * set a default timeout in * case something goes wrong! 
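				 * (1000 / 4 evaluates to a 250
				 * millisecond default.)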
*/ xfer->timeout = 1000 / 4; } switch (parm->speed) { case USB_SPEED_LOW: case USB_SPEED_FULL: frame_limit = USB_MAX_FS_ISOC_FRAMES_PER_XFER; xfer->fps_shift = 0; break; default: frame_limit = USB_MAX_HS_ISOC_FRAMES_PER_XFER; xfer->fps_shift = edesc->bInterval; if (xfer->fps_shift > 0) xfer->fps_shift--; if (xfer->fps_shift > 3) xfer->fps_shift = 3; if (xfer->flags.pre_scale_frames != 0) xfer->nframes <<= (3 - xfer->fps_shift); break; } if (xfer->nframes > frame_limit) { /* * this is not going to work * cross hardware */ parm->err = USB_ERR_INVAL; goto done; } if (xfer->nframes == 0) { /* * this is not a valid value */ parm->err = USB_ERR_ZERO_NFRAMES; goto done; } } else { /* * If a value is specified use that else check the * endpoint descriptor! */ if (type == UE_INTERRUPT) { uint32_t temp; if (xfer->interval == 0) { xfer->interval = edesc->bInterval; switch (parm->speed) { case USB_SPEED_LOW: case USB_SPEED_FULL: break; default: /* 125us -> 1ms */ if (xfer->interval < 4) xfer->interval = 1; else if (xfer->interval > 16) xfer->interval = (1 << (16 - 4)); else xfer->interval = (1 << (xfer->interval - 4)); break; } } if (xfer->interval == 0) { /* * One millisecond is the smallest * interval we support: */ xfer->interval = 1; } xfer->fps_shift = 0; temp = 1; while ((temp != 0) && (temp < xfer->interval)) { xfer->fps_shift++; temp *= 2; } switch (parm->speed) { case USB_SPEED_LOW: case USB_SPEED_FULL: break; default: xfer->fps_shift += 3; break; } } } /* * NOTE: we do not allow "max_packet_size" or "max_frame_size" * to be equal to zero when setting up USB transfers, hence * this leads to a lot of extra code in the USB kernel. */ if ((xfer->max_frame_size == 0) || (xfer->max_packet_size == 0)) { zmps = 1; if ((parm->bufsize <= MIN_PKT) && (type != UE_CONTROL) && (type != UE_BULK)) { /* workaround */ xfer->max_packet_size = MIN_PKT; xfer->max_packet_count = 1; parm->bufsize = 0; /* automatic setup length */ usbd_update_max_frame_size(xfer); } else { parm->err = USB_ERR_ZERO_MAXP; goto done; } } else { zmps = 0; } /* * check if we should setup a default * length: */ if (parm->bufsize == 0) { parm->bufsize = xfer->max_frame_size; if (type == UE_ISOCHRONOUS) { parm->bufsize *= xfer->nframes; } } /* * check if we are about to setup a proxy * type of buffer: */ if (xfer->flags.proxy_buffer) { /* round bufsize up */ parm->bufsize += (xfer->max_frame_size - 1); if (parm->bufsize < xfer->max_frame_size) { /* length wrapped around */ parm->err = USB_ERR_INVAL; goto done; } /* subtract remainder */ parm->bufsize -= (parm->bufsize % xfer->max_frame_size); /* add length of USB device request structure, if any */ if (type == UE_CONTROL) { parm->bufsize += REQ_SIZE; /* SETUP message */ } } xfer->max_data_length = parm->bufsize; /* Setup "n_frlengths" and "n_frbuffers" */ if (type == UE_ISOCHRONOUS) { n_frlengths = xfer->nframes; n_frbuffers = 1; } else { if (type == UE_CONTROL) { xfer->flags_int.control_xfr = 1; if (xfer->nframes == 0) { if (parm->bufsize <= REQ_SIZE) { /* * there will never be any data * stage */ xfer->nframes = 1; } else { xfer->nframes = 2; } } } else { if (xfer->nframes == 0) { xfer->nframes = 1; } } n_frlengths = xfer->nframes; n_frbuffers = xfer->nframes; } /* * check if we have room for the * USB device request structure: */ if (type == UE_CONTROL) { if (xfer->max_data_length < REQ_SIZE) { /* length wrapped around or too small bufsize */ parm->err = USB_ERR_INVAL; goto done; } xfer->max_data_length -= REQ_SIZE; } /* * Setup "frlengths" and shadow "frlengths" for keeping the * 
initial frame lengths when a USB transfer is complete. This * information is useful when computing isochronous offsets. */ xfer->frlengths = parm->xfer_length_ptr; parm->xfer_length_ptr += 2 * n_frlengths; /* setup "frbuffers" */ xfer->frbuffers = parm->xfer_page_cache_ptr; parm->xfer_page_cache_ptr += n_frbuffers; /* initialize max frame count */ xfer->max_frame_count = xfer->nframes; /* * check if we need to setup * a local buffer: */ if (!xfer->flags.ext_buffer) { #if USB_HAVE_BUSDMA struct usb_page_search page_info; struct usb_page_cache *pc; if (usbd_transfer_setup_sub_malloc(parm, &pc, parm->bufsize, 1, 1)) { parm->err = USB_ERR_NOMEM; } else if (parm->buf != NULL) { usbd_get_page(pc, 0, &page_info); xfer->local_buffer = page_info.buffer; usbd_xfer_set_frame_offset(xfer, 0, 0); if ((type == UE_CONTROL) && (n_frbuffers > 1)) { usbd_xfer_set_frame_offset(xfer, REQ_SIZE, 1); } } #else /* align data */ parm->size[0] += ((-parm->size[0]) & (USB_HOST_ALIGN - 1)); if (parm->buf != NULL) { xfer->local_buffer = USB_ADD_BYTES(parm->buf, parm->size[0]); usbd_xfer_set_frame_offset(xfer, 0, 0); if ((type == UE_CONTROL) && (n_frbuffers > 1)) { usbd_xfer_set_frame_offset(xfer, REQ_SIZE, 1); } } parm->size[0] += parm->bufsize; /* align data again */ parm->size[0] += ((-parm->size[0]) & (USB_HOST_ALIGN - 1)); #endif } /* * Compute maximum buffer size */ if (parm->bufsize_max < parm->bufsize) { parm->bufsize_max = parm->bufsize; } #if USB_HAVE_BUSDMA if (xfer->flags_int.bdma_enable) { /* * Setup "dma_page_ptr". * * Proof for formula below: * * Assume there are three USB frames having length "a", "b" and * "c". These USB frames will at maximum need "z" * "usb_page" structures. "z" is given by: * * z = ((a / USB_PAGE_SIZE) + 2) + ((b / USB_PAGE_SIZE) + 2) + * ((c / USB_PAGE_SIZE) + 2); * * Constraining "a", "b" and "c" like this: * * (a + b + c) <= parm->bufsize * * We know that: * * z <= ((parm->bufsize / USB_PAGE_SIZE) + (3*2)); * * Here is the general formula: */ xfer->dma_page_ptr = parm->dma_page_ptr; parm->dma_page_ptr += (2 * n_frbuffers); parm->dma_page_ptr += (parm->bufsize / USB_PAGE_SIZE); } #endif if (zmps) { /* correct maximum data length */ xfer->max_data_length = 0; } /* subtract USB frame remainder from "hc_max_frame_size" */ xfer->max_hc_frame_size = (parm->hc_max_frame_size - (parm->hc_max_frame_size % xfer->max_frame_size)); if (xfer->max_hc_frame_size == 0) { parm->err = USB_ERR_INVAL; goto done; } /* initialize frame buffers */ if (parm->buf) { for (x = 0; x != n_frbuffers; x++) { xfer->frbuffers[x].tag_parent = &xfer->xroot->dma_parent_tag; #if USB_HAVE_BUSDMA if (xfer->flags_int.bdma_enable && (parm->bufsize_max > 0)) { if (usb_pc_dmamap_create( xfer->frbuffers + x, parm->bufsize_max)) { parm->err = USB_ERR_NOMEM; goto done; } } #endif } } done: if (parm->err) { /* * Set some dummy values so that we avoid division by zero: */ xfer->max_hc_frame_size = 1; xfer->max_frame_size = 1; xfer->max_packet_size = 1; xfer->max_data_length = 0; xfer->nframes = 0; xfer->max_frame_count = 0; } } static uint8_t usbd_transfer_setup_has_bulk(const struct usb_config *setup_start, uint16_t n_setup) { while (n_setup--) { uint8_t type = setup_start[n_setup].type; if (type == UE_BULK || type == UE_BULK_INTR || type == UE_TYPE_ANY) return (1); } return (0); } /*------------------------------------------------------------------------* * usbd_transfer_setup - setup an array of USB transfers * * NOTE: You must always call "usbd_transfer_unsetup" after calling * "usbd_transfer_setup" if success was 
returned. * * The idea is that the USB device driver should pre-allocate all its * transfers by one call to this function. * * Return values: * 0: Success * Else: Failure *------------------------------------------------------------------------*/ usb_error_t usbd_transfer_setup(struct usb_device *udev, const uint8_t *ifaces, struct usb_xfer **ppxfer, const struct usb_config *setup_start, uint16_t n_setup, void *priv_sc, struct mtx *xfer_mtx) { const struct usb_config *setup_end = setup_start + n_setup; const struct usb_config *setup; struct usb_setup_params *parm; struct usb_endpoint *ep; struct usb_xfer_root *info; struct usb_xfer *xfer; void *buf = NULL; usb_error_t error = 0; uint16_t n; uint16_t refcount; uint8_t do_unlock; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "usbd_transfer_setup can sleep!"); /* do some checking first */ if (n_setup == 0) { DPRINTFN(6, "setup array has zero length!\n"); return (USB_ERR_INVAL); } if (ifaces == NULL) { DPRINTFN(6, "ifaces array is NULL!\n"); return (USB_ERR_INVAL); } if (xfer_mtx == NULL) { DPRINTFN(6, "using global lock\n"); xfer_mtx = &Giant; } /* more sanity checks */ for (setup = setup_start, n = 0; setup != setup_end; setup++, n++) { if (setup->bufsize == (usb_frlength_t)-1) { error = USB_ERR_BAD_BUFSIZE; DPRINTF("invalid bufsize\n"); } if (setup->callback == NULL) { error = USB_ERR_NO_CALLBACK; DPRINTF("no callback\n"); } ppxfer[n] = NULL; } if (error) return (error); /* Protect scratch area */ do_unlock = usbd_ctrl_lock(udev); refcount = 0; info = NULL; parm = &udev->scratch.xfer_setup[0].parm; memset(parm, 0, sizeof(*parm)); parm->udev = udev; parm->speed = usbd_get_speed(udev); parm->hc_max_packet_count = 1; if (parm->speed >= USB_SPEED_MAX) { parm->err = USB_ERR_INVAL; goto done; } /* setup all transfers */ while (1) { if (buf) { /* * Initialize the "usb_xfer_root" structure, * which is common for all our USB transfers. */ info = USB_ADD_BYTES(buf, 0); info->memory_base = buf; info->memory_size = parm->size[0]; #if USB_HAVE_BUSDMA info->dma_page_cache_start = USB_ADD_BYTES(buf, parm->size[4]); info->dma_page_cache_end = USB_ADD_BYTES(buf, parm->size[5]); #endif info->xfer_page_cache_start = USB_ADD_BYTES(buf, parm->size[5]); info->xfer_page_cache_end = USB_ADD_BYTES(buf, parm->size[2]); cv_init(&info->cv_drain, "WDRAIN"); info->xfer_mtx = xfer_mtx; #if USB_HAVE_BUSDMA usb_dma_tag_setup(&info->dma_parent_tag, parm->dma_tag_p, udev->bus->dma_parent_tag[0].tag, xfer_mtx, &usb_bdma_done_event, udev->bus->dma_bits, parm->dma_tag_max); #endif info->bus = udev->bus; info->udev = udev; TAILQ_INIT(&info->done_q.head); info->done_q.command = &usbd_callback_wrapper; #if USB_HAVE_BUSDMA TAILQ_INIT(&info->dma_q.head); info->dma_q.command = &usb_bdma_work_loop; #endif info->done_m[0].hdr.pm_callback = &usb_callback_proc; info->done_m[0].xroot = info; info->done_m[1].hdr.pm_callback = &usb_callback_proc; info->done_m[1].xroot = info; /* * In device side mode control endpoint * requests need to run from a separate * context, else there is a chance of * deadlock! 
*/ - if (setup_start == usb_control_ep_cfg) + if (setup_start == usb_control_ep_cfg || + setup_start == usb_control_ep_quirk_cfg) info->done_p = USB_BUS_CONTROL_XFER_PROC(udev->bus); else if (xfer_mtx == &Giant) info->done_p = USB_BUS_GIANT_PROC(udev->bus); else if (usbd_transfer_setup_has_bulk(setup_start, n_setup)) info->done_p = USB_BUS_NON_GIANT_BULK_PROC(udev->bus); else info->done_p = USB_BUS_NON_GIANT_ISOC_PROC(udev->bus); } /* reset sizes */ parm->size[0] = 0; parm->buf = buf; parm->size[0] += sizeof(info[0]); for (setup = setup_start, n = 0; setup != setup_end; setup++, n++) { /* skip USB transfers without callbacks: */ if (setup->callback == NULL) { continue; } /* see if there is a matching endpoint */ ep = usbd_get_endpoint(udev, ifaces[setup->if_index], setup); /* * Check that the USB PIPE is valid and that * the endpoint mode is proper. * * Make sure we don't allocate a streams * transfer when such a combination is not * valid. */ if ((ep == NULL) || (ep->methods == NULL) || ((ep->ep_mode != USB_EP_MODE_STREAMS) && (ep->ep_mode != USB_EP_MODE_DEFAULT)) || (setup->stream_id != 0 && (setup->stream_id >= USB_MAX_EP_STREAMS || (ep->ep_mode != USB_EP_MODE_STREAMS)))) { if (setup->flags.no_pipe_ok) continue; if ((setup->usb_mode != USB_MODE_DUAL) && (setup->usb_mode != udev->flags.usb_mode)) continue; parm->err = USB_ERR_NO_PIPE; goto done; } /* align data properly */ parm->size[0] += ((-parm->size[0]) & (USB_HOST_ALIGN - 1)); /* store current setup pointer */ parm->curr_setup = setup; if (buf) { /* * Common initialization of the * "usb_xfer" structure. */ xfer = USB_ADD_BYTES(buf, parm->size[0]); xfer->address = udev->address; xfer->priv_sc = priv_sc; xfer->xroot = info; usb_callout_init_mtx(&xfer->timeout_handle, &udev->bus->bus_mtx, 0); } else { /* * Setup a dummy xfer, hence we are * writing to the "usb_xfer" * structure pointed to by "xfer" * before we have allocated any * memory: */ xfer = &udev->scratch.xfer_setup[0].dummy; memset(xfer, 0, sizeof(*xfer)); refcount++; } /* set transfer endpoint pointer */ xfer->endpoint = ep; /* set transfer stream ID */ xfer->stream_id = setup->stream_id; parm->size[0] += sizeof(xfer[0]); parm->methods = xfer->endpoint->methods; parm->curr_xfer = xfer; /* * Call the Host or Device controller transfer * setup routine: */ (udev->bus->methods->xfer_setup) (parm); /* check for error */ if (parm->err) goto done; if (buf) { /* * Increment the endpoint refcount. This * basically prevents setting a new * configuration and alternate setting * when USB transfers are in use on * the given interface. Search the USB * code for "endpoint->refcount_alloc" if you * want more information. */ USB_BUS_LOCK(info->bus); if (xfer->endpoint->refcount_alloc >= USB_EP_REF_MAX) parm->err = USB_ERR_INVAL; xfer->endpoint->refcount_alloc++; if (xfer->endpoint->refcount_alloc == 0) panic("usbd_transfer_setup(): Refcount wrapped to zero\n"); USB_BUS_UNLOCK(info->bus); /* * Whenever we set ppxfer[] then we * also need to increment the * "setup_refcount": */ info->setup_refcount++; /* * Transfer is successfully setup and * can be used: */ ppxfer[n] = xfer; } /* check for error */ if (parm->err) goto done; } if (buf != NULL || parm->err != 0) goto done; /* if no transfers, nothing to do */ if (refcount == 0) goto done; /* align data properly */ parm->size[0] += ((-parm->size[0]) & (USB_HOST_ALIGN - 1)); /* store offset temporarily */ parm->size[1] = parm->size[0]; /* * The number of DMA tags required depends on * the number of endpoints. 
The current estimate * for maximum number of DMA tags per endpoint * is three: * 1) for loading memory * 2) for allocating memory * 3) for fixing memory [UHCI] */ parm->dma_tag_max += 3 * MIN(n_setup, USB_EP_MAX); /* * DMA tags for QH, TD, Data and more. */ parm->dma_tag_max += 8; parm->dma_tag_p += parm->dma_tag_max; parm->size[0] += ((uint8_t *)parm->dma_tag_p) - ((uint8_t *)0); /* align data properly */ parm->size[0] += ((-parm->size[0]) & (USB_HOST_ALIGN - 1)); /* store offset temporarily */ parm->size[3] = parm->size[0]; parm->size[0] += ((uint8_t *)parm->dma_page_ptr) - ((uint8_t *)0); /* align data properly */ parm->size[0] += ((-parm->size[0]) & (USB_HOST_ALIGN - 1)); /* store offset temporarily */ parm->size[4] = parm->size[0]; parm->size[0] += ((uint8_t *)parm->dma_page_cache_ptr) - ((uint8_t *)0); /* store end offset temporarily */ parm->size[5] = parm->size[0]; parm->size[0] += ((uint8_t *)parm->xfer_page_cache_ptr) - ((uint8_t *)0); /* store end offset temporarily */ parm->size[2] = parm->size[0]; /* align data properly */ parm->size[0] += ((-parm->size[0]) & (USB_HOST_ALIGN - 1)); parm->size[6] = parm->size[0]; parm->size[0] += ((uint8_t *)parm->xfer_length_ptr) - ((uint8_t *)0); /* align data properly */ parm->size[0] += ((-parm->size[0]) & (USB_HOST_ALIGN - 1)); /* allocate zeroed memory */ buf = malloc(parm->size[0], M_USB, M_WAITOK | M_ZERO); if (buf == NULL) { parm->err = USB_ERR_NOMEM; DPRINTFN(0, "cannot allocate memory block for " "configuration (%d bytes)\n", parm->size[0]); goto done; } parm->dma_tag_p = USB_ADD_BYTES(buf, parm->size[1]); parm->dma_page_ptr = USB_ADD_BYTES(buf, parm->size[3]); parm->dma_page_cache_ptr = USB_ADD_BYTES(buf, parm->size[4]); parm->xfer_page_cache_ptr = USB_ADD_BYTES(buf, parm->size[5]); parm->xfer_length_ptr = USB_ADD_BYTES(buf, parm->size[6]); } done: if (buf) { if (info->setup_refcount == 0) { /* * "usbd_transfer_unsetup_sub" will unlock * the bus mutex before returning ! */ USB_BUS_LOCK(info->bus); /* something went wrong */ usbd_transfer_unsetup_sub(info, 0); } } /* check if any errors happened */ if (parm->err) usbd_transfer_unsetup(ppxfer, n_setup); error = parm->err; if (do_unlock) usbd_ctrl_unlock(udev); return (error); } /*------------------------------------------------------------------------* * usbd_transfer_unsetup_sub - factored out code *------------------------------------------------------------------------*/ static void usbd_transfer_unsetup_sub(struct usb_xfer_root *info, uint8_t needs_delay) { #if USB_HAVE_BUSDMA struct usb_page_cache *pc; #endif USB_BUS_LOCK_ASSERT(info->bus, MA_OWNED); /* wait for any outstanding DMA operations */ if (needs_delay) { usb_timeout_t temp; temp = usbd_get_dma_delay(info->udev); if (temp != 0) { usb_pause_mtx(&info->bus->bus_mtx, USB_MS_TO_TICKS(temp)); } } /* make sure that our done messages are not queued anywhere */ usb_proc_mwait(info->done_p, &info->done_m[0], &info->done_m[1]); USB_BUS_UNLOCK(info->bus); #if USB_HAVE_BUSDMA /* free DMA'able memory, if any */ pc = info->dma_page_cache_start; while (pc != info->dma_page_cache_end) { usb_pc_free_mem(pc); pc++; } /* free DMA maps in all "xfer->frbuffers" */ pc = info->xfer_page_cache_start; while (pc != info->xfer_page_cache_end) { usb_pc_dmamap_destroy(pc); pc++; } /* free all DMA tags */ usb_dma_tag_unsetup(&info->dma_parent_tag); #endif cv_destroy(&info->cv_drain); /* * free the "memory_base" last, hence the "info" structure is * contained within the "memory_base"! 
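	 *
	 * The layout of that single allocation, as computed through
	 * parm->size[] in usbd_transfer_setup(), is roughly:
	 *
	 *	[usb_xfer_root][usb_xfer ...][DMA tags][usb_page ...]
	 *	[DMA page caches][xfer page caches][frame lengths]
	 *
	 * so freeing "memory_base" releases everything at once.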
*/ free(info->memory_base, M_USB); } /*------------------------------------------------------------------------* * usbd_transfer_unsetup - unsetup/free an array of USB transfers * * NOTE: All USB transfers in progress will get called back passing * the error code "USB_ERR_CANCELLED" before this function * returns. *------------------------------------------------------------------------*/ void usbd_transfer_unsetup(struct usb_xfer **pxfer, uint16_t n_setup) { struct usb_xfer *xfer; struct usb_xfer_root *info; uint8_t needs_delay = 0; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "usbd_transfer_unsetup can sleep!"); while (n_setup--) { xfer = pxfer[n_setup]; if (xfer == NULL) continue; info = xfer->xroot; USB_XFER_LOCK(xfer); USB_BUS_LOCK(info->bus); /* * HINT: when you start/stop a transfer, it might be a * good idea to directly use the "pxfer[]" structure: * * usbd_transfer_start(sc->pxfer[0]); * usbd_transfer_stop(sc->pxfer[0]); * * That way, if your code has many parts that will not * stop running under the same lock, in other words * "xfer_mtx", the usbd_transfer_start and * usbd_transfer_stop functions will simply return * when they detect a NULL pointer argument. * * To avoid any races we clear the "pxfer[]" pointer * while holding the private mutex of the driver: */ pxfer[n_setup] = NULL; USB_BUS_UNLOCK(info->bus); USB_XFER_UNLOCK(xfer); usbd_transfer_drain(xfer); #if USB_HAVE_BUSDMA if (xfer->flags_int.bdma_enable) needs_delay = 1; #endif /* * NOTE: default endpoint does not have an * interface, even if endpoint->iface_index == 0 */ USB_BUS_LOCK(info->bus); xfer->endpoint->refcount_alloc--; USB_BUS_UNLOCK(info->bus); usb_callout_drain(&xfer->timeout_handle); USB_BUS_LOCK(info->bus); USB_ASSERT(info->setup_refcount != 0, ("Invalid setup " "reference count\n")); info->setup_refcount--; if (info->setup_refcount == 0) { usbd_transfer_unsetup_sub(info, needs_delay); } else { USB_BUS_UNLOCK(info->bus); } } } /*------------------------------------------------------------------------* * usbd_control_transfer_init - factored out code * * In USB Device Mode we have to wait for the SETUP packet which * containst the "struct usb_device_request" structure, before we can * transfer any data. In USB Host Mode we already have the SETUP * packet at the moment the USB transfer is started. This leads us to * having to setup the USB transfer at two different places in * time. This function just contains factored out control transfer * initialisation code, so that we don't duplicate the code. *------------------------------------------------------------------------*/ static void usbd_control_transfer_init(struct usb_xfer *xfer) { struct usb_device_request req; /* copy out the USB request header */ usbd_copy_out(xfer->frbuffers, 0, &req, sizeof(req)); /* setup remainder */ xfer->flags_int.control_rem = UGETW(req.wLength); /* copy direction to endpoint variable */ xfer->endpointno &= ~(UE_DIR_IN | UE_DIR_OUT); xfer->endpointno |= (req.bmRequestType & UT_READ) ? UE_DIR_IN : UE_DIR_OUT; } /*------------------------------------------------------------------------* * usbd_control_transfer_did_data * * This function returns non-zero if a control endpoint has * transferred the first DATA packet after the SETUP packet. * Else it returns zero. 
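 *
 * The check relies on "control_rem": it starts out equal to the
 * wLength field of the SETUP packet and is reduced as DATA stage
 * bytes are scheduled, so any difference between the two means at
 * least one DATA packet has already passed.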
*------------------------------------------------------------------------*/ static uint8_t usbd_control_transfer_did_data(struct usb_xfer *xfer) { struct usb_device_request req; /* SETUP packet is not yet sent */ if (xfer->flags_int.control_hdr != 0) return (0); /* copy out the USB request header */ usbd_copy_out(xfer->frbuffers, 0, &req, sizeof(req)); /* compare remainder to the initial value */ return (xfer->flags_int.control_rem != UGETW(req.wLength)); } /*------------------------------------------------------------------------* * usbd_setup_ctrl_transfer * * This function handles initialisation of control transfers. Control * transfers are special in that regard that they can both transmit * and receive data. * * Return values: * 0: Success * Else: Failure *------------------------------------------------------------------------*/ static int usbd_setup_ctrl_transfer(struct usb_xfer *xfer) { usb_frlength_t len; /* Check for control endpoint stall */ if (xfer->flags.stall_pipe && xfer->flags_int.control_act) { /* the control transfer is no longer active */ xfer->flags_int.control_stall = 1; xfer->flags_int.control_act = 0; } else { /* don't stall control transfer by default */ xfer->flags_int.control_stall = 0; } /* Check for invalid number of frames */ if (xfer->nframes > 2) { /* * If you need to split a control transfer, you * have to do one part at a time. Only with * non-control transfers you can do multiple * parts a time. */ DPRINTFN(0, "Too many frames: %u\n", (unsigned int)xfer->nframes); goto error; } /* * Check if there is a control * transfer in progress: */ if (xfer->flags_int.control_act) { if (xfer->flags_int.control_hdr) { /* clear send header flag */ xfer->flags_int.control_hdr = 0; /* setup control transfer */ if (xfer->flags_int.usb_mode == USB_MODE_DEVICE) { usbd_control_transfer_init(xfer); } } /* get data length */ len = xfer->sumlen; } else { /* the size of the SETUP structure is hardcoded ! */ if (xfer->frlengths[0] != sizeof(struct usb_device_request)) { DPRINTFN(0, "Wrong framelength %u != %zu\n", xfer->frlengths[0], sizeof(struct usb_device_request)); goto error; } /* check USB mode */ if (xfer->flags_int.usb_mode == USB_MODE_DEVICE) { /* check number of frames */ if (xfer->nframes != 1) { /* * We need to receive the setup * message first so that we know the * data direction! */ DPRINTF("Misconfigured transfer\n"); goto error; } /* * Set a dummy "control_rem" value. This * variable will be overwritten later by a * call to "usbd_control_transfer_init()" ! 
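			 * (0xFFFF is the largest value a 16-bit
			 * wLength can carry, so the dummy acts as an
			 * upper bound until the SETUP packet arrives.)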
*/ xfer->flags_int.control_rem = 0xFFFF; } else { /* setup "endpoint" and "control_rem" */ usbd_control_transfer_init(xfer); } /* set transfer-header flag */ xfer->flags_int.control_hdr = 1; /* get data length */ len = (xfer->sumlen - sizeof(struct usb_device_request)); } /* update did data flag */ xfer->flags_int.control_did_data = usbd_control_transfer_did_data(xfer); /* check if there is a length mismatch */ if (len > xfer->flags_int.control_rem) { DPRINTFN(0, "Length (%d) greater than " "remaining length (%d)\n", len, xfer->flags_int.control_rem); goto error; } /* check if we are doing a short transfer */ if (xfer->flags.force_short_xfer) { xfer->flags_int.control_rem = 0; } else { if ((len != xfer->max_data_length) && (len != xfer->flags_int.control_rem) && (xfer->nframes != 1)) { DPRINTFN(0, "Short control transfer without " "force_short_xfer set\n"); goto error; } xfer->flags_int.control_rem -= len; } /* the status part is executed when "control_act" is 0 */ if ((xfer->flags_int.control_rem > 0) || (xfer->flags.manual_status)) { /* don't execute the STATUS stage yet */ xfer->flags_int.control_act = 1; /* sanity check */ if ((!xfer->flags_int.control_hdr) && (xfer->nframes == 1)) { /* * This is not a valid operation! */ DPRINTFN(0, "Invalid parameter " "combination\n"); goto error; } } else { /* time to execute the STATUS stage */ xfer->flags_int.control_act = 0; } return (0); /* success */ error: return (1); /* failure */ } /*------------------------------------------------------------------------* * usbd_transfer_submit - start USB hardware for the given transfer * * This function should only be called from the USB callback. *------------------------------------------------------------------------*/ void usbd_transfer_submit(struct usb_xfer *xfer) { struct usb_xfer_root *info; struct usb_bus *bus; usb_frcount_t x; info = xfer->xroot; bus = info->bus; DPRINTF("xfer=%p, endpoint=%p, nframes=%d, dir=%s\n", xfer, xfer->endpoint, xfer->nframes, USB_GET_DATA_ISREAD(xfer) ? "read" : "write"); #ifdef USB_DEBUG if (USB_DEBUG_VAR > 0) { USB_BUS_LOCK(bus); usb_dump_endpoint(xfer->endpoint); USB_BUS_UNLOCK(bus); } #endif USB_XFER_LOCK_ASSERT(xfer, MA_OWNED); USB_BUS_LOCK_ASSERT(bus, MA_NOTOWNED); /* Only open the USB transfer once! */ if (!xfer->flags_int.open) { xfer->flags_int.open = 1; DPRINTF("open\n"); USB_BUS_LOCK(bus); (xfer->endpoint->methods->open) (xfer); USB_BUS_UNLOCK(bus); } /* set "transferring" flag */ xfer->flags_int.transferring = 1; #if USB_HAVE_POWERD /* increment power reference */ usbd_transfer_power_ref(xfer, 1); #endif /* * Check if the transfer is waiting on a queue, most * frequently the "done_q": */ if (xfer->wait_queue) { USB_BUS_LOCK(bus); usbd_transfer_dequeue(xfer); USB_BUS_UNLOCK(bus); } /* clear "did_dma_delay" flag */ xfer->flags_int.did_dma_delay = 0; /* clear "did_close" flag */ xfer->flags_int.did_close = 0; #if USB_HAVE_BUSDMA /* clear "bdma_setup" flag */ xfer->flags_int.bdma_setup = 0; #endif /* by default we cannot cancel any USB transfer immediately */ xfer->flags_int.can_cancel_immed = 0; /* clear lengths and frame counts by default */ xfer->sumlen = 0; xfer->actlen = 0; xfer->aframes = 0; /* clear any previous errors */ xfer->error = 0; /* Check if the device is still alive */ if (info->udev->state < USB_STATE_POWERED) { USB_BUS_LOCK(bus); /* * Must return cancelled error code else * device drivers can hang. 
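		 * Drivers conventionally treat USB_ERR_CANCELLED as
		 * final and do not resubmit, while most other error
		 * codes are retried from the callback.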
*/ usbd_transfer_done(xfer, USB_ERR_CANCELLED); USB_BUS_UNLOCK(bus); return; } /* sanity check */ if (xfer->nframes == 0) { if (xfer->flags.stall_pipe) { /* * Special case - want to stall without transferring * any data: */ DPRINTF("xfer=%p nframes=0: stall " "or clear stall!\n", xfer); USB_BUS_LOCK(bus); xfer->flags_int.can_cancel_immed = 1; /* start the transfer */ usb_command_wrapper(&xfer->endpoint-> endpoint_q[xfer->stream_id], xfer); USB_BUS_UNLOCK(bus); return; } USB_BUS_LOCK(bus); usbd_transfer_done(xfer, USB_ERR_INVAL); USB_BUS_UNLOCK(bus); return; } /* compute some variables */ for (x = 0; x != xfer->nframes; x++) { /* make a copy of the frlenghts[] */ xfer->frlengths[x + xfer->max_frame_count] = xfer->frlengths[x]; /* compute total transfer length */ xfer->sumlen += xfer->frlengths[x]; if (xfer->sumlen < xfer->frlengths[x]) { /* length wrapped around */ USB_BUS_LOCK(bus); usbd_transfer_done(xfer, USB_ERR_INVAL); USB_BUS_UNLOCK(bus); return; } } /* clear some internal flags */ xfer->flags_int.short_xfer_ok = 0; xfer->flags_int.short_frames_ok = 0; /* check if this is a control transfer */ if (xfer->flags_int.control_xfr) { if (usbd_setup_ctrl_transfer(xfer)) { USB_BUS_LOCK(bus); usbd_transfer_done(xfer, USB_ERR_STALLED); USB_BUS_UNLOCK(bus); return; } } /* * Setup filtered version of some transfer flags, * in case of data read direction */ if (USB_GET_DATA_ISREAD(xfer)) { if (xfer->flags.short_frames_ok) { xfer->flags_int.short_xfer_ok = 1; xfer->flags_int.short_frames_ok = 1; } else if (xfer->flags.short_xfer_ok) { xfer->flags_int.short_xfer_ok = 1; /* check for control transfer */ if (xfer->flags_int.control_xfr) { /* * 1) Control transfers do not support * reception of multiple short USB * frames in host mode and device side * mode, with exception of: * * 2) Due to sometimes buggy device * side firmware we need to do a * STATUS stage in case of short * control transfers in USB host mode. * The STATUS stage then becomes the * "alt_next" to the DATA stage. */ xfer->flags_int.short_frames_ok = 1; } } } /* * Check if BUS-DMA support is enabled and try to load virtual * buffers into DMA, if any: */ #if USB_HAVE_BUSDMA if (xfer->flags_int.bdma_enable) { /* insert the USB transfer last in the BUS-DMA queue */ usb_command_wrapper(&xfer->xroot->dma_q, xfer); return; } #endif /* * Enter the USB transfer into the Host Controller or * Device Controller schedule: */ usbd_pipe_enter(xfer); } /*------------------------------------------------------------------------* * usbd_pipe_enter - factored out code *------------------------------------------------------------------------*/ void usbd_pipe_enter(struct usb_xfer *xfer) { struct usb_endpoint *ep; USB_XFER_LOCK_ASSERT(xfer, MA_OWNED); USB_BUS_LOCK(xfer->xroot->bus); ep = xfer->endpoint; DPRINTF("enter\n"); /* the transfer can now be cancelled */ xfer->flags_int.can_cancel_immed = 1; /* enter the transfer */ (ep->methods->enter) (xfer); /* check for transfer error */ if (xfer->error) { /* some error has happened */ usbd_transfer_done(xfer, 0); USB_BUS_UNLOCK(xfer->xroot->bus); return; } /* start the transfer */ usb_command_wrapper(&ep->endpoint_q[xfer->stream_id], xfer); USB_BUS_UNLOCK(xfer->xroot->bus); } /*------------------------------------------------------------------------* * usbd_transfer_start - start an USB transfer * * NOTE: Calling this function more than one time will only * result in a single transfer start, until the USB transfer * completes. 
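 *
 * Minimal usage sketch (the softc layout and transfer index are
 * hypothetical); the mutex given to usbd_transfer_setup() must be
 * held, as asserted below:
 *
 *	mtx_lock(&sc->sc_mtx);
 *	usbd_transfer_start(sc->sc_xfer[0]);
 *	mtx_unlock(&sc->sc_mtx);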
*------------------------------------------------------------------------*/ void usbd_transfer_start(struct usb_xfer *xfer) { if (xfer == NULL) { /* transfer is gone */ return; } USB_XFER_LOCK_ASSERT(xfer, MA_OWNED); /* mark the USB transfer started */ if (!xfer->flags_int.started) { /* lock the BUS lock to avoid races updating flags_int */ USB_BUS_LOCK(xfer->xroot->bus); xfer->flags_int.started = 1; USB_BUS_UNLOCK(xfer->xroot->bus); } /* check if the USB transfer callback is already transferring */ if (xfer->flags_int.transferring) { return; } USB_BUS_LOCK(xfer->xroot->bus); /* call the USB transfer callback */ usbd_callback_ss_done_defer(xfer); USB_BUS_UNLOCK(xfer->xroot->bus); } /*------------------------------------------------------------------------* * usbd_transfer_stop - stop an USB transfer * * NOTE: Calling this function more than one time will only * result in a single transfer stop. * NOTE: When this function returns it is not safe to free nor * reuse any DMA buffers. See "usbd_transfer_drain()". *------------------------------------------------------------------------*/ void usbd_transfer_stop(struct usb_xfer *xfer) { struct usb_endpoint *ep; if (xfer == NULL) { /* transfer is gone */ return; } USB_XFER_LOCK_ASSERT(xfer, MA_OWNED); /* check if the USB transfer was ever opened */ if (!xfer->flags_int.open) { if (xfer->flags_int.started) { /* nothing to do except clearing the "started" flag */ /* lock the BUS lock to avoid races updating flags_int */ USB_BUS_LOCK(xfer->xroot->bus); xfer->flags_int.started = 0; USB_BUS_UNLOCK(xfer->xroot->bus); } return; } /* try to stop the current USB transfer */ USB_BUS_LOCK(xfer->xroot->bus); /* override any previous error */ xfer->error = USB_ERR_CANCELLED; /* * Clear "open" and "started" when both private and USB lock * is locked so that we don't get a race updating "flags_int" */ xfer->flags_int.open = 0; xfer->flags_int.started = 0; /* * Check if we can cancel the USB transfer immediately. */ if (xfer->flags_int.transferring) { if (xfer->flags_int.can_cancel_immed && (!xfer->flags_int.did_close)) { DPRINTF("close\n"); /* * The following will lead to an USB_ERR_CANCELLED * error code being passed to the USB callback. */ (xfer->endpoint->methods->close) (xfer); /* only close once */ xfer->flags_int.did_close = 1; } else { /* need to wait for the next done callback */ } } else { DPRINTF("close\n"); /* close here and now */ (xfer->endpoint->methods->close) (xfer); /* * Any additional DMA delay is done by * "usbd_transfer_unsetup()". */ /* * Special case. Check if we need to restart a blocked * endpoint. */ ep = xfer->endpoint; /* * If the current USB transfer is completing we need * to start the next one: */ if (ep->endpoint_q[xfer->stream_id].curr == xfer) { usb_command_wrapper( &ep->endpoint_q[xfer->stream_id], NULL); } } USB_BUS_UNLOCK(xfer->xroot->bus); } /*------------------------------------------------------------------------* * usbd_transfer_pending * * This function will check if an USB transfer is pending which is a * little bit complicated! * Return values: * 0: Not pending * 1: Pending: The USB transfer will receive a callback in the future. 
*------------------------------------------------------------------------*/ uint8_t usbd_transfer_pending(struct usb_xfer *xfer) { struct usb_xfer_root *info; struct usb_xfer_queue *pq; if (xfer == NULL) { /* transfer is gone */ return (0); } USB_XFER_LOCK_ASSERT(xfer, MA_OWNED); if (xfer->flags_int.transferring) { /* trivial case */ return (1); } USB_BUS_LOCK(xfer->xroot->bus); if (xfer->wait_queue) { /* we are waiting on a queue somewhere */ USB_BUS_UNLOCK(xfer->xroot->bus); return (1); } info = xfer->xroot; pq = &info->done_q; if (pq->curr == xfer) { /* we are currently scheduled for callback */ USB_BUS_UNLOCK(xfer->xroot->bus); return (1); } /* we are not pending */ USB_BUS_UNLOCK(xfer->xroot->bus); return (0); } /*------------------------------------------------------------------------* * usbd_transfer_drain * * This function will stop the USB transfer and wait for any * additional BUS-DMA and HW-DMA operations to complete. Buffers that * are loaded into DMA can safely be freed or reused after that this * function has returned. *------------------------------------------------------------------------*/ void usbd_transfer_drain(struct usb_xfer *xfer) { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "usbd_transfer_drain can sleep!"); if (xfer == NULL) { /* transfer is gone */ return; } if (xfer->xroot->xfer_mtx != &Giant) { USB_XFER_LOCK_ASSERT(xfer, MA_NOTOWNED); } USB_XFER_LOCK(xfer); usbd_transfer_stop(xfer); while (usbd_transfer_pending(xfer) || xfer->flags_int.doing_callback) { /* * It is allowed that the callback can drop its * transfer mutex. In that case checking only * "usbd_transfer_pending()" is not enough to tell if * the USB transfer is fully drained. We also need to * check the internal "doing_callback" flag. */ xfer->flags_int.draining = 1; /* * Wait until the current outstanding USB * transfer is complete ! */ cv_wait(&xfer->xroot->cv_drain, xfer->xroot->xfer_mtx); } USB_XFER_UNLOCK(xfer); } struct usb_page_cache * usbd_xfer_get_frame(struct usb_xfer *xfer, usb_frcount_t frindex) { KASSERT(frindex < xfer->max_frame_count, ("frame index overflow")); return (&xfer->frbuffers[frindex]); } void * usbd_xfer_get_frame_buffer(struct usb_xfer *xfer, usb_frcount_t frindex) { struct usb_page_search page_info; KASSERT(frindex < xfer->max_frame_count, ("frame index overflow")); usbd_get_page(&xfer->frbuffers[frindex], 0, &page_info); return (page_info.buffer); } /*------------------------------------------------------------------------* * usbd_xfer_get_fps_shift * * The following function is only useful for isochronous transfers. It * returns how many times the frame execution rate has been shifted * down. * * Return value: * Success: 0..3 * Failure: 0 *------------------------------------------------------------------------*/ uint8_t usbd_xfer_get_fps_shift(struct usb_xfer *xfer) { return (xfer->fps_shift); } usb_frlength_t usbd_xfer_frame_len(struct usb_xfer *xfer, usb_frcount_t frindex) { KASSERT(frindex < xfer->max_frame_count, ("frame index overflow")); return (xfer->frlengths[frindex]); } /*------------------------------------------------------------------------* * usbd_xfer_set_frame_data * * This function sets the pointer of the buffer that should * loaded directly into DMA for the given USB frame. Passing "ptr" * equal to NULL while the corresponding "frlength" is greater * than zero gives undefined results! 
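 *
 * Illustrative use from a transfer callback (the buffer name is
 * assumed and the transfer is expected to use an external buffer):
 *
 *	usbd_xfer_set_frame_data(xfer, 0, sc->sc_rx_buf,
 *	    usbd_xfer_max_len(xfer));
 *	usbd_transfer_submit(xfer);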
*------------------------------------------------------------------------*/ void usbd_xfer_set_frame_data(struct usb_xfer *xfer, usb_frcount_t frindex, void *ptr, usb_frlength_t len) { KASSERT(frindex < xfer->max_frame_count, ("frame index overflow")); /* set virtual address to load and length */ xfer->frbuffers[frindex].buffer = ptr; usbd_xfer_set_frame_len(xfer, frindex, len); } void usbd_xfer_frame_data(struct usb_xfer *xfer, usb_frcount_t frindex, void **ptr, int *len) { KASSERT(frindex < xfer->max_frame_count, ("frame index overflow")); if (ptr != NULL) *ptr = xfer->frbuffers[frindex].buffer; if (len != NULL) *len = xfer->frlengths[frindex]; } /*------------------------------------------------------------------------* * usbd_xfer_old_frame_length * * This function returns the framelength of the given frame at the * time the transfer was submitted. This function can be used to * compute the starting data pointer of the next isochronous frame * when an isochronous transfer has completed. *------------------------------------------------------------------------*/ usb_frlength_t usbd_xfer_old_frame_length(struct usb_xfer *xfer, usb_frcount_t frindex) { KASSERT(frindex < xfer->max_frame_count, ("frame index overflow")); return (xfer->frlengths[frindex + xfer->max_frame_count]); } void usbd_xfer_status(struct usb_xfer *xfer, int *actlen, int *sumlen, int *aframes, int *nframes) { if (actlen != NULL) *actlen = xfer->actlen; if (sumlen != NULL) *sumlen = xfer->sumlen; if (aframes != NULL) *aframes = xfer->aframes; if (nframes != NULL) *nframes = xfer->nframes; } /*------------------------------------------------------------------------* * usbd_xfer_set_frame_offset * * This function sets the frame data buffer offset relative to the beginning * of the USB DMA buffer allocated for this USB transfer. *------------------------------------------------------------------------*/ void usbd_xfer_set_frame_offset(struct usb_xfer *xfer, usb_frlength_t offset, usb_frcount_t frindex) { KASSERT(!xfer->flags.ext_buffer, ("Cannot offset data frame " "when the USB buffer is external\n")); KASSERT(frindex < xfer->max_frame_count, ("frame index overflow")); /* set virtual address to load */ xfer->frbuffers[frindex].buffer = USB_ADD_BYTES(xfer->local_buffer, offset); } void usbd_xfer_set_interval(struct usb_xfer *xfer, int i) { xfer->interval = i; } void usbd_xfer_set_timeout(struct usb_xfer *xfer, int t) { xfer->timeout = t; } void usbd_xfer_set_frames(struct usb_xfer *xfer, usb_frcount_t n) { xfer->nframes = n; } usb_frcount_t usbd_xfer_max_frames(struct usb_xfer *xfer) { return (xfer->max_frame_count); } usb_frlength_t usbd_xfer_max_len(struct usb_xfer *xfer) { return (xfer->max_data_length); } usb_frlength_t usbd_xfer_max_framelen(struct usb_xfer *xfer) { return (xfer->max_frame_size); } void usbd_xfer_set_frame_len(struct usb_xfer *xfer, usb_frcount_t frindex, usb_frlength_t len) { KASSERT(frindex < xfer->max_frame_count, ("frame index overflow")); xfer->frlengths[frindex] = len; } /*------------------------------------------------------------------------* * usb_callback_proc - factored out code * * This function performs USB callbacks. 
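 *
 * The message is delivered with the bus mutex held; the code below
 * drops that mutex and re-acquires it after taking the per-transfer
 * mutex, so the locks end up in the usual xfer-mutex-before-bus-mutex
 * order.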
*------------------------------------------------------------------------*/ static void usb_callback_proc(struct usb_proc_msg *_pm) { struct usb_done_msg *pm = (void *)_pm; struct usb_xfer_root *info = pm->xroot; /* Change locking order */ USB_BUS_UNLOCK(info->bus); /* * We exploit the fact that the mutex is the same for all * callbacks that will be called from this thread: */ USB_MTX_LOCK(info->xfer_mtx); USB_BUS_LOCK(info->bus); /* Continue where we lost track */ usb_command_wrapper(&info->done_q, info->done_q.curr); USB_MTX_UNLOCK(info->xfer_mtx); } /*------------------------------------------------------------------------* * usbd_callback_ss_done_defer * * This function will defer the start, stop and done callback to the * correct thread. *------------------------------------------------------------------------*/ static void usbd_callback_ss_done_defer(struct usb_xfer *xfer) { struct usb_xfer_root *info = xfer->xroot; struct usb_xfer_queue *pq = &info->done_q; USB_BUS_LOCK_ASSERT(xfer->xroot->bus, MA_OWNED); if (pq->curr != xfer) { usbd_transfer_enqueue(pq, xfer); } if (!pq->recurse_1) { /* * We have to postpone the callback due to the fact we * will have a Lock Order Reversal, LOR, if we try to * proceed ! */ (void) usb_proc_msignal(info->done_p, &info->done_m[0], &info->done_m[1]); } else { /* clear second recurse flag */ pq->recurse_2 = 0; } return; } /*------------------------------------------------------------------------* * usbd_callback_wrapper * * This is a wrapper for USB callbacks. This wrapper does some * auto-magic things like figuring out if we can call the callback * directly from the current context or if we need to wakeup the * interrupt process. *------------------------------------------------------------------------*/ static void usbd_callback_wrapper(struct usb_xfer_queue *pq) { struct usb_xfer *xfer = pq->curr; struct usb_xfer_root *info = xfer->xroot; USB_BUS_LOCK_ASSERT(info->bus, MA_OWNED); if ((pq->recurse_3 != 0 || mtx_owned(info->xfer_mtx) == 0) && USB_IN_POLLING_MODE_FUNC() == 0) { /* * Cases that end up here: * * 5) HW interrupt done callback or other source. * 6) HW completed transfer during callback */ DPRINTFN(3, "case 5 and 6\n"); /* * We have to postpone the callback due to the fact we * will have a Lock Order Reversal, LOR, if we try to * proceed! * * Postponing the callback also ensures that other USB * transfer queues get a chance. 
*/ (void) usb_proc_msignal(info->done_p, &info->done_m[0], &info->done_m[1]); return; } /* * Cases that end up here: * * 1) We are starting a transfer * 2) We are prematurely calling back a transfer * 3) We are stopping a transfer * 4) We are doing an ordinary callback */ DPRINTFN(3, "case 1-4\n"); /* get next USB transfer in the queue */ info->done_q.curr = NULL; /* set flag in case of drain */ xfer->flags_int.doing_callback = 1; USB_BUS_UNLOCK(info->bus); USB_BUS_LOCK_ASSERT(info->bus, MA_NOTOWNED); /* set correct USB state for callback */ if (!xfer->flags_int.transferring) { xfer->usb_state = USB_ST_SETUP; if (!xfer->flags_int.started) { /* we got stopped before we even got started */ USB_BUS_LOCK(info->bus); goto done; } } else { if (usbd_callback_wrapper_sub(xfer)) { /* the callback has been deferred */ USB_BUS_LOCK(info->bus); goto done; } #if USB_HAVE_POWERD /* decrement power reference */ usbd_transfer_power_ref(xfer, -1); #endif xfer->flags_int.transferring = 0; if (xfer->error) { xfer->usb_state = USB_ST_ERROR; } else { /* set transferred state */ xfer->usb_state = USB_ST_TRANSFERRED; #if USB_HAVE_BUSDMA /* sync DMA memory, if any */ if (xfer->flags_int.bdma_enable && (!xfer->flags_int.bdma_no_post_sync)) { usb_bdma_post_sync(xfer); } #endif } } #if USB_HAVE_PF if (xfer->usb_state != USB_ST_SETUP) { USB_BUS_LOCK(info->bus); usbpf_xfertap(xfer, USBPF_XFERTAP_DONE); USB_BUS_UNLOCK(info->bus); } #endif /* call processing routine */ (xfer->callback) (xfer, xfer->error); /* pickup the USB mutex again */ USB_BUS_LOCK(info->bus); /* * Check if we got started after that we got cancelled, but * before we managed to do the callback. */ if ((!xfer->flags_int.open) && (xfer->flags_int.started) && (xfer->usb_state == USB_ST_ERROR)) { /* clear flag in case of drain */ xfer->flags_int.doing_callback = 0; /* try to loop, but not recursivly */ usb_command_wrapper(&info->done_q, xfer); return; } done: /* clear flag in case of drain */ xfer->flags_int.doing_callback = 0; /* * Check if we are draining. */ if (xfer->flags_int.draining && (!xfer->flags_int.transferring)) { /* "usbd_transfer_drain()" is waiting for end of transfer */ xfer->flags_int.draining = 0; cv_broadcast(&info->cv_drain); } /* do the next callback, if any */ usb_command_wrapper(&info->done_q, info->done_q.curr); } /*------------------------------------------------------------------------* * usb_dma_delay_done_cb * * This function is called when the DMA delay has been exectuded, and * will make sure that the callback is called to complete the USB * transfer. This code path is usually only used when there is an USB * error like USB_ERR_CANCELLED. *------------------------------------------------------------------------*/ void usb_dma_delay_done_cb(struct usb_xfer *xfer) { USB_BUS_LOCK_ASSERT(xfer->xroot->bus, MA_OWNED); DPRINTFN(3, "Completed %p\n", xfer); /* queue callback for execution, again */ usbd_transfer_done(xfer, 0); } /*------------------------------------------------------------------------* * usbd_transfer_dequeue * * - This function is used to remove an USB transfer from a USB * transfer queue. * * - This function can be called multiple times in a row. 
*------------------------------------------------------------------------*/ void usbd_transfer_dequeue(struct usb_xfer *xfer) { struct usb_xfer_queue *pq; pq = xfer->wait_queue; if (pq) { TAILQ_REMOVE(&pq->head, xfer, wait_entry); xfer->wait_queue = NULL; } } /*------------------------------------------------------------------------* * usbd_transfer_enqueue * * - This function is used to insert an USB transfer into a USB * * transfer queue. * * - This function can be called multiple times in a row. *------------------------------------------------------------------------*/ void usbd_transfer_enqueue(struct usb_xfer_queue *pq, struct usb_xfer *xfer) { /* * Insert the USB transfer into the queue, if it is not * already on a USB transfer queue: */ if (xfer->wait_queue == NULL) { xfer->wait_queue = pq; TAILQ_INSERT_TAIL(&pq->head, xfer, wait_entry); } } /*------------------------------------------------------------------------* * usbd_transfer_done * * - This function is used to remove an USB transfer from the busdma, * pipe or interrupt queue. * * - This function is used to queue the USB transfer on the done * queue. * * - This function is used to stop any USB transfer timeouts. *------------------------------------------------------------------------*/ void usbd_transfer_done(struct usb_xfer *xfer, usb_error_t error) { struct usb_xfer_root *info = xfer->xroot; USB_BUS_LOCK_ASSERT(info->bus, MA_OWNED); DPRINTF("err=%s\n", usbd_errstr(error)); /* * If we are not transferring then just return. * This can happen during transfer cancel. */ if (!xfer->flags_int.transferring) { DPRINTF("not transferring\n"); /* end of control transfer, if any */ xfer->flags_int.control_act = 0; return; } /* only set transfer error, if not already set */ if (xfer->error == USB_ERR_NORMAL_COMPLETION) xfer->error = error; /* stop any callouts */ usb_callout_stop(&xfer->timeout_handle); /* * If we are waiting on a queue, just remove the USB transfer * from the queue, if any. We should have the required locks * locked to do the remove when this function is called. */ usbd_transfer_dequeue(xfer); #if USB_HAVE_BUSDMA if (mtx_owned(info->xfer_mtx)) { struct usb_xfer_queue *pq; /* * If the private USB lock is not locked, then we assume * that the BUS-DMA load stage has been passed: */ pq = &info->dma_q; if (pq->curr == xfer) { /* start the next BUS-DMA load, if any */ usb_command_wrapper(pq, NULL); } } #endif /* keep some statistics */ if (xfer->error) { info->bus->stats_err.uds_requests [xfer->endpoint->edesc->bmAttributes & UE_XFERTYPE]++; } else { info->bus->stats_ok.uds_requests [xfer->endpoint->edesc->bmAttributes & UE_XFERTYPE]++; } /* call the USB transfer callback */ usbd_callback_ss_done_defer(xfer); } /*------------------------------------------------------------------------* * usbd_transfer_start_cb * * This function is called to start the USB transfer when * "xfer->interval" is greater than zero, and and the endpoint type is * BULK or CONTROL. 
*------------------------------------------------------------------------*/ static void usbd_transfer_start_cb(void *arg) { struct usb_xfer *xfer = arg; struct usb_endpoint *ep = xfer->endpoint; USB_BUS_LOCK_ASSERT(xfer->xroot->bus, MA_OWNED); DPRINTF("start\n"); #if USB_HAVE_PF usbpf_xfertap(xfer, USBPF_XFERTAP_SUBMIT); #endif /* the transfer can now be cancelled */ xfer->flags_int.can_cancel_immed = 1; /* start USB transfer, if no error */ if (xfer->error == 0) (ep->methods->start) (xfer); /* check for transfer error */ if (xfer->error) { /* some error has happened */ usbd_transfer_done(xfer, 0); } } /*------------------------------------------------------------------------* * usbd_xfer_set_stall * * This function is used to set the stall flag outside the * callback. This function is NULL safe. *------------------------------------------------------------------------*/ void usbd_xfer_set_stall(struct usb_xfer *xfer) { if (xfer == NULL) { /* tearing down */ return; } USB_XFER_LOCK_ASSERT(xfer, MA_OWNED); /* avoid any races by locking the USB mutex */ USB_BUS_LOCK(xfer->xroot->bus); xfer->flags.stall_pipe = 1; USB_BUS_UNLOCK(xfer->xroot->bus); } int usbd_xfer_is_stalled(struct usb_xfer *xfer) { return (xfer->endpoint->is_stalled); } /*------------------------------------------------------------------------* * usbd_transfer_clear_stall * * This function is used to clear the stall flag outside the * callback. This function is NULL safe. *------------------------------------------------------------------------*/ void usbd_transfer_clear_stall(struct usb_xfer *xfer) { if (xfer == NULL) { /* tearing down */ return; } USB_XFER_LOCK_ASSERT(xfer, MA_OWNED); /* avoid any races by locking the USB mutex */ USB_BUS_LOCK(xfer->xroot->bus); xfer->flags.stall_pipe = 0; USB_BUS_UNLOCK(xfer->xroot->bus); } /*------------------------------------------------------------------------* * usbd_pipe_start * * This function is used to add an USB transfer to the pipe transfer list. *------------------------------------------------------------------------*/ void usbd_pipe_start(struct usb_xfer_queue *pq) { struct usb_endpoint *ep; struct usb_xfer *xfer; uint8_t type; xfer = pq->curr; ep = xfer->endpoint; USB_BUS_LOCK_ASSERT(xfer->xroot->bus, MA_OWNED); /* * If the endpoint is already stalled we do nothing ! */ if (ep->is_stalled) { return; } /* * Check if we are supposed to stall the endpoint: */ if (xfer->flags.stall_pipe) { struct usb_device *udev; struct usb_xfer_root *info; /* clear stall command */ xfer->flags.stall_pipe = 0; /* get pointer to USB device */ info = xfer->xroot; udev = info->udev; /* * Only stall BULK and INTERRUPT endpoints. */ type = (ep->edesc->bmAttributes & UE_XFERTYPE); if ((type == UE_BULK) || (type == UE_INTERRUPT)) { uint8_t did_stall; did_stall = 1; if (udev->flags.usb_mode == USB_MODE_DEVICE) { (udev->bus->methods->set_stall) ( udev, ep, &did_stall); } else if (udev->ctrl_xfer[1]) { info = udev->ctrl_xfer[1]->xroot; usb_proc_msignal( USB_BUS_CS_PROC(info->bus), &udev->cs_msg[0], &udev->cs_msg[1]); } else { /* should not happen */ DPRINTFN(0, "No stall handler\n"); } /* * Check if we should stall. Some USB hardware * handles set- and clear-stall in hardware. */ if (did_stall) { /* * The transfer will be continued when * the clear-stall control endpoint * message is received. */ ep->is_stalled = 1; return; } } else if (type == UE_ISOCHRONOUS) { /* * Make sure any FIFO overflow or other FIFO * error conditions go away by resetting the * endpoint FIFO through the clear stall * method. 
*/ if (udev->flags.usb_mode == USB_MODE_DEVICE) { (udev->bus->methods->clear_stall) (udev, ep); } } } /* Set or clear stall complete - special case */ if (xfer->nframes == 0) { /* we are complete */ xfer->aframes = 0; usbd_transfer_done(xfer, 0); return; } /* * Handled cases: * * 1) Start the first transfer queued. * * 2) Re-start the current USB transfer. */ /* * Check if there should be any * pre transfer start delay: */ if (xfer->interval > 0) { type = (ep->edesc->bmAttributes & UE_XFERTYPE); if ((type == UE_BULK) || (type == UE_CONTROL)) { usbd_transfer_timeout_ms(xfer, &usbd_transfer_start_cb, xfer->interval); return; } } DPRINTF("start\n"); #if USB_HAVE_PF usbpf_xfertap(xfer, USBPF_XFERTAP_SUBMIT); #endif /* the transfer can now be cancelled */ xfer->flags_int.can_cancel_immed = 1; /* start USB transfer, if no error */ if (xfer->error == 0) (ep->methods->start) (xfer); /* check for transfer error */ if (xfer->error) { /* some error has happened */ usbd_transfer_done(xfer, 0); } } /*------------------------------------------------------------------------* * usbd_transfer_timeout_ms * * This function is used to setup a timeout on the given USB * transfer. If the timeout has been deferred the callback given by * "cb" will get called after "ms" milliseconds. *------------------------------------------------------------------------*/ void usbd_transfer_timeout_ms(struct usb_xfer *xfer, void (*cb) (void *arg), usb_timeout_t ms) { USB_BUS_LOCK_ASSERT(xfer->xroot->bus, MA_OWNED); /* defer delay */ usb_callout_reset(&xfer->timeout_handle, USB_MS_TO_TICKS(ms) + USB_CALLOUT_ZERO_TICKS, cb, xfer); } /*------------------------------------------------------------------------* * usbd_callback_wrapper_sub * * - This function will update variables in an USB transfer after * that the USB transfer is complete. * * - This function is used to start the next USB transfer on the * ep transfer queue, if any. * * NOTE: In some special cases the USB transfer will not be removed from * the pipe queue, but remain first. To enforce USB transfer removal call * this function passing the error code "USB_ERR_CANCELLED". * * Return values: * 0: Success. * Else: The callback has been deferred. *------------------------------------------------------------------------*/ static uint8_t usbd_callback_wrapper_sub(struct usb_xfer *xfer) { struct usb_endpoint *ep; struct usb_bus *bus; usb_frcount_t x; bus = xfer->xroot->bus; if ((!xfer->flags_int.open) && (!xfer->flags_int.did_close)) { DPRINTF("close\n"); USB_BUS_LOCK(bus); (xfer->endpoint->methods->close) (xfer); USB_BUS_UNLOCK(bus); /* only close once */ xfer->flags_int.did_close = 1; return (1); /* wait for new callback */ } /* * If we have a non-hardware induced error we * need to do the DMA delay! */ if (xfer->error != 0 && !xfer->flags_int.did_dma_delay && (xfer->error == USB_ERR_CANCELLED || xfer->error == USB_ERR_TIMEOUT || bus->methods->start_dma_delay != NULL)) { usb_timeout_t temp; /* only delay once */ xfer->flags_int.did_dma_delay = 1; /* we can not cancel this delay */ xfer->flags_int.can_cancel_immed = 0; temp = usbd_get_dma_delay(xfer->xroot->udev); DPRINTFN(3, "DMA delay, %u ms, " "on %p\n", temp, xfer); if (temp != 0) { USB_BUS_LOCK(bus); /* * Some hardware solutions have dedicated * events when it is safe to free DMA'ed * memory. For the other hardware platforms we * use a static delay. 
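			 * The static delay used here is the value
			 * obtained from usbd_get_dma_delay() above,
			 * already rounded up to whole milliseconds.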
*/ if (bus->methods->start_dma_delay != NULL) { (bus->methods->start_dma_delay) (xfer); } else { usbd_transfer_timeout_ms(xfer, (void (*)(void *))&usb_dma_delay_done_cb, temp); } USB_BUS_UNLOCK(bus); return (1); /* wait for new callback */ } } /* check actual number of frames */ if (xfer->aframes > xfer->nframes) { if (xfer->error == 0) { panic("%s: actual number of frames, %d, is " "greater than initial number of frames, %d\n", __FUNCTION__, xfer->aframes, xfer->nframes); } else { /* just set some valid value */ xfer->aframes = xfer->nframes; } } /* compute actual length */ xfer->actlen = 0; for (x = 0; x != xfer->aframes; x++) { xfer->actlen += xfer->frlengths[x]; } /* * Frames that were not transferred get zero actual length in * case the USB device driver does not check the actual number * of frames transferred, "xfer->aframes": */ for (; x < xfer->nframes; x++) { usbd_xfer_set_frame_len(xfer, x, 0); } /* check actual length */ if (xfer->actlen > xfer->sumlen) { if (xfer->error == 0) { panic("%s: actual length, %d, is greater than " "initial length, %d\n", __FUNCTION__, xfer->actlen, xfer->sumlen); } else { /* just set some valid value */ xfer->actlen = xfer->sumlen; } } DPRINTFN(1, "xfer=%p endpoint=%p sts=%d alen=%d, slen=%d, afrm=%d, nfrm=%d\n", xfer, xfer->endpoint, xfer->error, xfer->actlen, xfer->sumlen, xfer->aframes, xfer->nframes); if (xfer->error) { /* end of control transfer, if any */ xfer->flags_int.control_act = 0; #if USB_HAVE_TT_SUPPORT switch (xfer->error) { case USB_ERR_NORMAL_COMPLETION: case USB_ERR_SHORT_XFER: case USB_ERR_STALLED: case USB_ERR_CANCELLED: /* nothing to do */ break; default: /* try to reset the TT, if any */ USB_BUS_LOCK(bus); uhub_tt_buffer_reset_async_locked(xfer->xroot->udev, xfer->endpoint); USB_BUS_UNLOCK(bus); break; } #endif /* check if we should block the execution queue */ if ((xfer->error != USB_ERR_CANCELLED) && (xfer->flags.pipe_bof)) { DPRINTFN(2, "xfer=%p: Block On Failure " "on endpoint=%p\n", xfer, xfer->endpoint); goto done; } } else { /* check for short transfers */ if (xfer->actlen < xfer->sumlen) { /* end of control transfer, if any */ xfer->flags_int.control_act = 0; if (!xfer->flags_int.short_xfer_ok) { xfer->error = USB_ERR_SHORT_XFER; if (xfer->flags.pipe_bof) { DPRINTFN(2, "xfer=%p: Block On Failure on " "Short Transfer on endpoint %p.\n", xfer, xfer->endpoint); goto done; } } } else { /* * Check if we are in the middle of a * control transfer: */ if (xfer->flags_int.control_act) { DPRINTFN(5, "xfer=%p: Control transfer " "active on endpoint=%p\n", xfer, xfer->endpoint); goto done; } } } ep = xfer->endpoint; /* * If the current USB transfer is completing we need to start the * next one: */ USB_BUS_LOCK(bus); if (ep->endpoint_q[xfer->stream_id].curr == xfer) { usb_command_wrapper(&ep->endpoint_q[xfer->stream_id], NULL); if (ep->endpoint_q[xfer->stream_id].curr != NULL || TAILQ_FIRST(&ep->endpoint_q[xfer->stream_id].head) != NULL) { /* there is another USB transfer waiting */ } else { /* this is the last USB transfer */ /* clear isochronous sync flag */ xfer->endpoint->is_synced = 0; } } USB_BUS_UNLOCK(bus); done: return (0); } /*------------------------------------------------------------------------* * usb_command_wrapper * * This function is used to execute commands non-recursivly on an USB * transfer. *------------------------------------------------------------------------*/ void usb_command_wrapper(struct usb_xfer_queue *pq, struct usb_xfer *xfer) { if (xfer) { /* * If the transfer is not already processing, * queue it! 
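 * If another transfer is already being processed on this queue the
 * new one is left on the list and we return.  Otherwise the loop
 * below runs commands with "recurse_1" set; a nested call into this
 * function merely clears "recurse_2", which makes the outer loop
 * iterate once more instead of recursing.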
*/ if (pq->curr != xfer) { usbd_transfer_enqueue(pq, xfer); if (pq->curr != NULL) { /* something is already processing */ DPRINTFN(6, "busy %p\n", pq->curr); return; } } } else { /* Get next element in queue */ pq->curr = NULL; } if (!pq->recurse_1) { /* clear third recurse flag */ pq->recurse_3 = 0; do { /* set two first recurse flags */ pq->recurse_1 = 1; pq->recurse_2 = 1; if (pq->curr == NULL) { xfer = TAILQ_FIRST(&pq->head); if (xfer) { TAILQ_REMOVE(&pq->head, xfer, wait_entry); xfer->wait_queue = NULL; pq->curr = xfer; } else { break; } } DPRINTFN(6, "cb %p (enter)\n", pq->curr); (pq->command) (pq); DPRINTFN(6, "cb %p (leave)\n", pq->curr); /* * Set third recurse flag to indicate * recursion happened: */ pq->recurse_3 = 1; } while (!pq->recurse_2); /* clear first recurse flag */ pq->recurse_1 = 0; } else { /* clear second recurse flag */ pq->recurse_2 = 0; } } /*------------------------------------------------------------------------* * usbd_ctrl_transfer_setup * * This function is used to setup the default USB control endpoint * transfer. *------------------------------------------------------------------------*/ void usbd_ctrl_transfer_setup(struct usb_device *udev) { struct usb_xfer *xfer; uint8_t no_resetup; uint8_t iface_index; /* check for root HUB */ if (udev->parent_hub == NULL) return; repeat: xfer = udev->ctrl_xfer[0]; if (xfer) { USB_XFER_LOCK(xfer); no_resetup = ((xfer->address == udev->address) && (udev->ctrl_ep_desc.wMaxPacketSize[0] == udev->ddesc.bMaxPacketSize)); if (udev->flags.usb_mode == USB_MODE_DEVICE) { if (no_resetup) { /* * NOTE: checking "xfer->address" and * starting the USB transfer must be * atomic! */ usbd_transfer_start(xfer); } } USB_XFER_UNLOCK(xfer); } else { no_resetup = 0; } if (no_resetup) { /* * All parameters are exactly the same like before. * Just return. */ return; } /* * Update wMaxPacketSize for the default control endpoint: */ udev->ctrl_ep_desc.wMaxPacketSize[0] = udev->ddesc.bMaxPacketSize; /* * Unsetup any existing USB transfer: */ usbd_transfer_unsetup(udev->ctrl_xfer, USB_CTRL_XFER_MAX); /* * Reset clear stall error counter. */ udev->clear_stall_errors = 0; /* * Try to setup a new USB transfer for the * default control endpoint: */ iface_index = 0; if (usbd_transfer_setup(udev, &iface_index, - udev->ctrl_xfer, usb_control_ep_cfg, USB_CTRL_XFER_MAX, NULL, + udev->ctrl_xfer, udev->bus->control_ep_quirk ? + usb_control_ep_quirk_cfg : usb_control_ep_cfg, USB_CTRL_XFER_MAX, NULL, &udev->device_mtx)) { DPRINTFN(0, "could not setup default " "USB transfer\n"); } else { goto repeat; } } /*------------------------------------------------------------------------* * usbd_clear_data_toggle - factored out code * * NOTE: the intention of this function is not to reset the hardware * data toggle. *------------------------------------------------------------------------*/ void usbd_clear_stall_locked(struct usb_device *udev, struct usb_endpoint *ep) { USB_BUS_LOCK_ASSERT(udev->bus, MA_OWNED); /* check that we have a valid case */ if (udev->flags.usb_mode == USB_MODE_HOST && udev->parent_hub != NULL && udev->bus->methods->clear_stall != NULL && ep->methods != NULL) { (udev->bus->methods->clear_stall) (udev, ep); } } /*------------------------------------------------------------------------* * usbd_clear_data_toggle - factored out code * * NOTE: the intention of this function is not to reset the hardware * data toggle on the USB device side. 
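 * It resets the host side's expected toggle value ("toggle_next")
 * and, for controllers that cache toggle state in hardware, asks the
 * bus driver to clear that cache via usbd_clear_stall_locked().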
*------------------------------------------------------------------------*/ void usbd_clear_data_toggle(struct usb_device *udev, struct usb_endpoint *ep) { DPRINTFN(5, "udev=%p endpoint=%p\n", udev, ep); USB_BUS_LOCK(udev->bus); ep->toggle_next = 0; /* some hardware needs a callback to clear the data toggle */ usbd_clear_stall_locked(udev, ep); USB_BUS_UNLOCK(udev->bus); } /*------------------------------------------------------------------------* * usbd_clear_stall_callback - factored out clear stall callback * * Input parameters: * xfer1: Clear Stall Control Transfer * xfer2: Stalled USB Transfer * * This function is NULL safe. * * Return values: * 0: In progress * Else: Finished * * Clear stall config example: * * static const struct usb_config my_clearstall = { * .type = UE_CONTROL, * .endpoint = 0, * .direction = UE_DIR_ANY, * .interval = 50, //50 milliseconds * .bufsize = sizeof(struct usb_device_request), * .timeout = 1000, //1.000 seconds * .callback = &my_clear_stall_callback, // ** * .usb_mode = USB_MODE_HOST, * }; * * ** "my_clear_stall_callback" calls "usbd_clear_stall_callback" * passing the correct parameters. *------------------------------------------------------------------------*/ uint8_t usbd_clear_stall_callback(struct usb_xfer *xfer1, struct usb_xfer *xfer2) { struct usb_device_request req; if (xfer2 == NULL) { /* looks like we are tearing down */ DPRINTF("NULL input parameter\n"); return (0); } USB_XFER_LOCK_ASSERT(xfer1, MA_OWNED); USB_XFER_LOCK_ASSERT(xfer2, MA_OWNED); switch (USB_GET_STATE(xfer1)) { case USB_ST_SETUP: /* * pre-clear the data toggle to DATA0 ("umass.c" and * "ata-usb.c" depends on this) */ usbd_clear_data_toggle(xfer2->xroot->udev, xfer2->endpoint); /* setup a clear-stall packet */ req.bmRequestType = UT_WRITE_ENDPOINT; req.bRequest = UR_CLEAR_FEATURE; USETW(req.wValue, UF_ENDPOINT_HALT); req.wIndex[0] = xfer2->endpoint->edesc->bEndpointAddress; req.wIndex[1] = 0; USETW(req.wLength, 0); /* * "usbd_transfer_setup_sub()" will ensure that * we have sufficient room in the buffer for * the request structure! */ /* copy in the transfer */ usbd_copy_in(xfer1->frbuffers, 0, &req, sizeof(req)); /* set length */ xfer1->frlengths[0] = sizeof(req); xfer1->nframes = 1; usbd_transfer_submit(xfer1); return (0); case USB_ST_TRANSFERRED: break; default: /* Error */ if (xfer1->error == USB_ERR_CANCELLED) { return (0); } break; } return (1); /* Clear Stall Finished */ } /*------------------------------------------------------------------------* * usbd_transfer_poll * * The following function gets called from the USB keyboard driver and * UMASS when the system has paniced. * * NOTE: It is currently not possible to resume normal operation on * the USB controller which has been polled, due to clearing of the * "up_dsleep" and "up_msleep" flags. 
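 * The loop below validates each transfer, forcibly drops any bus or
 * transfer mutexes still held, clears the per-process "up_msleep"
 * flags so that cv_signal()/cv_broadcast() are not used, and then
 * polls the controller and the completion callbacks by hand.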
*------------------------------------------------------------------------*/ void usbd_transfer_poll(struct usb_xfer **ppxfer, uint16_t max) { struct usb_xfer *xfer; struct usb_xfer_root *xroot; struct usb_device *udev; struct usb_proc_msg *pm; struct usb_bus *bus; uint16_t n; uint16_t drop_bus_spin; uint16_t drop_bus; uint16_t drop_xfer; for (n = 0; n != max; n++) { /* Extra checks to avoid panic */ xfer = ppxfer[n]; if (xfer == NULL) continue; /* no USB transfer */ xroot = xfer->xroot; if (xroot == NULL) continue; /* no USB root */ udev = xroot->udev; if (udev == NULL) continue; /* no USB device */ bus = udev->bus; if (bus == NULL) continue; /* no BUS structure */ if (bus->methods == NULL) continue; /* no BUS methods */ if (bus->methods->xfer_poll == NULL) continue; /* no poll method */ drop_bus_spin = 0; drop_bus = 0; drop_xfer = 0; if (USB_IN_POLLING_MODE_FUNC() == 0) { /* make sure that the BUS spin mutex is not locked */ while (mtx_owned(&bus->bus_spin_lock)) { mtx_unlock_spin(&bus->bus_spin_lock); drop_bus_spin++; } /* make sure that the BUS mutex is not locked */ while (mtx_owned(&bus->bus_mtx)) { mtx_unlock(&bus->bus_mtx); drop_bus++; } /* make sure that the transfer mutex is not locked */ while (mtx_owned(xroot->xfer_mtx)) { mtx_unlock(xroot->xfer_mtx); drop_xfer++; } } /* Make sure cv_signal() and cv_broadcast() is not called */ USB_BUS_CONTROL_XFER_PROC(bus)->up_msleep = 0; USB_BUS_EXPLORE_PROC(bus)->up_msleep = 0; USB_BUS_GIANT_PROC(bus)->up_msleep = 0; USB_BUS_NON_GIANT_ISOC_PROC(bus)->up_msleep = 0; USB_BUS_NON_GIANT_BULK_PROC(bus)->up_msleep = 0; /* poll USB hardware */ (bus->methods->xfer_poll) (bus); USB_BUS_LOCK(xroot->bus); /* check for clear stall */ if (udev->ctrl_xfer[1] != NULL) { /* poll clear stall start */ pm = &udev->cs_msg[0].hdr; (pm->pm_callback) (pm); /* poll clear stall done thread */ pm = &udev->ctrl_xfer[1]-> xroot->done_m[0].hdr; (pm->pm_callback) (pm); } /* poll done thread */ pm = &xroot->done_m[0].hdr; (pm->pm_callback) (pm); USB_BUS_UNLOCK(xroot->bus); /* restore transfer mutex */ while (drop_xfer--) mtx_lock(xroot->xfer_mtx); /* restore BUS mutex */ while (drop_bus--) mtx_lock(&bus->bus_mtx); /* restore BUS spin mutex */ while (drop_bus_spin--) mtx_lock_spin(&bus->bus_spin_lock); } } static void usbd_get_std_packet_size(struct usb_std_packet_size *ptr, uint8_t type, enum usb_dev_speed speed) { static const uint16_t intr_range_max[USB_SPEED_MAX] = { [USB_SPEED_LOW] = 8, [USB_SPEED_FULL] = 64, [USB_SPEED_HIGH] = 1024, [USB_SPEED_VARIABLE] = 1024, [USB_SPEED_SUPER] = 1024, }; static const uint16_t isoc_range_max[USB_SPEED_MAX] = { [USB_SPEED_LOW] = 0, /* invalid */ [USB_SPEED_FULL] = 1023, [USB_SPEED_HIGH] = 1024, [USB_SPEED_VARIABLE] = 3584, [USB_SPEED_SUPER] = 1024, }; static const uint16_t control_min[USB_SPEED_MAX] = { [USB_SPEED_LOW] = 8, [USB_SPEED_FULL] = 8, [USB_SPEED_HIGH] = 64, [USB_SPEED_VARIABLE] = 512, [USB_SPEED_SUPER] = 512, }; static const uint16_t bulk_min[USB_SPEED_MAX] = { [USB_SPEED_LOW] = 8, [USB_SPEED_FULL] = 8, [USB_SPEED_HIGH] = 512, [USB_SPEED_VARIABLE] = 512, [USB_SPEED_SUPER] = 1024, }; uint16_t temp; memset(ptr, 0, sizeof(*ptr)); switch (type) { case UE_INTERRUPT: ptr->range.max = intr_range_max[speed]; break; case UE_ISOCHRONOUS: ptr->range.max = isoc_range_max[speed]; break; default: if (type == UE_BULK) temp = bulk_min[speed]; else /* UE_CONTROL */ temp = control_min[speed]; /* default is fixed */ ptr->fixed[0] = temp; ptr->fixed[1] = temp; ptr->fixed[2] = temp; ptr->fixed[3] = temp; if (speed == USB_SPEED_FULL) { /* 
multiple sizes */ ptr->fixed[1] = 16; ptr->fixed[2] = 32; ptr->fixed[3] = 64; } if ((speed == USB_SPEED_VARIABLE) && (type == UE_BULK)) { /* multiple sizes */ ptr->fixed[2] = 1024; ptr->fixed[3] = 1536; } break; } } void * usbd_xfer_softc(struct usb_xfer *xfer) { return (xfer->priv_sc); } void * usbd_xfer_get_priv(struct usb_xfer *xfer) { return (xfer->priv_fifo); } void usbd_xfer_set_priv(struct usb_xfer *xfer, void *ptr) { xfer->priv_fifo = ptr; } uint8_t usbd_xfer_state(struct usb_xfer *xfer) { return (xfer->usb_state); } void usbd_xfer_set_flag(struct usb_xfer *xfer, int flag) { switch (flag) { case USB_FORCE_SHORT_XFER: xfer->flags.force_short_xfer = 1; break; case USB_SHORT_XFER_OK: xfer->flags.short_xfer_ok = 1; break; case USB_MULTI_SHORT_OK: xfer->flags.short_frames_ok = 1; break; case USB_MANUAL_STATUS: xfer->flags.manual_status = 1; break; } } void usbd_xfer_clr_flag(struct usb_xfer *xfer, int flag) { switch (flag) { case USB_FORCE_SHORT_XFER: xfer->flags.force_short_xfer = 0; break; case USB_SHORT_XFER_OK: xfer->flags.short_xfer_ok = 0; break; case USB_MULTI_SHORT_OK: xfer->flags.short_frames_ok = 0; break; case USB_MANUAL_STATUS: xfer->flags.manual_status = 0; break; } } /* * The following function returns in milliseconds when the isochronous * transfer was completed by the hardware. The returned value wraps * around 65536 milliseconds. */ uint16_t usbd_xfer_get_timestamp(struct usb_xfer *xfer) { return (xfer->isoc_time_complete); } /* * The following function returns non-zero if the max packet size * field was clamped to a valid value. Else it returns zero. */ uint8_t usbd_xfer_maxp_was_clamped(struct usb_xfer *xfer) { return (xfer->flags_int.maxp_was_clamped); } Index: projects/clang900-import/sys/fs/msdosfs/msdosfs_denode.c =================================================================== --- projects/clang900-import/sys/fs/msdosfs/msdosfs_denode.c (revision 352586) +++ projects/clang900-import/sys/fs/msdosfs/msdosfs_denode.c (revision 352587) @@ -1,616 +1,617 @@ /* $FreeBSD$ */ /* $NetBSD: msdosfs_denode.c,v 1.28 1998/02/10 14:10:00 mrg Exp $ */ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1995, 1997 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. * * October 1992 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_MSDOSFSNODE, "msdosfs_node", "MSDOSFS vnode private part"); static int de_vncmpf(struct vnode *vp, void *arg) { struct denode *de; uint64_t *a; a = arg; de = VTODE(vp); - return (de->de_inode != *a); + return (de->de_inode != *a) || (de->de_refcnt <= 0); } /* * If deget() succeeds it returns with the gotten denode locked(). * * pmp - address of msdosfsmount structure of the filesystem containing * the denode of interest. The address of * the msdosfsmount structure are used. * dirclust - which cluster bp contains, if dirclust is 0 (root directory) * diroffset is relative to the beginning of the root directory, * otherwise it is cluster relative. * diroffset - offset past begin of cluster of denode we want * depp - returns the address of the gotten denode. */ int deget(struct msdosfsmount *pmp, u_long dirclust, u_long diroffset, struct denode **depp) { int error; uint64_t inode; struct mount *mntp = pmp->pm_mountp; struct direntry *direntptr; struct denode *ldep; struct vnode *nvp, *xvp; struct buf *bp; #ifdef MSDOSFS_DEBUG printf("deget(pmp %p, dirclust %lu, diroffset %lx, depp %p)\n", pmp, dirclust, diroffset, depp); #endif /* * On FAT32 filesystems, root is a (more or less) normal * directory */ if (FAT32(pmp) && dirclust == MSDOSFSROOT) dirclust = pmp->pm_rootdirblk; /* * See if the denode is in the denode cache. Use the location of * the directory entry to compute the hash value. For subdir use * address of "." entry. For root dir (if not FAT32) use cluster * MSDOSFSROOT, offset MSDOSFSROOT_OFS * - * NOTE: The check for de_refcnt > 0 below insures the denode being - * examined does not represent an unlinked but still open file. + * NOTE: de_vncmpf will explicitly skip any denodes that do not have + * a de_refcnt > 0. This insures that that we do not attempt to use + * a denode that represents an unlinked but still open file. * These files are not to be accessible even when the directory * entry that represented the file happens to be reused while the * deleted file is still open. 
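 * The key used below is derived from the entry's location: for
 * example, assuming pm_bpcluster is 4096, an entry at offset 0x40 of
 * cluster 5 hashes as 5 * 4096 + 64 = 20544.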
*/ inode = (uint64_t)pmp->pm_bpcluster * dirclust + diroffset; error = vfs_hash_get(mntp, inode, LK_EXCLUSIVE, curthread, &nvp, de_vncmpf, &inode); if (error) return (error); if (nvp != NULL) { *depp = VTODE(nvp); KASSERT((*depp)->de_dirclust == dirclust, ("wrong dirclust")); KASSERT((*depp)->de_diroffset == diroffset, ("wrong diroffset")); return (0); } ldep = malloc(sizeof(struct denode), M_MSDOSFSNODE, M_WAITOK | M_ZERO); /* * Directory entry was not in cache, have to create a vnode and * copy it from the passed disk buffer. */ /* getnewvnode() does a VREF() on the vnode */ error = getnewvnode("msdosfs", mntp, &msdosfs_vnodeops, &nvp); if (error) { *depp = NULL; free(ldep, M_MSDOSFSNODE); return error; } nvp->v_data = ldep; ldep->de_vnode = nvp; ldep->de_flag = 0; ldep->de_dirclust = dirclust; ldep->de_diroffset = diroffset; ldep->de_inode = inode; lockmgr(nvp->v_vnlock, LK_EXCLUSIVE, NULL); fc_purge(ldep, 0); /* init the FAT cache for this denode */ error = insmntque(nvp, mntp); if (error != 0) { free(ldep, M_MSDOSFSNODE); *depp = NULL; return (error); } error = vfs_hash_insert(nvp, inode, LK_EXCLUSIVE, curthread, &xvp, de_vncmpf, &inode); if (error) { *depp = NULL; return (error); } if (xvp != NULL) { *depp = xvp->v_data; return (0); } ldep->de_pmp = pmp; ldep->de_refcnt = 1; /* * Copy the directory entry into the denode area of the vnode. */ if ((dirclust == MSDOSFSROOT || (FAT32(pmp) && dirclust == pmp->pm_rootdirblk)) && diroffset == MSDOSFSROOT_OFS) { /* * Directory entry for the root directory. There isn't one, * so we manufacture one. We should probably rummage * through the root directory and find a label entry (if it * exists), and then use the time and date from that entry * as the time and date for the root denode. */ nvp->v_vflag |= VV_ROOT; /* should be further down XXX */ ldep->de_Attributes = ATTR_DIRECTORY; ldep->de_LowerCase = 0; if (FAT32(pmp)) ldep->de_StartCluster = pmp->pm_rootdirblk; /* de_FileSize will be filled in further down */ else { ldep->de_StartCluster = MSDOSFSROOT; ldep->de_FileSize = pmp->pm_rootdirsize * DEV_BSIZE; } /* * fill in time and date so that fattime2timespec() doesn't * spit up when called from msdosfs_getattr() with root * denode */ ldep->de_CHun = 0; ldep->de_CTime = 0x0000; /* 00:00:00 */ ldep->de_CDate = (0 << DD_YEAR_SHIFT) | (1 << DD_MONTH_SHIFT) | (1 << DD_DAY_SHIFT); /* Jan 1, 1980 */ ldep->de_ADate = ldep->de_CDate; ldep->de_MTime = ldep->de_CTime; ldep->de_MDate = ldep->de_CDate; /* leave the other fields as garbage */ } else { error = readep(pmp, dirclust, diroffset, &bp, &direntptr); if (error) { /* * The denode does not contain anything useful, so * it would be wrong to leave it on its hash chain. * Arrange for vput() to just forget about it. */ ldep->de_Name[0] = SLOT_DELETED; vput(nvp); *depp = NULL; return (error); } (void)DE_INTERNALIZE(ldep, direntptr); brelse(bp); } /* * Fill in a few fields of the vnode and finish filling in the * denode. Then return the address of the found denode. */ if (ldep->de_Attributes & ATTR_DIRECTORY) { /* * Since DOS directory entries that describe directories * have 0 in the filesize field, we take this opportunity * to find out the length of the directory and plug it into * the denode structure. */ u_long size; /* * XXX it sometimes happens that the "." entry has cluster * number 0 when it shouldn't. Use the actual cluster number * instead of what is written in directory entry. 
*/ if (diroffset == 0 && ldep->de_StartCluster != dirclust) { #ifdef MSDOSFS_DEBUG printf("deget(): \".\" entry at clust %lu != %lu\n", dirclust, ldep->de_StartCluster); #endif ldep->de_StartCluster = dirclust; } nvp->v_type = VDIR; if (ldep->de_StartCluster != MSDOSFSROOT) { error = pcbmap(ldep, 0xffff, 0, &size, 0); if (error == E2BIG) { ldep->de_FileSize = de_cn2off(pmp, size); error = 0; } else { #ifdef MSDOSFS_DEBUG printf("deget(): pcbmap returned %d\n", error); #endif } } } else nvp->v_type = VREG; ldep->de_modrev = init_va_filerev(); *depp = ldep; return (0); } int deupdat(struct denode *dep, int waitfor) { struct direntry dir; struct timespec ts; struct buf *bp; struct direntry *dirp; int error; if (DETOV(dep)->v_mount->mnt_flag & MNT_RDONLY) { dep->de_flag &= ~(DE_UPDATE | DE_CREATE | DE_ACCESS | DE_MODIFIED); return (0); } vfs_timestamp(&ts); DETIMES(dep, &ts, &ts, &ts); if ((dep->de_flag & DE_MODIFIED) == 0 && waitfor == 0) return (0); dep->de_flag &= ~DE_MODIFIED; if (DETOV(dep)->v_vflag & VV_ROOT) return (EINVAL); if (dep->de_refcnt <= 0) return (0); error = readde(dep, &bp, &dirp); if (error) return (error); DE_EXTERNALIZE(&dir, dep); if (bcmp(dirp, &dir, sizeof(dir)) == 0) { if (waitfor == 0 || (bp->b_flags & B_DELWRI) == 0) { brelse(bp); return (0); } } else *dirp = dir; if ((DETOV(dep)->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) bp->b_flags |= B_CLUSTEROK; if (waitfor) error = bwrite(bp); else if (vm_page_count_severe() || buf_dirty_count_severe()) bawrite(bp); else bdwrite(bp); return (error); } /* * Truncate the file described by dep to the length specified by length. */ int detrunc(struct denode *dep, u_long length, int flags, struct ucred *cred) { int error; int allerror; u_long eofentry; u_long chaintofree; daddr_t bn; int boff; int isadir = dep->de_Attributes & ATTR_DIRECTORY; struct buf *bp; struct msdosfsmount *pmp = dep->de_pmp; #ifdef MSDOSFS_DEBUG printf("detrunc(): file %s, length %lu, flags %x\n", dep->de_Name, length, flags); #endif /* * Disallow attempts to truncate the root directory since it is of * fixed size. That's just the way dos filesystems are. We use * the VROOT bit in the vnode because checking for the directory * bit and a startcluster of 0 in the denode is not adequate to * recognize the root directory at this point in a file or * directory's life. */ if ((DETOV(dep)->v_vflag & VV_ROOT) && !FAT32(pmp)) { #ifdef MSDOSFS_DEBUG printf("detrunc(): can't truncate root directory, clust %ld, offset %ld\n", dep->de_dirclust, dep->de_diroffset); #endif return (EINVAL); } if (dep->de_FileSize < length) { vnode_pager_setsize(DETOV(dep), length); return deextend(dep, length, cred); } /* * If the desired length is 0 then remember the starting cluster of * the file and set the StartCluster field in the directory entry * to 0. If the desired length is not zero, then get the number of * the last cluster in the shortened file. Then get the number of * the first cluster in the part of the file that is to be freed. * Then set the next cluster pointer in the last cluster of the * file to CLUST_EOFE. 
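 * For example, truncating a 10000 byte file to 5000 bytes on a
 * filesystem with 4096 byte clusters keeps de_clcount() == 2
 * clusters: pcbmap() returns the second cluster as "eofentry",
 * fatentry() marks it CLUST_EOFE while handing back the old third
 * cluster in "chaintofree", and freeclusterchain() releases the rest
 * of the chain.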
*/ if (length == 0) { chaintofree = dep->de_StartCluster; dep->de_StartCluster = 0; eofentry = ~0; } else { error = pcbmap(dep, de_clcount(pmp, length) - 1, 0, &eofentry, 0); if (error) { #ifdef MSDOSFS_DEBUG printf("detrunc(): pcbmap fails %d\n", error); #endif return (error); } } fc_purge(dep, de_clcount(pmp, length)); /* * If the new length is not a multiple of the cluster size then we * must zero the tail end of the new last cluster in case it * becomes part of the file again because of a seek. */ if ((boff = length & pmp->pm_crbomask) != 0) { if (isadir) { bn = cntobn(pmp, eofentry); error = bread(pmp->pm_devvp, bn, pmp->pm_bpcluster, NOCRED, &bp); } else { error = bread(DETOV(dep), de_cluster(pmp, length), pmp->pm_bpcluster, cred, &bp); } if (error) { #ifdef MSDOSFS_DEBUG printf("detrunc(): bread fails %d\n", error); #endif return (error); } memset(bp->b_data + boff, 0, pmp->pm_bpcluster - boff); if ((flags & IO_SYNC) != 0) bwrite(bp); else bdwrite(bp); } /* * Write out the updated directory entry. Even if the update fails * we free the trailing clusters. */ dep->de_FileSize = length; if (!isadir) dep->de_flag |= DE_UPDATE | DE_MODIFIED; allerror = vtruncbuf(DETOV(dep), length, pmp->pm_bpcluster); #ifdef MSDOSFS_DEBUG if (allerror) printf("detrunc(): vtruncbuf error %d\n", allerror); #endif error = deupdat(dep, !DOINGASYNC((DETOV(dep)))); if (error != 0 && allerror == 0) allerror = error; #ifdef MSDOSFS_DEBUG printf("detrunc(): allerror %d, eofentry %lu\n", allerror, eofentry); #endif /* * If we need to break the cluster chain for the file then do it * now. */ if (eofentry != ~0) { error = fatentry(FAT_GET_AND_SET, pmp, eofentry, &chaintofree, CLUST_EOFE); if (error) { #ifdef MSDOSFS_DEBUG printf("detrunc(): fatentry errors %d\n", error); #endif return (error); } fc_setcache(dep, FC_LASTFC, de_cluster(pmp, length - 1), eofentry); } /* * Now free the clusters removed from the file because of the * truncation. */ if (chaintofree != 0 && !MSDOSFSEOF(pmp, chaintofree)) freeclusterchain(pmp, chaintofree); return (allerror); } /* * Extend the file described by dep to length specified by length. */ int deextend(struct denode *dep, u_long length, struct ucred *cred) { struct msdosfsmount *pmp = dep->de_pmp; u_long count; int error; /* * The root of a DOS filesystem cannot be extended. */ if ((DETOV(dep)->v_vflag & VV_ROOT) && !FAT32(pmp)) return (EINVAL); /* * Directories cannot be extended. */ if (dep->de_Attributes & ATTR_DIRECTORY) return (EISDIR); if (length <= dep->de_FileSize) panic("deextend: file too large"); /* * Compute the number of clusters to allocate. */ count = de_clcount(pmp, length) - de_clcount(pmp, dep->de_FileSize); if (count > 0) { if (count > pmp->pm_freeclustercount) return (ENOSPC); error = extendfile(dep, count, NULL, NULL, DE_CLEAR); if (error) { /* truncate the added clusters away again */ (void) detrunc(dep, dep->de_FileSize, 0, cred); return (error); } } dep->de_FileSize = length; dep->de_flag |= DE_UPDATE | DE_MODIFIED; return (deupdat(dep, !DOINGASYNC(DETOV(dep)))); } /* * Move a denode to its correct hash queue after the file it represents has * been moved to a new directory. */ void reinsert(struct denode *dep) { struct vnode *vp; /* * Fix up the denode cache. If the denode is for a directory, * there is nothing to do since the hash is based on the starting * cluster of the directory file and that hasn't changed. 
If for a * file the hash is based on the location of the directory entry, * so we must remove it from the cache and re-enter it with the * hash based on the new location of the directory entry. */ #if 0 if (dep->de_Attributes & ATTR_DIRECTORY) return; #endif vp = DETOV(dep); dep->de_inode = (uint64_t)dep->de_pmp->pm_bpcluster * dep->de_dirclust + dep->de_diroffset; vfs_hash_rehash(vp, dep->de_inode); } int msdosfs_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); #ifdef MSDOSFS_DEBUG printf("msdosfs_reclaim(): dep %p, file %s, refcnt %ld\n", dep, dep->de_Name, dep->de_refcnt); #endif /* * Remove the denode from its hash chain. */ vfs_hash_remove(vp); /* * Purge old data structures associated with the denode. */ #if 0 /* XXX */ dep->de_flag = 0; #endif free(dep, M_MSDOSFSNODE); vp->v_data = NULL; return (0); } int msdosfs_inactive(struct vop_inactive_args *ap) { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); int error = 0; #ifdef MSDOSFS_DEBUG printf("msdosfs_inactive(): dep %p, de_Name[0] %x\n", dep, dep->de_Name[0]); #endif /* * Ignore denodes related to stale file handles. */ if (dep->de_Name[0] == SLOT_DELETED || dep->de_Name[0] == SLOT_EMPTY) goto out; /* * If the file has been deleted and it is on a read/write * filesystem, then truncate the file, and mark the directory slot * as empty. (This may not be necessary for the dos filesystem.) */ #ifdef MSDOSFS_DEBUG printf("msdosfs_inactive(): dep %p, refcnt %ld, mntflag %llx, MNT_RDONLY %llx\n", dep, dep->de_refcnt, (unsigned long long)vp->v_mount->mnt_flag, (unsigned long long)MNT_RDONLY); #endif if (dep->de_refcnt <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { error = detrunc(dep, (u_long) 0, 0, NOCRED); dep->de_flag |= DE_UPDATE; dep->de_Name[0] = SLOT_DELETED; } deupdat(dep, 0); out: /* * If we are done with the denode, reclaim it * so that it can be reused immediately. */ #ifdef MSDOSFS_DEBUG printf("msdosfs_inactive(): v_usecount %d, de_Name[0] %x\n", vrefcnt(vp), dep->de_Name[0]); #endif if (dep->de_Name[0] == SLOT_DELETED || dep->de_Name[0] == SLOT_EMPTY) vrecycle(vp); return (error); } Index: projects/clang900-import/sys/kern/imgact_elf.c =================================================================== --- projects/clang900-import/sys/kern/imgact_elf.c (revision 352586) +++ projects/clang900-import/sys/kern/imgact_elf.c (revision 352587) @@ -1,2744 +1,2765 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2017 Dell EMC * Copyright (c) 2000-2001, 2003 David O'Brien * Copyright (c) 1995-1996 Søren Schmidt * Copyright (c) 1996 Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ELF_NOTE_ROUNDSIZE 4 #define OLD_EI_BRAND 8 static int __elfN(check_header)(const Elf_Ehdr *hdr); static Elf_Brandinfo *__elfN(get_brandinfo)(struct image_params *imgp, const char *interp, int32_t *osrel, uint32_t *fctl0); static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr, u_long *entry); static int __elfN(load_section)(struct image_params *imgp, vm_ooffset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot); static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp); static bool __elfN(freebsd_trans_osrel)(const Elf_Note *note, int32_t *osrel); static bool kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel); static boolean_t __elfN(check_note)(struct image_params *imgp, Elf_Brandnote *checknote, int32_t *osrel, uint32_t *fctl0); static vm_prot_t __elfN(trans_prot)(Elf_Word); static Elf_Word __elfN(untrans_prot)(vm_prot_t); SYSCTL_NODE(_kern, OID_AUTO, __CONCAT(elf, __ELF_WORD_SIZE), CTLFLAG_RW, 0, ""); #define CORE_BUF_SIZE (16 * 1024) int __elfN(fallback_brand) = -1; SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, fallback_brand, CTLFLAG_RWTUN, &__elfN(fallback_brand), 0, __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) " brand of last resort"); static int elf_legacy_coredump = 0; SYSCTL_INT(_debug, OID_AUTO, __elfN(legacy_coredump), CTLFLAG_RW, &elf_legacy_coredump, 0, "include all and only RW pages in core dumps"); int __elfN(nxstack) = #if defined(__amd64__) || defined(__powerpc64__) /* both 64 and 32 bit */ || \ (defined(__arm__) && __ARM_ARCH >= 7) || defined(__aarch64__) || \ defined(__riscv) 1; #else 0; #endif SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, nxstack, CTLFLAG_RW, &__elfN(nxstack), 0, __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": enable non-executable stack"); #if __ELF_WORD_SIZE == 32 && (defined(__amd64__) || defined(__i386__)) int i386_read_exec = 0; SYSCTL_INT(_kern_elf32, OID_AUTO, read_exec, CTLFLAG_RW, &i386_read_exec, 0, "enable execution from readable segments"); #endif +static u_long __elfN(pie_base) = ET_DYN_LOAD_ADDR; +static int +sysctl_pie_base(SYSCTL_HANDLER_ARGS) +{ + u_long val; + int error; + + val = __elfN(pie_base); + error = sysctl_handle_long(oidp, &val, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if ((val & PAGE_MASK) != 0) + return (EINVAL); + __elfN(pie_base) = val; + return (0); +} +SYSCTL_PROC(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, pie_base, + CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, + sysctl_pie_base, "LU", + "PIE load base without randomization"); + 
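/*
 * The handler above exposes the PIE load base as kern.elf64.pie_base
 * (or kern.elf32.pie_base for the 32-bit image activator); values
 * that are not page aligned are rejected with EINVAL.  For example,
 * "sysctl kern.elf64.pie_base=0x10000" (an arbitrary page-aligned
 * value) moves non-randomized PIE executables to that base.
 */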
SYSCTL_NODE(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, aslr, CTLFLAG_RW, 0, ""); #define ASLR_NODE_OID __CONCAT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), _aslr) static int __elfN(aslr_enabled) = 0; SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, enable, CTLFLAG_RWTUN, &__elfN(aslr_enabled), 0, __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": enable address map randomization"); static int __elfN(pie_aslr_enabled) = 0; SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, pie_enable, CTLFLAG_RWTUN, &__elfN(pie_aslr_enabled), 0, __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": enable address map randomization for PIE binaries"); static int __elfN(aslr_honor_sbrk) = 1; SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, honor_sbrk, CTLFLAG_RW, &__elfN(aslr_honor_sbrk), 0, __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": assume sbrk is used"); static int __elfN(aslr_stack_gap) = 3; SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, stack_gap, CTLFLAG_RW, &__elfN(aslr_stack_gap), 0, __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": maximum percentage of main stack to waste on a random gap"); static Elf_Brandinfo *elf_brand_list[MAX_BRANDS]; #define aligned(a, t) (rounddown2((u_long)(a), sizeof(t)) == (u_long)(a)) static const char FREEBSD_ABI_VENDOR[] = "FreeBSD"; Elf_Brandnote __elfN(freebsd_brandnote) = { .hdr.n_namesz = sizeof(FREEBSD_ABI_VENDOR), .hdr.n_descsz = sizeof(int32_t), .hdr.n_type = NT_FREEBSD_ABI_TAG, .vendor = FREEBSD_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = __elfN(freebsd_trans_osrel) }; static bool __elfN(freebsd_trans_osrel)(const Elf_Note *note, int32_t *osrel) { uintptr_t p; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE); *osrel = *(const int32_t *)(p); return (true); } static const char GNU_ABI_VENDOR[] = "GNU"; static int GNU_KFREEBSD_ABI_DESC = 3; Elf_Brandnote __elfN(kfreebsd_brandnote) = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, /* XXX at least 16 */ .hdr.n_type = 1, .vendor = GNU_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = kfreebsd_trans_osrel }; static bool kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel) { const Elf32_Word *desc; uintptr_t p; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE); desc = (const Elf32_Word *)p; if (desc[0] != GNU_KFREEBSD_ABI_DESC) return (false); /* * Debian GNU/kFreeBSD embed the earliest compatible kernel version * (__FreeBSD_version: Rxx) in the LSB way. */ *osrel = desc[1] * 100000 + desc[2] * 1000 + desc[3]; return (true); } int __elfN(insert_brand_entry)(Elf_Brandinfo *entry) { int i; for (i = 0; i < MAX_BRANDS; i++) { if (elf_brand_list[i] == NULL) { elf_brand_list[i] = entry; break; } } if (i == MAX_BRANDS) { printf("WARNING: %s: could not insert brandinfo entry: %p\n", __func__, entry); return (-1); } return (0); } int __elfN(remove_brand_entry)(Elf_Brandinfo *entry) { int i; for (i = 0; i < MAX_BRANDS; i++) { if (elf_brand_list[i] == entry) { elf_brand_list[i] = NULL; break; } } if (i == MAX_BRANDS) return (-1); return (0); } int __elfN(brand_inuse)(Elf_Brandinfo *entry) { struct proc *p; int rval = FALSE; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_sysent == entry->sysvec) { rval = TRUE; break; } } sx_sunlock(&allproc_lock); return (rval); } static Elf_Brandinfo * __elfN(get_brandinfo)(struct image_params *imgp, const char *interp, int32_t *osrel, uint32_t *fctl0) { const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header; Elf_Brandinfo *bi, *bi_m; boolean_t ret; int i, interp_name_len; interp_name_len = interp != NULL ? 
strlen(interp) + 1 : 0; /* * We support four types of branding -- (1) the ELF EI_OSABI field * that SCO added to the ELF spec, (2) FreeBSD 3.x's traditional string * branding w/in the ELF header, (3) path of the `interp_path' * field, and (4) the ".note.ABI-tag" ELF section. */ /* Look for an ".note.ABI-tag" ELF section */ bi_m = NULL; for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi == NULL) continue; if (interp != NULL && (bi->flags & BI_BRAND_ONLY_STATIC) != 0) continue; if (hdr->e_machine == bi->machine && (bi->flags & (BI_BRAND_NOTE|BI_BRAND_NOTE_MANDATORY)) != 0) { ret = __elfN(check_note)(imgp, bi->brand_note, osrel, fctl0); /* Give brand a chance to veto check_note's guess */ if (ret && bi->header_supported) ret = bi->header_supported(imgp); /* * If note checker claimed the binary, but the * interpreter path in the image does not * match default one for the brand, try to * search for other brands with the same * interpreter. Either there is better brand * with the right interpreter, or, failing * this, we return first brand which accepted * our note and, optionally, header. */ if (ret && bi_m == NULL && interp != NULL && (bi->interp_path == NULL || (strlen(bi->interp_path) + 1 != interp_name_len || strncmp(interp, bi->interp_path, interp_name_len) != 0))) { bi_m = bi; ret = 0; } if (ret) return (bi); } } if (bi_m != NULL) return (bi_m); /* If the executable has a brand, search for it in the brand list. */ for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi == NULL || (bi->flags & BI_BRAND_NOTE_MANDATORY) != 0 || (interp != NULL && (bi->flags & BI_BRAND_ONLY_STATIC) != 0)) continue; if (hdr->e_machine == bi->machine && (hdr->e_ident[EI_OSABI] == bi->brand || (bi->compat_3_brand != NULL && strcmp((const char *)&hdr->e_ident[OLD_EI_BRAND], bi->compat_3_brand) == 0))) { /* Looks good, but give brand a chance to veto */ if (bi->header_supported == NULL || bi->header_supported(imgp)) { /* * Again, prefer strictly matching * interpreter path. */ if (interp_name_len == 0 && bi->interp_path == NULL) return (bi); if (bi->interp_path != NULL && strlen(bi->interp_path) + 1 == interp_name_len && strncmp(interp, bi->interp_path, interp_name_len) == 0) return (bi); if (bi_m == NULL) bi_m = bi; } } } if (bi_m != NULL) return (bi_m); /* No known brand, see if the header is recognized by any brand */ for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi == NULL || bi->flags & BI_BRAND_NOTE_MANDATORY || bi->header_supported == NULL) continue; if (hdr->e_machine == bi->machine) { ret = bi->header_supported(imgp); if (ret) return (bi); } } /* Lacking a known brand, search for a recognized interpreter. 
*/ if (interp != NULL) { for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi == NULL || (bi->flags & (BI_BRAND_NOTE_MANDATORY | BI_BRAND_ONLY_STATIC)) != 0) continue; if (hdr->e_machine == bi->machine && bi->interp_path != NULL && /* ELF image p_filesz includes terminating zero */ strlen(bi->interp_path) + 1 == interp_name_len && strncmp(interp, bi->interp_path, interp_name_len) == 0 && (bi->header_supported == NULL || bi->header_supported(imgp))) return (bi); } } /* Lacking a recognized interpreter, try the default brand */ for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi == NULL || (bi->flags & BI_BRAND_NOTE_MANDATORY) != 0 || (interp != NULL && (bi->flags & BI_BRAND_ONLY_STATIC) != 0)) continue; if (hdr->e_machine == bi->machine && __elfN(fallback_brand) == bi->brand && (bi->header_supported == NULL || bi->header_supported(imgp))) return (bi); } return (NULL); } static int __elfN(check_header)(const Elf_Ehdr *hdr) { Elf_Brandinfo *bi; int i; if (!IS_ELF(*hdr) || hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || hdr->e_ident[EI_DATA] != ELF_TARG_DATA || hdr->e_ident[EI_VERSION] != EV_CURRENT || hdr->e_phentsize != sizeof(Elf_Phdr) || hdr->e_version != ELF_TARG_VER) return (ENOEXEC); /* * Make sure we have at least one brand for this machine. */ for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi != NULL && bi->machine == hdr->e_machine) break; } if (i == MAX_BRANDS) return (ENOEXEC); return (0); } static int __elfN(map_partial)(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t prot) { struct sf_buf *sf; int error; vm_offset_t off; /* * Create the page if it doesn't exist yet. Ignore errors. */ vm_map_fixed(map, NULL, 0, trunc_page(start), round_page(end) - trunc_page(start), VM_PROT_ALL, VM_PROT_ALL, MAP_CHECK_EXCL); /* * Find the page from the underlying object. */ if (object != NULL) { sf = vm_imgact_map_page(object, offset); if (sf == NULL) return (KERN_FAILURE); off = offset - trunc_page(offset); error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)start, end - start); vm_imgact_unmap_page(sf); if (error != 0) return (KERN_FAILURE); } return (KERN_SUCCESS); } static int __elfN(map_insert)(struct image_params *imgp, vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t prot, int cow) { struct sf_buf *sf; vm_offset_t off; vm_size_t sz; int error, locked, rv; if (start != trunc_page(start)) { rv = __elfN(map_partial)(map, object, offset, start, round_page(start), prot); if (rv != KERN_SUCCESS) return (rv); offset += round_page(start) - start; start = round_page(start); } if (end != round_page(end)) { rv = __elfN(map_partial)(map, object, offset + trunc_page(end) - start, trunc_page(end), end, prot); if (rv != KERN_SUCCESS) return (rv); end = trunc_page(end); } if (start >= end) return (KERN_SUCCESS); if ((offset & PAGE_MASK) != 0) { /* * The mapping is not page aligned. This means that we have * to copy the data. 
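 * The copy path below maps anonymous, writable pages at the target
 * range and then copies the file contents out page by page through
 * temporary sf_buf mappings of the backing object.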
*/ rv = vm_map_fixed(map, NULL, 0, start, end - start, prot | VM_PROT_WRITE, VM_PROT_ALL, MAP_CHECK_EXCL); if (rv != KERN_SUCCESS) return (rv); if (object == NULL) return (KERN_SUCCESS); for (; start < end; start += sz) { sf = vm_imgact_map_page(object, offset); if (sf == NULL) return (KERN_FAILURE); off = offset - trunc_page(offset); sz = end - start; if (sz > PAGE_SIZE - off) sz = PAGE_SIZE - off; error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)start, sz); vm_imgact_unmap_page(sf); if (error != 0) return (KERN_FAILURE); offset += sz; } } else { vm_object_reference(object); rv = vm_map_fixed(map, object, offset, start, end - start, prot, VM_PROT_ALL, cow | MAP_CHECK_EXCL | (object != NULL ? MAP_VN_EXEC : 0)); if (rv != KERN_SUCCESS) { locked = VOP_ISLOCKED(imgp->vp); VOP_UNLOCK(imgp->vp, 0); vm_object_deallocate(object); vn_lock(imgp->vp, locked | LK_RETRY); return (rv); } else if (object != NULL) { MPASS(imgp->vp->v_object == object); VOP_SET_TEXT_CHECKED(imgp->vp); } } return (KERN_SUCCESS); } static int __elfN(load_section)(struct image_params *imgp, vm_ooffset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot) { struct sf_buf *sf; size_t map_len; vm_map_t map; vm_object_t object; vm_offset_t map_addr; int error, rv, cow; size_t copy_len; vm_ooffset_t file_addr; /* * It's necessary to fail if the filsz + offset taken from the * header is greater than the actual file pager object's size. * If we were to allow this, then the vm_map_find() below would * walk right off the end of the file object and into the ether. * * While I'm here, might as well check for something else that * is invalid: filsz cannot be greater than memsz. */ if ((filsz != 0 && (off_t)filsz + offset > imgp->attr->va_size) || filsz > memsz) { uprintf("elf_load_section: truncated ELF file\n"); return (ENOEXEC); } object = imgp->object; map = &imgp->proc->p_vmspace->vm_map; map_addr = trunc_page((vm_offset_t)vmaddr); file_addr = trunc_page(offset); /* * We have two choices. We can either clear the data in the last page * of an oversized mapping, or we can start the anon mapping a page * early and copy the initialized data into that first page. We * choose the second. */ if (filsz == 0) map_len = 0; else if (memsz > filsz) map_len = trunc_page(offset + filsz) - file_addr; else map_len = round_page(offset + filsz) - file_addr; if (map_len != 0) { /* cow flags: don't dump readonly sections in core */ cow = MAP_COPY_ON_WRITE | MAP_PREFAULT | (prot & VM_PROT_WRITE ? 0 : MAP_DISABLE_COREDUMP); rv = __elfN(map_insert)(imgp, map, object, file_addr, map_addr, map_addr + map_len, prot, cow); if (rv != KERN_SUCCESS) return (EINVAL); /* we can stop now if we've covered it all */ if (memsz == filsz) return (0); } /* * We have to get the remaining bit of the file into the first part * of the oversized map segment. This is normally because the .data * segment in the file is extended to provide bss. It's a neat idea * to try and save a page, but it's a pain in the behind to implement. */ copy_len = filsz == 0 ? 0 : (offset + filsz) - trunc_page(offset + filsz); map_addr = trunc_page((vm_offset_t)vmaddr + filsz); map_len = round_page((vm_offset_t)vmaddr + memsz) - map_addr; /* This had damn well better be true! 
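 * map_addr/map_len now cover the page aligned anonymous region that
 * provides the bss beyond the end of the file backed data.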
*/ if (map_len != 0) { rv = __elfN(map_insert)(imgp, map, NULL, 0, map_addr, map_addr + map_len, prot, 0); if (rv != KERN_SUCCESS) return (EINVAL); } if (copy_len != 0) { sf = vm_imgact_map_page(object, offset + filsz); if (sf == NULL) return (EIO); /* send the page fragment to user space */ error = copyout((caddr_t)sf_buf_kva(sf), (caddr_t)map_addr, copy_len); vm_imgact_unmap_page(sf); if (error != 0) return (error); } /* * Remove write access to the page if it was only granted by map_insert * to allow copyout. */ if ((prot & VM_PROT_WRITE) == 0) vm_map_protect(map, trunc_page(map_addr), round_page(map_addr + map_len), prot, FALSE); return (0); } static int __elfN(load_sections)(struct image_params *imgp, const Elf_Ehdr *hdr, const Elf_Phdr *phdr, u_long rbase, u_long *base_addrp) { vm_prot_t prot; u_long base_addr; bool first; int error, i; ASSERT_VOP_LOCKED(imgp->vp, __func__); base_addr = 0; first = true; for (i = 0; i < hdr->e_phnum; i++) { if (phdr[i].p_type != PT_LOAD || phdr[i].p_memsz == 0) continue; /* Loadable segment */ prot = __elfN(trans_prot)(phdr[i].p_flags); error = __elfN(load_section)(imgp, phdr[i].p_offset, (caddr_t)(uintptr_t)phdr[i].p_vaddr + rbase, phdr[i].p_memsz, phdr[i].p_filesz, prot); if (error != 0) return (error); /* * Establish the base address if this is the first segment. */ if (first) { base_addr = trunc_page(phdr[i].p_vaddr + rbase); first = false; } } if (base_addrp != NULL) *base_addrp = base_addr; return (0); } /* * Load the file "file" into memory. It may be either a shared object * or an executable. * * The "addr" reference parameter is in/out. On entry, it specifies * the address where a shared object should be loaded. If the file is * an executable, this value is ignored. On exit, "addr" specifies * where the file was actually loaded. * * The "entry" reference parameter is out only. On exit, it specifies * the entry point for the loaded file. */ static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr, u_long *entry) { struct { struct nameidata nd; struct vattr attr; struct image_params image_params; } *tempdata; const Elf_Ehdr *hdr = NULL; const Elf_Phdr *phdr = NULL; struct nameidata *nd; struct vattr *attr; struct image_params *imgp; u_long rbase; u_long base_addr = 0; int error; #ifdef CAPABILITY_MODE /* * XXXJA: This check can go away once we are sufficiently confident * that the checks in namei() are correct. */ if (IN_CAPABILITY_MODE(curthread)) return (ECAPMODE); #endif tempdata = malloc(sizeof(*tempdata), M_TEMP, M_WAITOK | M_ZERO); nd = &tempdata->nd; attr = &tempdata->attr; imgp = &tempdata->image_params; /* * Initialize part of the common data */ imgp->proc = p; imgp->attr = attr; NDINIT(nd, LOOKUP, ISOPEN | FOLLOW | LOCKSHARED | LOCKLEAF, UIO_SYSSPACE, file, curthread); if ((error = namei(nd)) != 0) { nd->ni_vp = NULL; goto fail; } NDFREE(nd, NDF_ONLY_PNBUF); imgp->vp = nd->ni_vp; /* * Check permissions, modes, uid, etc on the file, and "open" it. 
*/ error = exec_check_permissions(imgp); if (error) goto fail; error = exec_map_first_page(imgp); if (error) goto fail; imgp->object = nd->ni_vp->v_object; hdr = (const Elf_Ehdr *)imgp->image_header; if ((error = __elfN(check_header)(hdr)) != 0) goto fail; if (hdr->e_type == ET_DYN) rbase = *addr; else if (hdr->e_type == ET_EXEC) rbase = 0; else { error = ENOEXEC; goto fail; } /* Only support headers that fit within first page for now */ if ((hdr->e_phoff > PAGE_SIZE) || (u_int)hdr->e_phentsize * hdr->e_phnum > PAGE_SIZE - hdr->e_phoff) { error = ENOEXEC; goto fail; } phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff); if (!aligned(phdr, Elf_Addr)) { error = ENOEXEC; goto fail; } error = __elfN(load_sections)(imgp, hdr, phdr, rbase, &base_addr); if (error != 0) goto fail; *addr = base_addr; *entry = (unsigned long)hdr->e_entry + rbase; fail: if (imgp->firstpage) exec_unmap_first_page(imgp); if (nd->ni_vp) { if (imgp->textset) VOP_UNSET_TEXT_CHECKED(nd->ni_vp); vput(nd->ni_vp); } free(tempdata, M_TEMP); return (error); } static u_long __CONCAT(rnd_, __elfN(base))(vm_map_t map __unused, u_long minv, u_long maxv, u_int align) { u_long rbase, res; MPASS(vm_map_min(map) <= minv); MPASS(maxv <= vm_map_max(map)); MPASS(minv < maxv); MPASS(minv + align < maxv); arc4rand(&rbase, sizeof(rbase), 0); res = roundup(minv, (u_long)align) + rbase % (maxv - minv); res &= ~((u_long)align - 1); if (res >= maxv) res -= align; KASSERT(res >= minv, ("res %#lx < minv %#lx, maxv %#lx rbase %#lx", res, minv, maxv, rbase)); KASSERT(res < maxv, ("res %#lx > maxv %#lx, minv %#lx rbase %#lx", res, maxv, minv, rbase)); return (res); } static int __elfN(enforce_limits)(struct image_params *imgp, const Elf_Ehdr *hdr, const Elf_Phdr *phdr, u_long et_dyn_addr) { struct vmspace *vmspace; const char *err_str; u_long text_size, data_size, total_size, text_addr, data_addr; u_long seg_size, seg_addr; int i; err_str = NULL; text_size = data_size = total_size = text_addr = data_addr = 0; for (i = 0; i < hdr->e_phnum; i++) { if (phdr[i].p_type != PT_LOAD || phdr[i].p_memsz == 0) continue; seg_addr = trunc_page(phdr[i].p_vaddr + et_dyn_addr); seg_size = round_page(phdr[i].p_memsz + phdr[i].p_vaddr + et_dyn_addr - seg_addr); /* * Make the largest executable segment the official * text segment and all others data. * * Note that obreak() assumes that data_addr + data_size == end * of data load area, and the ELF file format expects segments * to be sorted by address. If multiple data segments exist, * the last one will be used. */ if ((phdr[i].p_flags & PF_X) != 0 && text_size < seg_size) { text_size = seg_size; text_addr = seg_addr; } else { data_size = seg_size; data_addr = seg_addr; } total_size += seg_size; } if (data_addr == 0 && data_size == 0) { data_addr = text_addr; data_size = text_size; } /* * Check limits. It should be safe to check the * limits after loading the segments since we do * not actually fault in all the segments pages. 
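 * The data size is checked against RLIMIT_DATA, the text size
 * against the system-wide maxtsiz, and the total size against
 * RLIMIT_VMEM as well as the corresponding racct resources.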
*/ PROC_LOCK(imgp->proc); if (data_size > lim_cur_proc(imgp->proc, RLIMIT_DATA)) err_str = "Data segment size exceeds process limit"; else if (text_size > maxtsiz) err_str = "Text segment size exceeds system limit"; else if (total_size > lim_cur_proc(imgp->proc, RLIMIT_VMEM)) err_str = "Total segment size exceeds process limit"; else if (racct_set(imgp->proc, RACCT_DATA, data_size) != 0) err_str = "Data segment size exceeds resource limit"; else if (racct_set(imgp->proc, RACCT_VMEM, total_size) != 0) err_str = "Total segment size exceeds resource limit"; PROC_UNLOCK(imgp->proc); if (err_str != NULL) { uprintf("%s\n", err_str); return (ENOMEM); } vmspace = imgp->proc->p_vmspace; vmspace->vm_tsize = text_size >> PAGE_SHIFT; vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr; vmspace->vm_dsize = data_size >> PAGE_SHIFT; vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr; return (0); } static int __elfN(get_interp)(struct image_params *imgp, const Elf_Phdr *phdr, char **interpp, bool *free_interpp) { struct thread *td; char *interp; int error, interp_name_len; KASSERT(phdr->p_type == PT_INTERP, ("%s: p_type %u != PT_INTERP", __func__, phdr->p_type)); ASSERT_VOP_LOCKED(imgp->vp, __func__); td = curthread; /* Path to interpreter */ if (phdr->p_filesz < 2 || phdr->p_filesz > MAXPATHLEN) { uprintf("Invalid PT_INTERP\n"); return (ENOEXEC); } interp_name_len = phdr->p_filesz; if (phdr->p_offset > PAGE_SIZE || interp_name_len > PAGE_SIZE - phdr->p_offset) { /* * The vnode lock might be needed by the pagedaemon to * clean pages owned by the vnode. Do not allow sleep * waiting for memory with the vnode locked, instead * try non-sleepable allocation first, and if it * fails, go to the slow path were we drop the lock * and do M_WAITOK. A text reference prevents * modifications to the vnode content. 
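 * Concretely: try a non-sleeping allocation while the vnode stays
 * locked; only if that fails, unlock, allocate with M_WAITOK and
 * re-lock the vnode shared before reading the interpreter path.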
*/ interp = malloc(interp_name_len + 1, M_TEMP, M_NOWAIT); if (interp == NULL) { VOP_UNLOCK(imgp->vp, 0); interp = malloc(interp_name_len + 1, M_TEMP, M_WAITOK); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); } error = vn_rdwr(UIO_READ, imgp->vp, interp, interp_name_len, phdr->p_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, NULL, td); if (error != 0) { free(interp, M_TEMP); uprintf("i/o error PT_INTERP %d\n", error); return (error); } interp[interp_name_len] = '\0'; *interpp = interp; *free_interpp = true; return (0); } interp = __DECONST(char *, imgp->image_header) + phdr->p_offset; if (interp[interp_name_len - 1] != '\0') { uprintf("Invalid PT_INTERP\n"); return (ENOEXEC); } *interpp = interp; *free_interpp = false; return (0); } static int __elfN(load_interp)(struct image_params *imgp, const Elf_Brandinfo *brand_info, const char *interp, u_long *addr, u_long *entry) { char *path; int error; if (brand_info->emul_path != NULL && brand_info->emul_path[0] != '\0') { path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); snprintf(path, MAXPATHLEN, "%s%s", brand_info->emul_path, interp); error = __elfN(load_file)(imgp->proc, path, addr, entry); free(path, M_TEMP); if (error == 0) return (0); } if (brand_info->interp_newpath != NULL && (brand_info->interp_path == NULL || strcmp(interp, brand_info->interp_path) == 0)) { error = __elfN(load_file)(imgp->proc, brand_info->interp_newpath, addr, entry); if (error == 0) return (0); } error = __elfN(load_file)(imgp->proc, interp, addr, entry); if (error == 0) return (0); uprintf("ELF interpreter %s not found, error %d\n", interp, error); return (error); } /* * Impossible et_dyn_addr initial value indicating that the real base * must be calculated later with some randomization applied. */ #define ET_DYN_ADDR_RAND 1 static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp) { struct thread *td; const Elf_Ehdr *hdr; const Elf_Phdr *phdr; Elf_Auxargs *elf_auxargs; struct vmspace *vmspace; vm_map_t map; char *interp; Elf_Brandinfo *brand_info; struct sysentvec *sv; u_long addr, baddr, et_dyn_addr, entry, proghdr; u_long maxalign, mapsz, maxv, maxv1; uint32_t fctl0; int32_t osrel; bool free_interp; int error, i, n; hdr = (const Elf_Ehdr *)imgp->image_header; /* * Do we have a valid ELF header ? * * Only allow ET_EXEC & ET_DYN here, reject ET_DYN later * if particular brand doesn't support it. */ if (__elfN(check_header)(hdr) != 0 || (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN)) return (-1); /* * From here on down, we return an errno, not -1, as we've * detected an ELF file. */ if ((hdr->e_phoff > PAGE_SIZE) || (u_int)hdr->e_phentsize * hdr->e_phnum > PAGE_SIZE - hdr->e_phoff) { /* Only support headers in first page for now */ uprintf("Program headers not in the first page\n"); return (ENOEXEC); } phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff); if (!aligned(phdr, Elf_Addr)) { uprintf("Unaligned program headers\n"); return (ENOEXEC); } n = error = 0; baddr = 0; osrel = 0; fctl0 = 0; entry = proghdr = 0; interp = NULL; free_interp = false; td = curthread; maxalign = PAGE_SIZE; mapsz = 0; for (i = 0; i < hdr->e_phnum; i++) { switch (phdr[i].p_type) { case PT_LOAD: if (n == 0) baddr = phdr[i].p_vaddr; if (phdr[i].p_align > maxalign) maxalign = phdr[i].p_align; mapsz += phdr[i].p_memsz; n++; /* * If this segment contains the program headers, * remember their virtual address for the AT_PHDR * aux entry. Static binaries don't usually include * a PT_PHDR entry. 
*/ if (phdr[i].p_offset == 0 && hdr->e_phoff + hdr->e_phnum * hdr->e_phentsize <= phdr[i].p_filesz) proghdr = phdr[i].p_vaddr + hdr->e_phoff; break; case PT_INTERP: /* Path to interpreter */ if (interp != NULL) { uprintf("Multiple PT_INTERP headers\n"); error = ENOEXEC; goto ret; } error = __elfN(get_interp)(imgp, &phdr[i], &interp, &free_interp); if (error != 0) goto ret; break; case PT_GNU_STACK: if (__elfN(nxstack)) imgp->stack_prot = __elfN(trans_prot)(phdr[i].p_flags); imgp->stack_sz = phdr[i].p_memsz; break; case PT_PHDR: /* Program header table info */ proghdr = phdr[i].p_vaddr; break; } } brand_info = __elfN(get_brandinfo)(imgp, interp, &osrel, &fctl0); if (brand_info == NULL) { uprintf("ELF binary type \"%u\" not known.\n", hdr->e_ident[EI_OSABI]); error = ENOEXEC; goto ret; } sv = brand_info->sysvec; et_dyn_addr = 0; if (hdr->e_type == ET_DYN) { if ((brand_info->flags & BI_CAN_EXEC_DYN) == 0) { uprintf("Cannot execute shared object\n"); error = ENOEXEC; goto ret; } /* * Honour the base load address from the dso if it is * non-zero for some reason. */ if (baddr == 0) { if ((sv->sv_flags & SV_ASLR) == 0 || (fctl0 & NT_FREEBSD_FCTL_ASLR_DISABLE) != 0) - et_dyn_addr = ET_DYN_LOAD_ADDR; + et_dyn_addr = __elfN(pie_base); else if ((__elfN(pie_aslr_enabled) && (imgp->proc->p_flag2 & P2_ASLR_DISABLE) == 0) || (imgp->proc->p_flag2 & P2_ASLR_ENABLE) != 0) et_dyn_addr = ET_DYN_ADDR_RAND; else - et_dyn_addr = ET_DYN_LOAD_ADDR; + et_dyn_addr = __elfN(pie_base); } } /* * Avoid a possible deadlock if the current address space is destroyed * and that address space maps the locked vnode. In the common case, * the locked vnode's v_usecount is decremented but remains greater * than zero. Consequently, the vnode lock is not needed by vrele(). * However, in cases where the vnode lock is external, such as nullfs, * v_usecount may become zero. * * The VV_TEXT flag prevents modifications to the executable while * the vnode is unlocked. */ VOP_UNLOCK(imgp->vp, 0); /* * Decide whether to enable randomization of user mappings. * First, reset user preferences for the setid binaries. * Then, account for the support of the randomization by the * ABI, by user preferences, and make special treatment for * PIE binaries. */ if (imgp->credential_setid) { PROC_LOCK(imgp->proc); imgp->proc->p_flag2 &= ~(P2_ASLR_ENABLE | P2_ASLR_DISABLE); PROC_UNLOCK(imgp->proc); } if ((sv->sv_flags & SV_ASLR) == 0 || (imgp->proc->p_flag2 & P2_ASLR_DISABLE) != 0 || (fctl0 & NT_FREEBSD_FCTL_ASLR_DISABLE) != 0) { KASSERT(et_dyn_addr != ET_DYN_ADDR_RAND, ("et_dyn_addr == RAND and !ASLR")); } else if ((imgp->proc->p_flag2 & P2_ASLR_ENABLE) != 0 || (__elfN(aslr_enabled) && hdr->e_type == ET_EXEC) || et_dyn_addr == ET_DYN_ADDR_RAND) { imgp->map_flags |= MAP_ASLR; /* * If user does not care about sbrk, utilize the bss * grow region for mappings as well. We can select * the base for the image anywere and still not suffer * from the fragmentation. 
*/ if (!__elfN(aslr_honor_sbrk) || (imgp->proc->p_flag2 & P2_ASLR_IGNSTART) != 0) imgp->map_flags |= MAP_ASLR_IGNSTART; } error = exec_new_vmspace(imgp, sv); vmspace = imgp->proc->p_vmspace; map = &vmspace->vm_map; imgp->proc->p_sysent = sv; maxv = vm_map_max(map) - lim_max(td, RLIMIT_STACK); if (et_dyn_addr == ET_DYN_ADDR_RAND) { KASSERT((map->flags & MAP_ASLR) != 0, ("ET_DYN_ADDR_RAND but !MAP_ASLR")); et_dyn_addr = __CONCAT(rnd_, __elfN(base))(map, vm_map_min(map) + mapsz + lim_max(td, RLIMIT_DATA), /* reserve half of the address space to interpreter */ maxv / 2, 1UL << flsl(maxalign)); } vn_lock(imgp->vp, LK_SHARED | LK_RETRY); if (error != 0) goto ret; error = __elfN(load_sections)(imgp, hdr, phdr, et_dyn_addr, NULL); if (error != 0) goto ret; error = __elfN(enforce_limits)(imgp, hdr, phdr, et_dyn_addr); if (error != 0) goto ret; entry = (u_long)hdr->e_entry + et_dyn_addr; /* * We load the dynamic linker where a userland call * to mmap(0, ...) would put it. The rationale behind this * calculation is that it leaves room for the heap to grow to * its maximum allowed size. */ addr = round_page((vm_offset_t)vmspace->vm_daddr + lim_max(td, RLIMIT_DATA)); if ((map->flags & MAP_ASLR) != 0) { maxv1 = maxv / 2 + addr / 2; MPASS(maxv1 >= addr); /* No overflow */ map->anon_loc = __CONCAT(rnd_, __elfN(base))(map, addr, maxv1, MAXPAGESIZES > 1 ? pagesizes[1] : pagesizes[0]); } else { map->anon_loc = addr; } imgp->entry_addr = entry; if (interp != NULL) { VOP_UNLOCK(imgp->vp, 0); if ((map->flags & MAP_ASLR) != 0) { /* Assume that interpeter fits into 1/4 of AS */ maxv1 = maxv / 2 + addr / 2; MPASS(maxv1 >= addr); /* No overflow */ addr = __CONCAT(rnd_, __elfN(base))(map, addr, maxv1, PAGE_SIZE); } error = __elfN(load_interp)(imgp, brand_info, interp, &addr, &imgp->entry_addr); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); if (error != 0) goto ret; } else addr = et_dyn_addr; /* * Construct auxargs table (used by the fixup routine) */ elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_NOWAIT); if (elf_auxargs == NULL) { VOP_UNLOCK(imgp->vp, 0); elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); } elf_auxargs->execfd = -1; elf_auxargs->phdr = proghdr + et_dyn_addr; elf_auxargs->phent = hdr->e_phentsize; elf_auxargs->phnum = hdr->e_phnum; elf_auxargs->pagesz = PAGE_SIZE; elf_auxargs->base = addr; elf_auxargs->flags = 0; elf_auxargs->entry = entry; elf_auxargs->hdr_eflags = hdr->e_flags; imgp->auxargs = elf_auxargs; imgp->interpreted = 0; imgp->reloc_base = addr; imgp->proc->p_osrel = osrel; imgp->proc->p_fctl0 = fctl0; imgp->proc->p_elf_machine = hdr->e_machine; imgp->proc->p_elf_flags = hdr->e_flags; ret: if (free_interp) free(interp, M_TEMP); return (error); } #define suword __CONCAT(suword, __ELF_WORD_SIZE) int __elfN(freebsd_fixup)(register_t **stack_base, struct image_params *imgp) { Elf_Auxargs *args = (Elf_Auxargs *)imgp->auxargs; Elf_Auxinfo *argarray, *pos; Elf_Addr *base, *auxbase; int error; base = (Elf_Addr *)*stack_base; auxbase = base + imgp->args->argc + 1 + imgp->args->envc + 1; argarray = pos = malloc(AT_COUNT * sizeof(*pos), M_TEMP, M_WAITOK | M_ZERO); if (args->execfd != -1) AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY(pos, AT_PHENT, args->phent); AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY(pos, AT_BASE, args->base); AUXARGS_ENTRY(pos, 
AT_EHDRFLAGS, args->hdr_eflags); if (imgp->execpathp != 0) AUXARGS_ENTRY(pos, AT_EXECPATH, imgp->execpathp); AUXARGS_ENTRY(pos, AT_OSRELDATE, imgp->proc->p_ucred->cr_prison->pr_osreldate); if (imgp->canary != 0) { AUXARGS_ENTRY(pos, AT_CANARY, imgp->canary); AUXARGS_ENTRY(pos, AT_CANARYLEN, imgp->canarylen); } AUXARGS_ENTRY(pos, AT_NCPUS, mp_ncpus); if (imgp->pagesizes != 0) { AUXARGS_ENTRY(pos, AT_PAGESIZES, imgp->pagesizes); AUXARGS_ENTRY(pos, AT_PAGESIZESLEN, imgp->pagesizeslen); } if (imgp->sysent->sv_timekeep_base != 0) { AUXARGS_ENTRY(pos, AT_TIMEKEEP, imgp->sysent->sv_timekeep_base); } AUXARGS_ENTRY(pos, AT_STACKPROT, imgp->sysent->sv_shared_page_obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot : imgp->sysent->sv_stackprot); if (imgp->sysent->sv_hwcap != NULL) AUXARGS_ENTRY(pos, AT_HWCAP, *imgp->sysent->sv_hwcap); if (imgp->sysent->sv_hwcap2 != NULL) AUXARGS_ENTRY(pos, AT_HWCAP2, *imgp->sysent->sv_hwcap2); AUXARGS_ENTRY(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; KASSERT(pos - argarray <= AT_COUNT, ("Too many auxargs")); error = copyout(argarray, auxbase, sizeof(*argarray) * AT_COUNT); free(argarray, M_TEMP); if (error != 0) return (error); base--; if (suword(base, imgp->args->argc) == -1) return (EFAULT); *stack_base = (register_t *)base; return (0); } /* * Code for generating ELF core dumps. */ typedef void (*segment_callback)(vm_map_entry_t, void *); /* Closure for cb_put_phdr(). */ struct phdr_closure { Elf_Phdr *phdr; /* Program header to fill in */ Elf_Off offset; /* Offset of segment in core file */ }; /* Closure for cb_size_segment(). */ struct sseg_closure { int count; /* Count of writable segments. */ size_t size; /* Total size of all writable segments. */ }; typedef void (*outfunc_t)(void *, struct sbuf *, size_t *); struct note_info { int type; /* Note type. */ outfunc_t outfunc; /* Output function. */ void *outarg; /* Argument for the output function. */ size_t outsize; /* Output size. */ TAILQ_ENTRY(note_info) link; /* Link to the next note info. */ }; TAILQ_HEAD(note_info_list, note_info); /* Coredump output parameters. 
*/ struct coredump_params { off_t offset; struct ucred *active_cred; struct ucred *file_cred; struct thread *td; struct vnode *vp; struct compressor *comp; }; extern int compress_user_cores; extern int compress_user_cores_level; static void cb_put_phdr(vm_map_entry_t, void *); static void cb_size_segment(vm_map_entry_t, void *); static int core_write(struct coredump_params *, const void *, size_t, off_t, enum uio_seg); static void each_dumpable_segment(struct thread *, segment_callback, void *); static int __elfN(corehdr)(struct coredump_params *, int, void *, size_t, struct note_info_list *, size_t); static void __elfN(prepare_notes)(struct thread *, struct note_info_list *, size_t *); static void __elfN(puthdr)(struct thread *, void *, size_t, int, size_t); static void __elfN(putnote)(struct note_info *, struct sbuf *); static size_t register_note(struct note_info_list *, int, outfunc_t, void *); static int sbuf_drain_core_output(void *, const char *, int); static void __elfN(note_fpregset)(void *, struct sbuf *, size_t *); static void __elfN(note_prpsinfo)(void *, struct sbuf *, size_t *); static void __elfN(note_prstatus)(void *, struct sbuf *, size_t *); static void __elfN(note_threadmd)(void *, struct sbuf *, size_t *); static void __elfN(note_thrmisc)(void *, struct sbuf *, size_t *); static void __elfN(note_ptlwpinfo)(void *, struct sbuf *, size_t *); static void __elfN(note_procstat_auxv)(void *, struct sbuf *, size_t *); static void __elfN(note_procstat_proc)(void *, struct sbuf *, size_t *); static void __elfN(note_procstat_psstrings)(void *, struct sbuf *, size_t *); static void note_procstat_files(void *, struct sbuf *, size_t *); static void note_procstat_groups(void *, struct sbuf *, size_t *); static void note_procstat_osrel(void *, struct sbuf *, size_t *); static void note_procstat_rlimit(void *, struct sbuf *, size_t *); static void note_procstat_umask(void *, struct sbuf *, size_t *); static void note_procstat_vmmap(void *, struct sbuf *, size_t *); /* * Write out a core segment to the compression stream. */ static int compress_chunk(struct coredump_params *p, char *base, char *buf, u_int len) { u_int chunk_len; int error; while (len > 0) { chunk_len = MIN(len, CORE_BUF_SIZE); /* * We can get EFAULT error here. * In that case zero out the current chunk of the segment. */ error = copyin(base, buf, chunk_len); if (error != 0) bzero(buf, chunk_len); error = compressor_write(p->comp, buf, chunk_len); if (error != 0) break; base += chunk_len; len -= chunk_len; } return (error); } static int core_compressed_write(void *base, size_t len, off_t offset, void *arg) { return (core_write((struct coredump_params *)arg, base, len, offset, UIO_SYSSPACE)); } static int core_write(struct coredump_params *p, const void *base, size_t len, off_t offset, enum uio_seg seg) { return (vn_rdwr_inchunks(UIO_WRITE, p->vp, __DECONST(void *, base), len, offset, seg, IO_UNIT | IO_DIRECT | IO_RANGELOCKED, p->active_cred, p->file_cred, NULL, p->td)); } static int core_output(void *base, size_t len, off_t offset, struct coredump_params *p, void *tmpbuf) { int error; if (p->comp != NULL) return (compress_chunk(p, base, tmpbuf, len)); /* * EFAULT is a non-fatal error that we can get, for example, * if the segment is backed by a file but extends beyond its * end. 
*/ error = core_write(p, base, len, offset, UIO_USERSPACE); if (error == EFAULT) { log(LOG_WARNING, "Failed to fully fault in a core file segment " "at VA %p with size 0x%zx to be written at offset 0x%jx " "for process %s\n", base, len, offset, curproc->p_comm); /* * Write a "real" zero byte at the end of the target region * in the case this is the last segment. * The intermediate space will be implicitly zero-filled. */ error = core_write(p, zero_region, 1, offset + len - 1, UIO_SYSSPACE); } return (error); } /* * Drain into a core file. */ static int sbuf_drain_core_output(void *arg, const char *data, int len) { struct coredump_params *p; int error, locked; p = (struct coredump_params *)arg; /* * Some kern_proc out routines that print to this sbuf may * call us with the process lock held. Draining with the * non-sleepable lock held is unsafe. The lock is needed for * those routines when dumping a live process. In our case we * can safely release the lock before draining and acquire * again after. */ locked = PROC_LOCKED(p->td->td_proc); if (locked) PROC_UNLOCK(p->td->td_proc); if (p->comp != NULL) error = compressor_write(p->comp, __DECONST(char *, data), len); else error = core_write(p, __DECONST(void *, data), len, p->offset, UIO_SYSSPACE); if (locked) PROC_LOCK(p->td->td_proc); if (error != 0) return (-error); p->offset += len; return (len); } int __elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags) { struct ucred *cred = td->td_ucred; int error = 0; struct sseg_closure seginfo; struct note_info_list notelst; struct coredump_params params; struct note_info *ninfo; void *hdr, *tmpbuf; size_t hdrsize, notesz, coresize; hdr = NULL; tmpbuf = NULL; TAILQ_INIT(¬elst); /* Size the program segments. */ seginfo.count = 0; seginfo.size = 0; each_dumpable_segment(td, cb_size_segment, &seginfo); /* * Collect info about the core file header area. */ hdrsize = sizeof(Elf_Ehdr) + sizeof(Elf_Phdr) * (1 + seginfo.count); if (seginfo.count + 1 >= PN_XNUM) hdrsize += sizeof(Elf_Shdr); __elfN(prepare_notes)(td, ¬elst, ¬esz); coresize = round_page(hdrsize + notesz) + seginfo.size; /* Set up core dump parameters. */ params.offset = 0; params.active_cred = cred; params.file_cred = NOCRED; params.td = td; params.vp = vp; params.comp = NULL; #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); error = racct_add(td->td_proc, RACCT_CORE, coresize); PROC_UNLOCK(td->td_proc); if (error != 0) { error = EFAULT; goto done; } } #endif if (coresize >= limit) { error = EFAULT; goto done; } /* Create a compression stream if necessary. */ if (compress_user_cores != 0) { params.comp = compressor_init(core_compressed_write, compress_user_cores, CORE_BUF_SIZE, compress_user_cores_level, ¶ms); if (params.comp == NULL) { error = EFAULT; goto done; } tmpbuf = malloc(CORE_BUF_SIZE, M_TEMP, M_WAITOK | M_ZERO); } /* * Allocate memory for building the header, fill it up, * and write it out following the notes. */ hdr = malloc(hdrsize, M_TEMP, M_WAITOK); error = __elfN(corehdr)(¶ms, seginfo.count, hdr, hdrsize, ¬elst, notesz); /* Write the contents of all of the writable segments. 
*/ if (error == 0) { Elf_Phdr *php; off_t offset; int i; php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1; offset = round_page(hdrsize + notesz); for (i = 0; i < seginfo.count; i++) { error = core_output((caddr_t)(uintptr_t)php->p_vaddr, php->p_filesz, offset, ¶ms, tmpbuf); if (error != 0) break; offset += php->p_filesz; php++; } if (error == 0 && params.comp != NULL) error = compressor_flush(params.comp); } if (error) { log(LOG_WARNING, "Failed to write core file for process %s (error %d)\n", curproc->p_comm, error); } done: free(tmpbuf, M_TEMP); if (params.comp != NULL) compressor_fini(params.comp); while ((ninfo = TAILQ_FIRST(¬elst)) != NULL) { TAILQ_REMOVE(¬elst, ninfo, link); free(ninfo, M_TEMP); } if (hdr != NULL) free(hdr, M_TEMP); return (error); } /* * A callback for each_dumpable_segment() to write out the segment's * program header entry. */ static void cb_put_phdr(vm_map_entry_t entry, void *closure) { struct phdr_closure *phc = (struct phdr_closure *)closure; Elf_Phdr *phdr = phc->phdr; phc->offset = round_page(phc->offset); phdr->p_type = PT_LOAD; phdr->p_offset = phc->offset; phdr->p_vaddr = entry->start; phdr->p_paddr = 0; phdr->p_filesz = phdr->p_memsz = entry->end - entry->start; phdr->p_align = PAGE_SIZE; phdr->p_flags = __elfN(untrans_prot)(entry->protection); phc->offset += phdr->p_filesz; phc->phdr++; } /* * A callback for each_dumpable_segment() to gather information about * the number of segments and their total size. */ static void cb_size_segment(vm_map_entry_t entry, void *closure) { struct sseg_closure *ssc = (struct sseg_closure *)closure; ssc->count++; ssc->size += entry->end - entry->start; } /* * For each writable segment in the process's memory map, call the given * function with a pointer to the map entry and some arbitrary * caller-supplied data. */ static void each_dumpable_segment(struct thread *td, segment_callback func, void *closure) { struct proc *p = td->td_proc; vm_map_t map = &p->p_vmspace->vm_map; vm_map_entry_t entry; vm_object_t backing_object, object; boolean_t ignore_entry; vm_map_lock_read(map); for (entry = map->header.next; entry != &map->header; entry = entry->next) { /* * Don't dump inaccessible mappings, deal with legacy * coredump mode. * * Note that read-only segments related to the elf binary * are marked MAP_ENTRY_NOCOREDUMP now so we no longer * need to arbitrarily ignore such segments. */ if (elf_legacy_coredump) { if ((entry->protection & VM_PROT_RW) != VM_PROT_RW) continue; } else { if ((entry->protection & VM_PROT_ALL) == 0) continue; } /* * Dont include memory segment in the coredump if * MAP_NOCORE is set in mmap(2) or MADV_NOCORE in * madvise(2). Do not dump submaps (i.e. parts of the * kernel map). */ if (entry->eflags & (MAP_ENTRY_NOCOREDUMP|MAP_ENTRY_IS_SUB_MAP)) continue; if ((object = entry->object.vm_object) == NULL) continue; /* Ignore memory-mapped devices and such things. */ VM_OBJECT_RLOCK(object); while ((backing_object = object->backing_object) != NULL) { VM_OBJECT_RLOCK(backing_object); VM_OBJECT_RUNLOCK(object); object = backing_object; } ignore_entry = object->type != OBJT_DEFAULT && object->type != OBJT_SWAP && object->type != OBJT_VNODE && object->type != OBJT_PHYS; VM_OBJECT_RUNLOCK(object); if (ignore_entry) continue; (*func)(entry, closure); } vm_map_unlock_read(map); } /* * Write the core file header to the file, including padding up to * the page boundary. 
*/ static int __elfN(corehdr)(struct coredump_params *p, int numsegs, void *hdr, size_t hdrsize, struct note_info_list *notelst, size_t notesz) { struct note_info *ninfo; struct sbuf *sb; int error; /* Fill in the header. */ bzero(hdr, hdrsize); __elfN(puthdr)(p->td, hdr, hdrsize, numsegs, notesz); sb = sbuf_new(NULL, NULL, CORE_BUF_SIZE, SBUF_FIXEDLEN); sbuf_set_drain(sb, sbuf_drain_core_output, p); sbuf_start_section(sb, NULL); sbuf_bcat(sb, hdr, hdrsize); TAILQ_FOREACH(ninfo, notelst, link) __elfN(putnote)(ninfo, sb); /* Align up to a page boundary for the program segments. */ sbuf_end_section(sb, -1, PAGE_SIZE, 0); error = sbuf_finish(sb); sbuf_delete(sb); return (error); } static void __elfN(prepare_notes)(struct thread *td, struct note_info_list *list, size_t *sizep) { struct proc *p; struct thread *thr; size_t size; p = td->td_proc; size = 0; size += register_note(list, NT_PRPSINFO, __elfN(note_prpsinfo), p); /* * To have the debugger select the right thread (LWP) as the initial * thread, we dump the state of the thread passed to us in td first. * This is the thread that causes the core dump and thus likely to * be the right thread one wants to have selected in the debugger. */ thr = td; while (thr != NULL) { size += register_note(list, NT_PRSTATUS, __elfN(note_prstatus), thr); size += register_note(list, NT_FPREGSET, __elfN(note_fpregset), thr); size += register_note(list, NT_THRMISC, __elfN(note_thrmisc), thr); size += register_note(list, NT_PTLWPINFO, __elfN(note_ptlwpinfo), thr); size += register_note(list, -1, __elfN(note_threadmd), thr); thr = (thr == td) ? TAILQ_FIRST(&p->p_threads) : TAILQ_NEXT(thr, td_plist); if (thr == td) thr = TAILQ_NEXT(thr, td_plist); } size += register_note(list, NT_PROCSTAT_PROC, __elfN(note_procstat_proc), p); size += register_note(list, NT_PROCSTAT_FILES, note_procstat_files, p); size += register_note(list, NT_PROCSTAT_VMMAP, note_procstat_vmmap, p); size += register_note(list, NT_PROCSTAT_GROUPS, note_procstat_groups, p); size += register_note(list, NT_PROCSTAT_UMASK, note_procstat_umask, p); size += register_note(list, NT_PROCSTAT_RLIMIT, note_procstat_rlimit, p); size += register_note(list, NT_PROCSTAT_OSREL, note_procstat_osrel, p); size += register_note(list, NT_PROCSTAT_PSSTRINGS, __elfN(note_procstat_psstrings), p); size += register_note(list, NT_PROCSTAT_AUXV, __elfN(note_procstat_auxv), p); *sizep = size; } static void __elfN(puthdr)(struct thread *td, void *hdr, size_t hdrsize, int numsegs, size_t notesz) { Elf_Ehdr *ehdr; Elf_Phdr *phdr; Elf_Shdr *shdr; struct phdr_closure phc; ehdr = (Elf_Ehdr *)hdr; ehdr->e_ident[EI_MAG0] = ELFMAG0; ehdr->e_ident[EI_MAG1] = ELFMAG1; ehdr->e_ident[EI_MAG2] = ELFMAG2; ehdr->e_ident[EI_MAG3] = ELFMAG3; ehdr->e_ident[EI_CLASS] = ELF_CLASS; ehdr->e_ident[EI_DATA] = ELF_DATA; ehdr->e_ident[EI_VERSION] = EV_CURRENT; ehdr->e_ident[EI_OSABI] = ELFOSABI_FREEBSD; ehdr->e_ident[EI_ABIVERSION] = 0; ehdr->e_ident[EI_PAD] = 0; ehdr->e_type = ET_CORE; ehdr->e_machine = td->td_proc->p_elf_machine; ehdr->e_version = EV_CURRENT; ehdr->e_entry = 0; ehdr->e_phoff = sizeof(Elf_Ehdr); ehdr->e_flags = td->td_proc->p_elf_flags; ehdr->e_ehsize = sizeof(Elf_Ehdr); ehdr->e_phentsize = sizeof(Elf_Phdr); ehdr->e_shentsize = sizeof(Elf_Shdr); ehdr->e_shstrndx = SHN_UNDEF; if (numsegs + 1 < PN_XNUM) { ehdr->e_phnum = numsegs + 1; ehdr->e_shnum = 0; } else { ehdr->e_phnum = PN_XNUM; ehdr->e_shnum = 1; ehdr->e_shoff = ehdr->e_phoff + (numsegs + 1) * ehdr->e_phentsize; KASSERT(ehdr->e_shoff == hdrsize - sizeof(Elf_Shdr), ("e_shoff: %zu, 
hdrsize - shdr: %zu", (size_t)ehdr->e_shoff, hdrsize - sizeof(Elf_Shdr))); shdr = (Elf_Shdr *)((char *)hdr + ehdr->e_shoff); memset(shdr, 0, sizeof(*shdr)); /* * A special first section is used to hold large segment and * section counts. This was proposed by Sun Microsystems in * Solaris and has been adopted by Linux; the standard ELF * tools are already familiar with the technique. * * See table 7-7 of the Solaris "Linker and Libraries Guide" * (or 12-7 depending on the version of the document) for more * details. */ shdr->sh_type = SHT_NULL; shdr->sh_size = ehdr->e_shnum; shdr->sh_link = ehdr->e_shstrndx; shdr->sh_info = numsegs + 1; } /* * Fill in the program header entries. */ phdr = (Elf_Phdr *)((char *)hdr + ehdr->e_phoff); /* The note segement. */ phdr->p_type = PT_NOTE; phdr->p_offset = hdrsize; phdr->p_vaddr = 0; phdr->p_paddr = 0; phdr->p_filesz = notesz; phdr->p_memsz = 0; phdr->p_flags = PF_R; phdr->p_align = ELF_NOTE_ROUNDSIZE; phdr++; /* All the writable segments from the program. */ phc.phdr = phdr; phc.offset = round_page(hdrsize + notesz); each_dumpable_segment(td, cb_put_phdr, &phc); } static size_t register_note(struct note_info_list *list, int type, outfunc_t out, void *arg) { struct note_info *ninfo; size_t size, notesize; size = 0; out(arg, NULL, &size); ninfo = malloc(sizeof(*ninfo), M_TEMP, M_ZERO | M_WAITOK); ninfo->type = type; ninfo->outfunc = out; ninfo->outarg = arg; ninfo->outsize = size; TAILQ_INSERT_TAIL(list, ninfo, link); if (type == -1) return (size); notesize = sizeof(Elf_Note) + /* note header */ roundup2(sizeof(FREEBSD_ABI_VENDOR), ELF_NOTE_ROUNDSIZE) + /* note name */ roundup2(size, ELF_NOTE_ROUNDSIZE); /* note description */ return (notesize); } static size_t append_note_data(const void *src, void *dst, size_t len) { size_t padded_len; padded_len = roundup2(len, ELF_NOTE_ROUNDSIZE); if (dst != NULL) { bcopy(src, dst, len); bzero((char *)dst + len, padded_len - len); } return (padded_len); } size_t __elfN(populate_note)(int type, void *src, void *dst, size_t size, void **descp) { Elf_Note *note; char *buf; size_t notesize; buf = dst; if (buf != NULL) { note = (Elf_Note *)buf; note->n_namesz = sizeof(FREEBSD_ABI_VENDOR); note->n_descsz = size; note->n_type = type; buf += sizeof(*note); buf += append_note_data(FREEBSD_ABI_VENDOR, buf, sizeof(FREEBSD_ABI_VENDOR)); append_note_data(src, buf, size); if (descp != NULL) *descp = buf; } notesize = sizeof(Elf_Note) + /* note header */ roundup2(sizeof(FREEBSD_ABI_VENDOR), ELF_NOTE_ROUNDSIZE) + /* note name */ roundup2(size, ELF_NOTE_ROUNDSIZE); /* note description */ return (notesize); } static void __elfN(putnote)(struct note_info *ninfo, struct sbuf *sb) { Elf_Note note; ssize_t old_len, sect_len; size_t new_len, descsz, i; if (ninfo->type == -1) { ninfo->outfunc(ninfo->outarg, sb, &ninfo->outsize); return; } note.n_namesz = sizeof(FREEBSD_ABI_VENDOR); note.n_descsz = ninfo->outsize; note.n_type = ninfo->type; sbuf_bcat(sb, ¬e, sizeof(note)); sbuf_start_section(sb, &old_len); sbuf_bcat(sb, FREEBSD_ABI_VENDOR, sizeof(FREEBSD_ABI_VENDOR)); sbuf_end_section(sb, old_len, ELF_NOTE_ROUNDSIZE, 0); if (note.n_descsz == 0) return; sbuf_start_section(sb, &old_len); ninfo->outfunc(ninfo->outarg, sb, &ninfo->outsize); sect_len = sbuf_end_section(sb, old_len, ELF_NOTE_ROUNDSIZE, 0); if (sect_len < 0) return; new_len = (size_t)sect_len; descsz = roundup(note.n_descsz, ELF_NOTE_ROUNDSIZE); if (new_len < descsz) { /* * It is expected that individual note emitters will correctly * predict their expected output size and fill 
up to that size * themselves, padding in a format-specific way if needed. * However, in case they don't, just do it here with zeros. */ for (i = 0; i < descsz - new_len; i++) sbuf_putc(sb, 0); } else if (new_len > descsz) { /* * We can't always truncate sb -- we may have drained some * of it already. */ KASSERT(new_len == descsz, ("%s: Note type %u changed as we " "read it (%zu > %zu). Since it is longer than " "expected, this coredump's notes are corrupt. THIS " "IS A BUG in the note_procstat routine for type %u.\n", __func__, (unsigned)note.n_type, new_len, descsz, (unsigned)note.n_type)); } } /* * Miscellaneous note out functions. */ #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 #include #include typedef struct prstatus32 elf_prstatus_t; typedef struct prpsinfo32 elf_prpsinfo_t; typedef struct fpreg32 elf_prfpregset_t; typedef struct fpreg32 elf_fpregset_t; typedef struct reg32 elf_gregset_t; typedef struct thrmisc32 elf_thrmisc_t; #define ELF_KERN_PROC_MASK KERN_PROC_MASK32 typedef struct kinfo_proc32 elf_kinfo_proc_t; typedef uint32_t elf_ps_strings_t; #else typedef prstatus_t elf_prstatus_t; typedef prpsinfo_t elf_prpsinfo_t; typedef prfpregset_t elf_prfpregset_t; typedef prfpregset_t elf_fpregset_t; typedef gregset_t elf_gregset_t; typedef thrmisc_t elf_thrmisc_t; #define ELF_KERN_PROC_MASK 0 typedef struct kinfo_proc elf_kinfo_proc_t; typedef vm_offset_t elf_ps_strings_t; #endif static void __elfN(note_prpsinfo)(void *arg, struct sbuf *sb, size_t *sizep) { struct sbuf sbarg; size_t len; char *cp, *end; struct proc *p; elf_prpsinfo_t *psinfo; int error; p = (struct proc *)arg; if (sb != NULL) { KASSERT(*sizep == sizeof(*psinfo), ("invalid size")); psinfo = malloc(sizeof(*psinfo), M_TEMP, M_ZERO | M_WAITOK); psinfo->pr_version = PRPSINFO_VERSION; psinfo->pr_psinfosz = sizeof(elf_prpsinfo_t); strlcpy(psinfo->pr_fname, p->p_comm, sizeof(psinfo->pr_fname)); PROC_LOCK(p); if (p->p_args != NULL) { len = sizeof(psinfo->pr_psargs) - 1; if (len > p->p_args->ar_length) len = p->p_args->ar_length; memcpy(psinfo->pr_psargs, p->p_args->ar_args, len); PROC_UNLOCK(p); error = 0; } else { _PHOLD(p); PROC_UNLOCK(p); sbuf_new(&sbarg, psinfo->pr_psargs, sizeof(psinfo->pr_psargs), SBUF_FIXEDLEN); error = proc_getargv(curthread, p, &sbarg); PRELE(p); if (sbuf_finish(&sbarg) == 0) len = sbuf_len(&sbarg) - 1; else len = sizeof(psinfo->pr_psargs) - 1; sbuf_delete(&sbarg); } if (error || len == 0) strlcpy(psinfo->pr_psargs, p->p_comm, sizeof(psinfo->pr_psargs)); else { KASSERT(len < sizeof(psinfo->pr_psargs), ("len is too long: %zu vs %zu", len, sizeof(psinfo->pr_psargs))); cp = psinfo->pr_psargs; end = cp + len - 1; for (;;) { cp = memchr(cp, '\0', end - cp); if (cp == NULL) break; *cp = ' '; } } psinfo->pr_pid = p->p_pid; sbuf_bcat(sb, psinfo, sizeof(*psinfo)); free(psinfo, M_TEMP); } *sizep = sizeof(*psinfo); } static void __elfN(note_prstatus)(void *arg, struct sbuf *sb, size_t *sizep) { struct thread *td; elf_prstatus_t *status; td = (struct thread *)arg; if (sb != NULL) { KASSERT(*sizep == sizeof(*status), ("invalid size")); status = malloc(sizeof(*status), M_TEMP, M_ZERO | M_WAITOK); status->pr_version = PRSTATUS_VERSION; status->pr_statussz = sizeof(elf_prstatus_t); status->pr_gregsetsz = sizeof(elf_gregset_t); status->pr_fpregsetsz = sizeof(elf_fpregset_t); status->pr_osreldate = osreldate; status->pr_cursig = td->td_proc->p_sig; status->pr_pid = td->td_tid; #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 fill_regs32(td, &status->pr_reg); #else fill_regs(td, &status->pr_reg); #endif 
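	/*
	 * Registers are now filled in; append the completed prstatus
	 * payload to the core file sbuf and release the temporary buffer.
	 * When called without an sbuf (the sizing pass), only the trailing
	 * *sizep assignment runs.
	 */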
sbuf_bcat(sb, status, sizeof(*status)); free(status, M_TEMP); } *sizep = sizeof(*status); } static void __elfN(note_fpregset)(void *arg, struct sbuf *sb, size_t *sizep) { struct thread *td; elf_prfpregset_t *fpregset; td = (struct thread *)arg; if (sb != NULL) { KASSERT(*sizep == sizeof(*fpregset), ("invalid size")); fpregset = malloc(sizeof(*fpregset), M_TEMP, M_ZERO | M_WAITOK); #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 fill_fpregs32(td, fpregset); #else fill_fpregs(td, fpregset); #endif sbuf_bcat(sb, fpregset, sizeof(*fpregset)); free(fpregset, M_TEMP); } *sizep = sizeof(*fpregset); } static void __elfN(note_thrmisc)(void *arg, struct sbuf *sb, size_t *sizep) { struct thread *td; elf_thrmisc_t thrmisc; td = (struct thread *)arg; if (sb != NULL) { KASSERT(*sizep == sizeof(thrmisc), ("invalid size")); bzero(&thrmisc._pad, sizeof(thrmisc._pad)); strcpy(thrmisc.pr_tname, td->td_name); sbuf_bcat(sb, &thrmisc, sizeof(thrmisc)); } *sizep = sizeof(thrmisc); } static void __elfN(note_ptlwpinfo)(void *arg, struct sbuf *sb, size_t *sizep) { struct thread *td; size_t size; int structsize; #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 struct ptrace_lwpinfo32 pl; #else struct ptrace_lwpinfo pl; #endif td = (struct thread *)arg; size = sizeof(structsize) + sizeof(pl); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(pl); sbuf_bcat(sb, &structsize, sizeof(structsize)); bzero(&pl, sizeof(pl)); pl.pl_lwpid = td->td_tid; pl.pl_event = PL_EVENT_NONE; pl.pl_sigmask = td->td_sigmask; pl.pl_siglist = td->td_siglist; if (td->td_si.si_signo != 0) { pl.pl_event = PL_EVENT_SIGNAL; pl.pl_flags |= PL_FLAG_SI; #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 siginfo_to_siginfo32(&td->td_si, &pl.pl_siginfo); #else pl.pl_siginfo = td->td_si; #endif } strcpy(pl.pl_tdname, td->td_name); /* XXX TODO: supply more information in struct ptrace_lwpinfo*/ sbuf_bcat(sb, &pl, sizeof(pl)); } *sizep = size; } /* * Allow for MD specific notes, as well as any MD * specific preparations for writing MI notes. 
*/ static void __elfN(note_threadmd)(void *arg, struct sbuf *sb, size_t *sizep) { struct thread *td; void *buf; size_t size; td = (struct thread *)arg; size = *sizep; if (size != 0 && sb != NULL) buf = malloc(size, M_TEMP, M_ZERO | M_WAITOK); else buf = NULL; size = 0; __elfN(dump_thread)(td, buf, &size); KASSERT(sb == NULL || *sizep == size, ("invalid size")); if (size != 0 && sb != NULL) sbuf_bcat(sb, buf, size); free(buf, M_TEMP); *sizep = size; } #ifdef KINFO_PROC_SIZE CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE); #endif static void __elfN(note_procstat_proc)(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + p->p_numthreads * sizeof(elf_kinfo_proc_t); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(elf_kinfo_proc_t); sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); kern_proc_out(p, sb, ELF_KERN_PROC_MASK); } *sizep = size; } #ifdef KINFO_FILE_SIZE CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE); #endif static void note_procstat_files(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size, sect_sz, i; ssize_t start_len, sect_len; int structsize, filedesc_flags; if (coredump_pack_fileinfo) filedesc_flags = KERN_FILEDESC_PACK_KINFO; else filedesc_flags = 0; p = (struct proc *)arg; structsize = sizeof(struct kinfo_file); if (sb == NULL) { size = 0; sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN); sbuf_set_drain(sb, sbuf_count_drain, &size); sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); kern_proc_filedesc_out(p, sb, -1, filedesc_flags); sbuf_finish(sb); sbuf_delete(sb); *sizep = size; } else { sbuf_start_section(sb, &start_len); sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); kern_proc_filedesc_out(p, sb, *sizep - sizeof(structsize), filedesc_flags); sect_len = sbuf_end_section(sb, start_len, 0, 0); if (sect_len < 0) return; sect_sz = sect_len; KASSERT(sect_sz <= *sizep, ("kern_proc_filedesc_out did not respect maxlen; " "requested %zu, got %zu", *sizep - sizeof(structsize), sect_sz - sizeof(structsize))); for (i = 0; i < *sizep - sect_sz && sb->s_error == 0; i++) sbuf_putc(sb, 0); } } #ifdef KINFO_VMENTRY_SIZE CTASSERT(sizeof(struct kinfo_vmentry) == KINFO_VMENTRY_SIZE); #endif static void note_procstat_vmmap(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize, vmmap_flags; if (coredump_pack_vmmapinfo) vmmap_flags = KERN_VMMAP_PACK_KINFO; else vmmap_flags = 0; p = (struct proc *)arg; structsize = sizeof(struct kinfo_vmentry); if (sb == NULL) { size = 0; sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN); sbuf_set_drain(sb, sbuf_count_drain, &size); sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); kern_proc_vmmap_out(p, sb, -1, vmmap_flags); sbuf_finish(sb); sbuf_delete(sb); *sizep = size; } else { sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); kern_proc_vmmap_out(p, sb, *sizep - sizeof(structsize), vmmap_flags); } } static void note_procstat_groups(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + p->p_ucred->cr_ngroups * sizeof(gid_t); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(gid_t); sbuf_bcat(sb, &structsize, sizeof(structsize)); sbuf_bcat(sb, p->p_ucred->cr_groups, p->p_ucred->cr_ngroups * sizeof(gid_t)); } *sizep = size; } static void note_procstat_umask(void *arg, struct sbuf *sb, size_t *sizep) { 
struct proc *p; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + sizeof(p->p_fd->fd_cmask); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(p->p_fd->fd_cmask); sbuf_bcat(sb, &structsize, sizeof(structsize)); sbuf_bcat(sb, &p->p_fd->fd_cmask, sizeof(p->p_fd->fd_cmask)); } *sizep = size; } static void note_procstat_rlimit(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; struct rlimit rlim[RLIM_NLIMITS]; size_t size; int structsize, i; p = (struct proc *)arg; size = sizeof(structsize) + sizeof(rlim); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(rlim); sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); for (i = 0; i < RLIM_NLIMITS; i++) lim_rlimit_proc(p, i, &rlim[i]); PROC_UNLOCK(p); sbuf_bcat(sb, rlim, sizeof(rlim)); } *sizep = size; } static void note_procstat_osrel(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + sizeof(p->p_osrel); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(p->p_osrel); sbuf_bcat(sb, &structsize, sizeof(structsize)); sbuf_bcat(sb, &p->p_osrel, sizeof(p->p_osrel)); } *sizep = size; } static void __elfN(note_procstat_psstrings)(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; elf_ps_strings_t ps_strings; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + sizeof(ps_strings); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(ps_strings); #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 ps_strings = PTROUT(p->p_sysent->sv_psstrings); #else ps_strings = p->p_sysent->sv_psstrings; #endif sbuf_bcat(sb, &structsize, sizeof(structsize)); sbuf_bcat(sb, &ps_strings, sizeof(ps_strings)); } *sizep = size; } static void __elfN(note_procstat_auxv)(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; if (sb == NULL) { size = 0; sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN); sbuf_set_drain(sb, sbuf_count_drain, &size); sbuf_bcat(sb, &structsize, sizeof(structsize)); PHOLD(p); proc_getauxv(curthread, p, sb); PRELE(p); sbuf_finish(sb); sbuf_delete(sb); *sizep = size; } else { structsize = sizeof(Elf_Auxinfo); sbuf_bcat(sb, &structsize, sizeof(structsize)); PHOLD(p); proc_getauxv(curthread, p, sb); PRELE(p); } } static boolean_t __elfN(parse_notes)(struct image_params *imgp, Elf_Note *checknote, const char *note_vendor, const Elf_Phdr *pnote, boolean_t (*cb)(const Elf_Note *, void *, boolean_t *), void *cb_arg) { const Elf_Note *note, *note0, *note_end; const char *note_name; char *buf; int i, error; boolean_t res; /* We need some limit, might as well use PAGE_SIZE. 
*/ if (pnote == NULL || pnote->p_filesz > PAGE_SIZE) return (FALSE); ASSERT_VOP_LOCKED(imgp->vp, "parse_notes"); if (pnote->p_offset > PAGE_SIZE || pnote->p_filesz > PAGE_SIZE - pnote->p_offset) { buf = malloc(pnote->p_filesz, M_TEMP, M_NOWAIT); if (buf == NULL) { VOP_UNLOCK(imgp->vp, 0); buf = malloc(pnote->p_filesz, M_TEMP, M_WAITOK); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); } error = vn_rdwr(UIO_READ, imgp->vp, buf, pnote->p_filesz, pnote->p_offset, UIO_SYSSPACE, IO_NODELOCKED, curthread->td_ucred, NOCRED, NULL, curthread); if (error != 0) { uprintf("i/o error PT_NOTE\n"); goto retf; } note = note0 = (const Elf_Note *)buf; note_end = (const Elf_Note *)(buf + pnote->p_filesz); } else { note = note0 = (const Elf_Note *)(imgp->image_header + pnote->p_offset); note_end = (const Elf_Note *)(imgp->image_header + pnote->p_offset + pnote->p_filesz); buf = NULL; } for (i = 0; i < 100 && note >= note0 && note < note_end; i++) { if (!aligned(note, Elf32_Addr) || (const char *)note_end - (const char *)note < sizeof(Elf_Note)) { goto retf; } if (note->n_namesz != checknote->n_namesz || note->n_descsz != checknote->n_descsz || note->n_type != checknote->n_type) goto nextnote; note_name = (const char *)(note + 1); if (note_name + checknote->n_namesz >= (const char *)note_end || strncmp(note_vendor, note_name, checknote->n_namesz) != 0) goto nextnote; if (cb(note, cb_arg, &res)) goto ret; nextnote: note = (const Elf_Note *)((const char *)(note + 1) + roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE) + roundup2(note->n_descsz, ELF_NOTE_ROUNDSIZE)); } retf: res = FALSE; ret: free(buf, M_TEMP); return (res); } struct brandnote_cb_arg { Elf_Brandnote *brandnote; int32_t *osrel; }; static boolean_t brandnote_cb(const Elf_Note *note, void *arg0, boolean_t *res) { struct brandnote_cb_arg *arg; arg = arg0; /* * Fetch the osreldate for binary from the ELF OSABI-note if * necessary. */ *res = (arg->brandnote->flags & BN_TRANSLATE_OSREL) != 0 && arg->brandnote->trans_osrel != NULL ? arg->brandnote->trans_osrel(note, arg->osrel) : TRUE; return (TRUE); } static Elf_Note fctl_note = { .n_namesz = sizeof(FREEBSD_ABI_VENDOR), .n_descsz = sizeof(uint32_t), .n_type = NT_FREEBSD_FEATURE_CTL, }; struct fctl_cb_arg { uint32_t *fctl0; }; static boolean_t note_fctl_cb(const Elf_Note *note, void *arg0, boolean_t *res) { struct fctl_cb_arg *arg; const Elf32_Word *desc; uintptr_t p; arg = arg0; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE); desc = (const Elf32_Word *)p; *arg->fctl0 = desc[0]; return (TRUE); } /* * Try to find the appropriate ABI-note section for checknote, fetch * the osreldate and feature control flags for binary from the ELF * OSABI-note. Only the first page of the image is searched, the same * as for headers. 
*/ static boolean_t __elfN(check_note)(struct image_params *imgp, Elf_Brandnote *brandnote, int32_t *osrel, uint32_t *fctl0) { const Elf_Phdr *phdr; const Elf_Ehdr *hdr; struct brandnote_cb_arg b_arg; struct fctl_cb_arg f_arg; int i, j; hdr = (const Elf_Ehdr *)imgp->image_header; phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff); b_arg.brandnote = brandnote; b_arg.osrel = osrel; f_arg.fctl0 = fctl0; for (i = 0; i < hdr->e_phnum; i++) { if (phdr[i].p_type == PT_NOTE && __elfN(parse_notes)(imgp, &brandnote->hdr, brandnote->vendor, &phdr[i], brandnote_cb, &b_arg)) { for (j = 0; j < hdr->e_phnum; j++) { if (phdr[j].p_type == PT_NOTE && __elfN(parse_notes)(imgp, &fctl_note, FREEBSD_ABI_VENDOR, &phdr[j], note_fctl_cb, &f_arg)) break; } return (TRUE); } } return (FALSE); } /* * Tell kern_execve.c about it, with a little help from the linker. */ static struct execsw __elfN(execsw) = { .ex_imgact = __CONCAT(exec_, __elfN(imgact)), .ex_name = __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) }; EXEC_SET(__CONCAT(elf, __ELF_WORD_SIZE), __elfN(execsw)); static vm_prot_t __elfN(trans_prot)(Elf_Word flags) { vm_prot_t prot; prot = 0; if (flags & PF_X) prot |= VM_PROT_EXECUTE; if (flags & PF_W) prot |= VM_PROT_WRITE; if (flags & PF_R) prot |= VM_PROT_READ; #if __ELF_WORD_SIZE == 32 && (defined(__amd64__) || defined(__i386__)) if (i386_read_exec && (flags & PF_R)) prot |= VM_PROT_EXECUTE; #endif return (prot); } static Elf_Word __elfN(untrans_prot)(vm_prot_t prot) { Elf_Word flags; flags = 0; if (prot & VM_PROT_EXECUTE) flags |= PF_X; if (prot & VM_PROT_READ) flags |= PF_R; if (prot & VM_PROT_WRITE) flags |= PF_W; return (flags); } void __elfN(stackgap)(struct image_params *imgp, u_long *stack_base) { u_long range, rbase, gap; int pct; if ((imgp->map_flags & MAP_ASLR) == 0) return; pct = __elfN(aslr_stack_gap); if (pct == 0) return; if (pct > 50) pct = 50; range = imgp->eff_stack_sz * pct / 100; arc4rand(&rbase, sizeof(rbase), 0); gap = rbase % range; gap &= ~(sizeof(u_long) - 1); *stack_base -= gap; } Index: projects/clang900-import/sys/kern/kern_timeout.c =================================================================== --- projects/clang900-import/sys/kern/kern_timeout.c (revision 352586) +++ projects/clang900-import/sys/kern/kern_timeout.c (revision 352587) @@ -1,1718 +1,1720 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_callout_profiling.h" #include "opt_ddb.h" #if defined(__arm__) #include "opt_timer.h" #endif #include "opt_rss.h" #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #include #endif #ifdef SMP #include #endif #ifndef NO_EVENTTIMERS DPCPU_DECLARE(sbintime_t, hardclocktime); #endif SDT_PROVIDER_DEFINE(callout_execute); SDT_PROBE_DEFINE1(callout_execute, , , callout__start, "struct callout *"); SDT_PROBE_DEFINE1(callout_execute, , , callout__end, "struct callout *"); #ifdef CALLOUT_PROFILING static int avg_depth; SYSCTL_INT(_debug, OID_AUTO, to_avg_depth, CTLFLAG_RD, &avg_depth, 0, "Average number of items examined per softclock call. Units = 1/1000"); static int avg_gcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_gcalls, CTLFLAG_RD, &avg_gcalls, 0, "Average number of Giant callouts made per softclock call. Units = 1/1000"); static int avg_lockcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls, CTLFLAG_RD, &avg_lockcalls, 0, "Average number of lock callouts made per softclock call. Units = 1/1000"); static int avg_mpcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0, "Average number of MP callouts made per softclock call. Units = 1/1000"); static int avg_depth_dir; SYSCTL_INT(_debug, OID_AUTO, to_avg_depth_dir, CTLFLAG_RD, &avg_depth_dir, 0, "Average number of direct callouts examined per callout_process call. " "Units = 1/1000"); static int avg_lockcalls_dir; SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls_dir, CTLFLAG_RD, &avg_lockcalls_dir, 0, "Average number of lock direct callouts made per " "callout_process call. Units = 1/1000"); static int avg_mpcalls_dir; SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls_dir, CTLFLAG_RD, &avg_mpcalls_dir, 0, "Average number of MP direct callouts made per callout_process call. " "Units = 1/1000"); #endif static int ncallout; SYSCTL_INT(_kern, OID_AUTO, ncallout, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &ncallout, 0, "Number of entries in callwheel and size of timeout() preallocation"); #ifdef RSS static int pin_default_swi = 1; static int pin_pcpu_swi = 1; #else static int pin_default_swi = 0; static int pin_pcpu_swi = 0; #endif SYSCTL_INT(_kern, OID_AUTO, pin_default_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_default_swi, 0, "Pin the default (non-per-cpu) swi (shared with PCPU 0 swi)"); SYSCTL_INT(_kern, OID_AUTO, pin_pcpu_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_pcpu_swi, 0, "Pin the per-CPU swis (except PCPU 0, which is also default"); /* * TODO: * allocate more timeout table slots when table overflows. 
*/ u_int callwheelsize, callwheelmask; /* * The callout cpu exec entities represent informations necessary for * describing the state of callouts currently running on the CPU and the ones * necessary for migrating callouts to the new callout cpu. In particular, * the first entry of the array cc_exec_entity holds informations for callout * running in SWI thread context, while the second one holds informations * for callout running directly from hardware interrupt context. * The cached informations are very important for deferring migration when * the migrating callout is already running. */ struct cc_exec { struct callout *cc_curr; void (*cc_drain)(void *); void *cc_last_func; void *cc_last_arg; #ifdef SMP void (*ce_migration_func)(void *); void *ce_migration_arg; sbintime_t ce_migration_time; sbintime_t ce_migration_prec; int ce_migration_cpu; #endif bool cc_cancel; bool cc_waiting; }; /* * There is one struct callout_cpu per cpu, holding all relevant * state for the callout processing thread on the individual CPU. */ struct callout_cpu { struct mtx_padalign cc_lock; struct cc_exec cc_exec_entity[2]; struct callout *cc_next; struct callout *cc_callout; struct callout_list *cc_callwheel; struct callout_tailq cc_expireq; struct callout_slist cc_callfree; sbintime_t cc_firstevent; sbintime_t cc_lastscan; void *cc_cookie; u_int cc_bucket; u_int cc_inited; char cc_ktr_event_name[20]; }; #define callout_migrating(c) ((c)->c_iflags & CALLOUT_DFRMIGRATION) #define cc_exec_curr(cc, dir) cc->cc_exec_entity[dir].cc_curr #define cc_exec_last_func(cc, dir) cc->cc_exec_entity[dir].cc_last_func #define cc_exec_last_arg(cc, dir) cc->cc_exec_entity[dir].cc_last_arg #define cc_exec_drain(cc, dir) cc->cc_exec_entity[dir].cc_drain #define cc_exec_next(cc) cc->cc_next #define cc_exec_cancel(cc, dir) cc->cc_exec_entity[dir].cc_cancel #define cc_exec_waiting(cc, dir) cc->cc_exec_entity[dir].cc_waiting #ifdef SMP #define cc_migration_func(cc, dir) cc->cc_exec_entity[dir].ce_migration_func #define cc_migration_arg(cc, dir) cc->cc_exec_entity[dir].ce_migration_arg #define cc_migration_cpu(cc, dir) cc->cc_exec_entity[dir].ce_migration_cpu #define cc_migration_time(cc, dir) cc->cc_exec_entity[dir].ce_migration_time #define cc_migration_prec(cc, dir) cc->cc_exec_entity[dir].ce_migration_prec struct callout_cpu cc_cpu[MAXCPU]; #define CPUBLOCK MAXCPU #define CC_CPU(cpu) (&cc_cpu[(cpu)]) #define CC_SELF() CC_CPU(PCPU_GET(cpuid)) #else struct callout_cpu cc_cpu; #define CC_CPU(cpu) &cc_cpu #define CC_SELF() &cc_cpu #endif #define CC_LOCK(cc) mtx_lock_spin(&(cc)->cc_lock) #define CC_UNLOCK(cc) mtx_unlock_spin(&(cc)->cc_lock) #define CC_LOCK_ASSERT(cc) mtx_assert(&(cc)->cc_lock, MA_OWNED) static int timeout_cpu; static void callout_cpu_init(struct callout_cpu *cc, int cpu); static void softclock_call_cc(struct callout *c, struct callout_cpu *cc, #ifdef CALLOUT_PROFILING int *mpcalls, int *lockcalls, int *gcalls, #endif int direct); static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures"); /** * Locked by cc_lock: * cc_curr - If a callout is in progress, it is cc_curr. * If cc_curr is non-NULL, threads waiting in * callout_drain() will be woken up as soon as the * relevant callout completes. * cc_cancel - Changing to 1 with both callout_lock and cc_lock held * guarantees that the current callout will not run. * The softclock() function sets this to 0 before it * drops callout_lock to acquire c_lock, and it calls * the handler only if curr_cancelled is still 0 after * cc_lock is successfully acquired. 
* cc_waiting - If a thread is waiting in callout_drain(), then * callout_wait is nonzero. Set only when * cc_curr is non-NULL. */ /* * Resets the execution entity tied to a specific callout cpu. */ static void cc_cce_cleanup(struct callout_cpu *cc, int direct) { cc_exec_curr(cc, direct) = NULL; cc_exec_cancel(cc, direct) = false; cc_exec_waiting(cc, direct) = false; #ifdef SMP cc_migration_cpu(cc, direct) = CPUBLOCK; cc_migration_time(cc, direct) = 0; cc_migration_prec(cc, direct) = 0; cc_migration_func(cc, direct) = NULL; cc_migration_arg(cc, direct) = NULL; #endif } /* * Checks if migration is requested by a specific callout cpu. */ static int cc_cce_migrating(struct callout_cpu *cc, int direct) { #ifdef SMP return (cc_migration_cpu(cc, direct) != CPUBLOCK); #else return (0); #endif } /* * Kernel low level callwheel initialization * called on the BSP during kernel startup. */ static void callout_callwheel_init(void *dummy) { struct callout_cpu *cc; /* * Calculate the size of the callout wheel and the preallocated * timeout() structures. * XXX: Clip callout to result of previous function of maxusers * maximum 384. This is still huge, but acceptable. */ memset(CC_CPU(curcpu), 0, sizeof(cc_cpu)); ncallout = imin(16 + maxproc + maxfiles, 18508); TUNABLE_INT_FETCH("kern.ncallout", &ncallout); /* * Calculate callout wheel size, should be next power of two higher * than 'ncallout'. */ callwheelsize = 1 << fls(ncallout); callwheelmask = callwheelsize - 1; /* * Fetch whether we're pinning the swi's or not. */ TUNABLE_INT_FETCH("kern.pin_default_swi", &pin_default_swi); TUNABLE_INT_FETCH("kern.pin_pcpu_swi", &pin_pcpu_swi); /* * Only BSP handles timeout(9) and receives a preallocation. * * XXX: Once all timeout(9) consumers are converted this can * be removed. */ timeout_cpu = PCPU_GET(cpuid); cc = CC_CPU(timeout_cpu); cc->cc_callout = malloc(ncallout * sizeof(struct callout), M_CALLOUT, M_WAITOK); callout_cpu_init(cc, timeout_cpu); } SYSINIT(callwheel_init, SI_SUB_CPU, SI_ORDER_ANY, callout_callwheel_init, NULL); /* * Initialize the per-cpu callout structures. */ static void callout_cpu_init(struct callout_cpu *cc, int cpu) { struct callout *c; int i; mtx_init(&cc->cc_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE); SLIST_INIT(&cc->cc_callfree); cc->cc_inited = 1; - cc->cc_callwheel = malloc(sizeof(struct callout_list) * callwheelsize, - M_CALLOUT, M_WAITOK); + cc->cc_callwheel = malloc_domainset(sizeof(struct callout_list) * + callwheelsize, M_CALLOUT, + DOMAINSET_PREF(pcpu_find(cpu)->pc_domain), M_WAITOK); for (i = 0; i < callwheelsize; i++) LIST_INIT(&cc->cc_callwheel[i]); TAILQ_INIT(&cc->cc_expireq); cc->cc_firstevent = SBT_MAX; for (i = 0; i < 2; i++) cc_cce_cleanup(cc, i); snprintf(cc->cc_ktr_event_name, sizeof(cc->cc_ktr_event_name), "callwheel cpu %d", cpu); if (cc->cc_callout == NULL) /* Only BSP handles timeout(9) */ return; for (i = 0; i < ncallout; i++) { c = &cc->cc_callout[i]; callout_init(c, 0); c->c_iflags = CALLOUT_LOCAL_ALLOC; SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle); } } #ifdef SMP /* * Switches the cpu tied to a specific callout. * The function expects a locked incoming callout cpu and returns with * locked outcoming callout cpu. 
*/ static struct callout_cpu * callout_cpu_switch(struct callout *c, struct callout_cpu *cc, int new_cpu) { struct callout_cpu *new_cc; MPASS(c != NULL && cc != NULL); CC_LOCK_ASSERT(cc); /* * Avoid interrupts and preemption firing after the callout cpu * is blocked in order to avoid deadlocks as the new thread * may be willing to acquire the callout cpu lock. */ c->c_cpu = CPUBLOCK; spinlock_enter(); CC_UNLOCK(cc); new_cc = CC_CPU(new_cpu); CC_LOCK(new_cc); spinlock_exit(); c->c_cpu = new_cpu; return (new_cc); } #endif /* * Start standard softclock thread. */ static void start_softclock(void *dummy) { struct callout_cpu *cc; char name[MAXCOMLEN]; #ifdef SMP int cpu; struct intr_event *ie; #endif cc = CC_CPU(timeout_cpu); snprintf(name, sizeof(name), "clock (%d)", timeout_cpu); if (swi_add(&clk_intr_event, name, softclock, cc, SWI_CLOCK, INTR_MPSAFE, &cc->cc_cookie)) panic("died while creating standard software ithreads"); if (pin_default_swi && (intr_event_bind(clk_intr_event, timeout_cpu) != 0)) { printf("%s: timeout clock couldn't be pinned to cpu %d\n", __func__, timeout_cpu); } #ifdef SMP CPU_FOREACH(cpu) { if (cpu == timeout_cpu) continue; cc = CC_CPU(cpu); cc->cc_callout = NULL; /* Only BSP handles timeout(9). */ callout_cpu_init(cc, cpu); snprintf(name, sizeof(name), "clock (%d)", cpu); ie = NULL; if (swi_add(&ie, name, softclock, cc, SWI_CLOCK, INTR_MPSAFE, &cc->cc_cookie)) panic("died while creating standard software ithreads"); if (pin_pcpu_swi && (intr_event_bind(ie, cpu) != 0)) { printf("%s: per-cpu clock couldn't be pinned to " "cpu %d\n", __func__, cpu); } } #endif } SYSINIT(start_softclock, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softclock, NULL); #define CC_HASH_SHIFT 8 static inline u_int callout_hash(sbintime_t sbt) { return (sbt >> (32 - CC_HASH_SHIFT)); } static inline u_int callout_get_bucket(sbintime_t sbt) { return (callout_hash(sbt) & callwheelmask); } void callout_process(sbintime_t now) { struct callout *tmp, *tmpn; struct callout_cpu *cc; struct callout_list *sc; sbintime_t first, last, max, tmp_max; uint32_t lookahead; u_int firstb, lastb, nowb; #ifdef CALLOUT_PROFILING int depth_dir = 0, mpcalls_dir = 0, lockcalls_dir = 0; #endif cc = CC_SELF(); mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET); /* Compute the buckets of the last scan and present times. */ firstb = callout_hash(cc->cc_lastscan); cc->cc_lastscan = now; nowb = callout_hash(now); /* Compute the last bucket and minimum time of the bucket after it. */ if (nowb == firstb) lookahead = (SBT_1S / 16); else if (nowb - firstb == 1) lookahead = (SBT_1S / 8); else lookahead = (SBT_1S / 2); first = last = now; first += (lookahead / 2); last += lookahead; last &= (0xffffffffffffffffLLU << (32 - CC_HASH_SHIFT)); lastb = callout_hash(last) - 1; max = last; /* * Check if we wrapped around the entire wheel from the last scan. * In case, we need to scan entirely the wheel for pending callouts. */ if (lastb - firstb >= callwheelsize) { lastb = firstb + callwheelsize - 1; if (nowb - firstb >= callwheelsize) nowb = lastb; } /* Iterate callwheel from firstb to nowb and then up to lastb. */ do { sc = &cc->cc_callwheel[firstb & callwheelmask]; tmp = LIST_FIRST(sc); while (tmp != NULL) { /* Run the callout if present time within allowed. */ if (tmp->c_time <= now) { /* * Consumer told us the callout may be run * directly from hardware interrupt context. 
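 * Such callouts are executed immediately from this context; all others * are queued on cc_expireq and handled later by the softclock() SWI.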
*/ if (tmp->c_iflags & CALLOUT_DIRECT) { #ifdef CALLOUT_PROFILING ++depth_dir; #endif cc_exec_next(cc) = LIST_NEXT(tmp, c_links.le); cc->cc_bucket = firstb & callwheelmask; LIST_REMOVE(tmp, c_links.le); softclock_call_cc(tmp, cc, #ifdef CALLOUT_PROFILING &mpcalls_dir, &lockcalls_dir, NULL, #endif 1); tmp = cc_exec_next(cc); cc_exec_next(cc) = NULL; } else { tmpn = LIST_NEXT(tmp, c_links.le); LIST_REMOVE(tmp, c_links.le); TAILQ_INSERT_TAIL(&cc->cc_expireq, tmp, c_links.tqe); tmp->c_iflags |= CALLOUT_PROCESSED; tmp = tmpn; } continue; } /* Skip events from distant future. */ if (tmp->c_time >= max) goto next; /* * Event minimal time is bigger than present maximal * time, so it cannot be aggregated. */ if (tmp->c_time > last) { lastb = nowb; goto next; } /* Update first and last time, respecting this event. */ if (tmp->c_time < first) first = tmp->c_time; tmp_max = tmp->c_time + tmp->c_precision; if (tmp_max < last) last = tmp_max; next: tmp = LIST_NEXT(tmp, c_links.le); } /* Proceed with the next bucket. */ firstb++; /* * Stop if we looked after present time and found * some event we can't execute at now. * Stop if we looked far enough into the future. */ } while (((int)(firstb - lastb)) <= 0); cc->cc_firstevent = last; #ifndef NO_EVENTTIMERS cpu_new_callout(curcpu, last, first); #endif #ifdef CALLOUT_PROFILING avg_depth_dir += (depth_dir * 1000 - avg_depth_dir) >> 8; avg_mpcalls_dir += (mpcalls_dir * 1000 - avg_mpcalls_dir) >> 8; avg_lockcalls_dir += (lockcalls_dir * 1000 - avg_lockcalls_dir) >> 8; #endif mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET); /* * swi_sched acquires the thread lock, so we don't want to call it * with cc_lock held; incorrect locking order. */ if (!TAILQ_EMPTY(&cc->cc_expireq)) swi_sched(cc->cc_cookie, 0); } static struct callout_cpu * callout_lock(struct callout *c) { struct callout_cpu *cc; int cpu; for (;;) { cpu = c->c_cpu; #ifdef SMP if (cpu == CPUBLOCK) { while (c->c_cpu == CPUBLOCK) cpu_spinwait(); continue; } #endif cc = CC_CPU(cpu); CC_LOCK(cc); if (cpu == c->c_cpu) break; CC_UNLOCK(cc); } return (cc); } static void callout_cc_add(struct callout *c, struct callout_cpu *cc, sbintime_t sbt, sbintime_t precision, void (*func)(void *), void *arg, int cpu, int flags) { int bucket; CC_LOCK_ASSERT(cc); if (sbt < cc->cc_lastscan) sbt = cc->cc_lastscan; c->c_arg = arg; c->c_iflags |= CALLOUT_PENDING; c->c_iflags &= ~CALLOUT_PROCESSED; c->c_flags |= CALLOUT_ACTIVE; if (flags & C_DIRECT_EXEC) c->c_iflags |= CALLOUT_DIRECT; c->c_func = func; c->c_time = sbt; c->c_precision = precision; bucket = callout_get_bucket(c->c_time); CTR3(KTR_CALLOUT, "precision set for %p: %d.%08x", c, (int)(c->c_precision >> 32), (u_int)(c->c_precision & 0xffffffff)); LIST_INSERT_HEAD(&cc->cc_callwheel[bucket], c, c_links.le); if (cc->cc_bucket == bucket) cc_exec_next(cc) = c; #ifndef NO_EVENTTIMERS /* * Inform the eventtimers(4) subsystem there's a new callout * that has been inserted, but only if really required. 
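 * The precision is clamped first so that c_time + c_precision cannot * overflow SBT_MAX, and cpu_new_callout() is called only when the new * deadline falls before the currently programmed cc_firstevent.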
*/ if (SBT_MAX - c->c_time < c->c_precision) c->c_precision = SBT_MAX - c->c_time; sbt = c->c_time + c->c_precision; if (sbt < cc->cc_firstevent) { cc->cc_firstevent = sbt; cpu_new_callout(cpu, sbt, c->c_time); } #endif } static void callout_cc_del(struct callout *c, struct callout_cpu *cc) { if ((c->c_iflags & CALLOUT_LOCAL_ALLOC) == 0) return; c->c_func = NULL; SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle); } static void softclock_call_cc(struct callout *c, struct callout_cpu *cc, #ifdef CALLOUT_PROFILING int *mpcalls, int *lockcalls, int *gcalls, #endif int direct) { struct rm_priotracker tracker; void (*c_func)(void *); void *c_arg; struct lock_class *class; struct lock_object *c_lock; uintptr_t lock_status; int c_iflags; #ifdef SMP struct callout_cpu *new_cc; void (*new_func)(void *); void *new_arg; int flags, new_cpu; sbintime_t new_prec, new_time; #endif #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) sbintime_t sbt1, sbt2; struct timespec ts2; static sbintime_t maxdt = 2 * SBT_1MS; /* 2 msec */ static timeout_t *lastfunc; #endif KASSERT((c->c_iflags & CALLOUT_PENDING) == CALLOUT_PENDING, ("softclock_call_cc: pend %p %x", c, c->c_iflags)); KASSERT((c->c_flags & CALLOUT_ACTIVE) == CALLOUT_ACTIVE, ("softclock_call_cc: act %p %x", c, c->c_flags)); class = (c->c_lock != NULL) ? LOCK_CLASS(c->c_lock) : NULL; lock_status = 0; if (c->c_flags & CALLOUT_SHAREDLOCK) { if (class == &lock_class_rm) lock_status = (uintptr_t)&tracker; else lock_status = 1; } c_lock = c->c_lock; c_func = c->c_func; c_arg = c->c_arg; c_iflags = c->c_iflags; if (c->c_iflags & CALLOUT_LOCAL_ALLOC) c->c_iflags = CALLOUT_LOCAL_ALLOC; else c->c_iflags &= ~CALLOUT_PENDING; cc_exec_curr(cc, direct) = c; cc_exec_last_func(cc, direct) = c_func; cc_exec_last_arg(cc, direct) = c_arg; cc_exec_cancel(cc, direct) = false; cc_exec_drain(cc, direct) = NULL; CC_UNLOCK(cc); if (c_lock != NULL) { class->lc_lock(c_lock, lock_status); /* * The callout may have been cancelled * while we switched locks. */ if (cc_exec_cancel(cc, direct)) { class->lc_unlock(c_lock); goto skip; } /* The callout cannot be stopped now. 
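 * Setting cc_cancel here closes the cancellation window: a concurrent * callout_stop() or callout_reset() that finds it already true will no * longer treat the callout as cancellable.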
*/ cc_exec_cancel(cc, direct) = true; if (c_lock == &Giant.lock_object) { #ifdef CALLOUT_PROFILING (*gcalls)++; #endif CTR3(KTR_CALLOUT, "callout giant %p func %p arg %p", c, c_func, c_arg); } else { #ifdef CALLOUT_PROFILING (*lockcalls)++; #endif CTR3(KTR_CALLOUT, "callout lock %p func %p arg %p", c, c_func, c_arg); } } else { #ifdef CALLOUT_PROFILING (*mpcalls)++; #endif CTR3(KTR_CALLOUT, "callout %p func %p arg %p", c, c_func, c_arg); } KTR_STATE3(KTR_SCHED, "callout", cc->cc_ktr_event_name, "running", "func:%p", c_func, "arg:%p", c_arg, "direct:%d", direct); #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) sbt1 = sbinuptime(); #endif THREAD_NO_SLEEPING(); SDT_PROBE1(callout_execute, , , callout__start, c); c_func(c_arg); SDT_PROBE1(callout_execute, , , callout__end, c); THREAD_SLEEPING_OK(); #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) sbt2 = sbinuptime(); sbt2 -= sbt1; if (sbt2 > maxdt) { if (lastfunc != c_func || sbt2 > maxdt * 2) { ts2 = sbttots(sbt2); printf( "Expensive timeout(9) function: %p(%p) %jd.%09ld s\n", c_func, c_arg, (intmax_t)ts2.tv_sec, ts2.tv_nsec); } maxdt = sbt2; lastfunc = c_func; } #endif KTR_STATE0(KTR_SCHED, "callout", cc->cc_ktr_event_name, "idle"); CTR1(KTR_CALLOUT, "callout %p finished", c); if ((c_iflags & CALLOUT_RETURNUNLOCKED) == 0) class->lc_unlock(c_lock); skip: CC_LOCK(cc); KASSERT(cc_exec_curr(cc, direct) == c, ("mishandled cc_curr")); cc_exec_curr(cc, direct) = NULL; if (cc_exec_drain(cc, direct)) { void (*drain)(void *); drain = cc_exec_drain(cc, direct); cc_exec_drain(cc, direct) = NULL; CC_UNLOCK(cc); drain(c_arg); CC_LOCK(cc); } if (cc_exec_waiting(cc, direct)) { /* * There is someone waiting for the * callout to complete. * If the callout was scheduled for * migration just cancel it. */ if (cc_cce_migrating(cc, direct)) { cc_cce_cleanup(cc, direct); /* * It should be assert here that the callout is not * destroyed but that is not easy. */ c->c_iflags &= ~CALLOUT_DFRMIGRATION; } cc_exec_waiting(cc, direct) = false; CC_UNLOCK(cc); wakeup(&cc_exec_waiting(cc, direct)); CC_LOCK(cc); } else if (cc_cce_migrating(cc, direct)) { KASSERT((c_iflags & CALLOUT_LOCAL_ALLOC) == 0, ("Migrating legacy callout %p", c)); #ifdef SMP /* * If the callout was scheduled for * migration just perform it now. */ new_cpu = cc_migration_cpu(cc, direct); new_time = cc_migration_time(cc, direct); new_prec = cc_migration_prec(cc, direct); new_func = cc_migration_func(cc, direct); new_arg = cc_migration_arg(cc, direct); cc_cce_cleanup(cc, direct); /* * It should be assert here that the callout is not destroyed * but that is not easy. * * As first thing, handle deferred callout stops. */ if (!callout_migrating(c)) { CTR3(KTR_CALLOUT, "deferred cancelled %p func %p arg %p", c, new_func, new_arg); callout_cc_del(c, cc); return; } c->c_iflags &= ~CALLOUT_DFRMIGRATION; new_cc = callout_cpu_switch(c, cc, new_cpu); flags = (direct) ? C_DIRECT_EXEC : 0; callout_cc_add(c, new_cc, new_time, new_prec, new_func, new_arg, new_cpu, flags); CC_UNLOCK(new_cc); CC_LOCK(cc); #else panic("migration should not happen"); #endif } /* * If the current callout is locally allocated (from * timeout(9)) then put it on the freelist. * * Note: we need to check the cached copy of c_iflags because * if it was not local, then it's not safe to deref the * callout pointer. 
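 * A caller-owned callout may already have been freed by its owner once * the handler returned, so only the cached flags can be trusted at this * point.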
*/ KASSERT((c_iflags & CALLOUT_LOCAL_ALLOC) == 0 || c->c_iflags == CALLOUT_LOCAL_ALLOC, ("corrupted callout")); if (c_iflags & CALLOUT_LOCAL_ALLOC) callout_cc_del(c, cc); } /* * The callout mechanism is based on the work of Adam M. Costello and * George Varghese, published in a technical report entitled "Redesigning * the BSD Callout and Timer Facilities" and modified slightly for inclusion * in FreeBSD by Justin T. Gibbs. The original work on the data structures * used in this implementation was published by G. Varghese and T. Lauck in * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for * the Efficient Implementation of a Timer Facility" in the Proceedings of * the 11th ACM Annual Symposium on Operating Systems Principles, * Austin, Texas Nov 1987. */ /* * Software (low priority) clock interrupt. * Run periodic events from timeout queue. */ void softclock(void *arg) { struct callout_cpu *cc; struct callout *c; #ifdef CALLOUT_PROFILING int depth = 0, gcalls = 0, lockcalls = 0, mpcalls = 0; #endif cc = (struct callout_cpu *)arg; CC_LOCK(cc); while ((c = TAILQ_FIRST(&cc->cc_expireq)) != NULL) { TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); softclock_call_cc(c, cc, #ifdef CALLOUT_PROFILING &mpcalls, &lockcalls, &gcalls, #endif 0); #ifdef CALLOUT_PROFILING ++depth; #endif } #ifdef CALLOUT_PROFILING avg_depth += (depth * 1000 - avg_depth) >> 8; avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8; avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8; avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8; #endif CC_UNLOCK(cc); } /* * timeout -- * Execute a function after a specified length of time. * * untimeout -- * Cancel previous timeout function call. * * callout_handle_init -- * Initialize a handle so that using it with untimeout is benign. * * See AT&T BCI Driver Reference Manual for specification. This * implementation differs from that one in that although an * identification value is returned from timeout, the original * arguments to timeout as well as the identifier are used to * identify entries for untimeout. */ struct callout_handle timeout(timeout_t *ftn, void *arg, int to_ticks) { struct callout_cpu *cc; struct callout *new; struct callout_handle handle; cc = CC_CPU(timeout_cpu); CC_LOCK(cc); /* Fill in the next free callout structure. */ new = SLIST_FIRST(&cc->cc_callfree); if (new == NULL) /* XXX Attempt to malloc first */ panic("timeout table full"); SLIST_REMOVE_HEAD(&cc->cc_callfree, c_links.sle); callout_reset(new, to_ticks, ftn, arg); handle.callout = new; CC_UNLOCK(cc); return (handle); } void untimeout(timeout_t *ftn, void *arg, struct callout_handle handle) { struct callout_cpu *cc; /* * Check for a handle that was initialized * by callout_handle_init, but never used * for a real timeout. */ if (handle.callout == NULL) return; cc = callout_lock(handle.callout); if (handle.callout->c_func == ftn && handle.callout->c_arg == arg) callout_stop(handle.callout); CC_UNLOCK(cc); } void callout_handle_init(struct callout_handle *handle) { handle->callout = NULL; } void callout_when(sbintime_t sbt, sbintime_t precision, int flags, sbintime_t *res, sbintime_t *prec_res) { sbintime_t to_sbt, to_pr; if ((flags & (C_ABSOLUTE | C_PRECALC)) != 0) { *res = sbt; *prec_res = precision; return; } if ((flags & C_HARDCLOCK) != 0 && sbt < tick_sbt) sbt = tick_sbt; if ((flags & C_HARDCLOCK) != 0 || #ifdef NO_EVENTTIMERS sbt >= sbt_timethreshold) { to_sbt = getsbinuptime(); /* Add safety belt for the case of hz > 1000. 
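 * getsbinuptime() can lag by up to one timecounter tick, which is * longer than a hardclock tick when hz > 1000, so compensate for the * difference here.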
*/ to_sbt += tc_tick_sbt - tick_sbt; #else sbt >= sbt_tickthreshold) { /* * Obtain the time of the last hardclock() call on * this CPU directly from the kern_clocksource.c. * This value is per-CPU, but it is equal for all * active ones. */ #ifdef __LP64__ to_sbt = DPCPU_GET(hardclocktime); #else spinlock_enter(); to_sbt = DPCPU_GET(hardclocktime); spinlock_exit(); #endif #endif if (cold && to_sbt == 0) to_sbt = sbinuptime(); if ((flags & C_HARDCLOCK) == 0) to_sbt += tick_sbt; } else to_sbt = sbinuptime(); if (SBT_MAX - to_sbt < sbt) to_sbt = SBT_MAX; else to_sbt += sbt; *res = to_sbt; to_pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp : sbt >> C_PRELGET(flags)); *prec_res = to_pr > precision ? to_pr : precision; } /* * New interface; clients allocate their own callout structures. * * callout_reset() - establish or change a timeout * callout_stop() - disestablish a timeout * callout_init() - initialize a callout structure so that it can * safely be passed to callout_reset() and callout_stop() * * defines three convenience macros: * * callout_active() - returns truth if callout has not been stopped, * drained, or deactivated since the last time the callout was * reset. * callout_pending() - returns truth if callout is still waiting for timeout * callout_deactivate() - marks the callout as having been serviced */ int callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t prec, void (*ftn)(void *), void *arg, int cpu, int flags) { sbintime_t to_sbt, precision; struct callout_cpu *cc; int cancelled, direct; int ignore_cpu=0; cancelled = 0; if (cpu == -1) { ignore_cpu = 1; } else if ((cpu >= MAXCPU) || ((CC_CPU(cpu))->cc_inited == 0)) { /* Invalid CPU spec */ panic("Invalid CPU in callout %d", cpu); } callout_when(sbt, prec, flags, &to_sbt, &precision); /* * This flag used to be added by callout_cc_add, but the * first time you call this we could end up with the * wrong direct flag if we don't do it before we add. */ if (flags & C_DIRECT_EXEC) { direct = 1; } else { direct = 0; } KASSERT(!direct || c->c_lock == NULL, ("%s: direct callout %p has lock", __func__, c)); cc = callout_lock(c); /* * Don't allow migration of pre-allocated callouts lest they * become unbalanced or handle the case where the user does * not care. */ if ((c->c_iflags & CALLOUT_LOCAL_ALLOC) || ignore_cpu) { cpu = c->c_cpu; } if (cc_exec_curr(cc, direct) == c) { /* * We're being asked to reschedule a callout which is * currently in progress. If there is a lock then we * can cancel the callout if it has not really started. */ if (c->c_lock != NULL && !cc_exec_cancel(cc, direct)) cancelled = cc_exec_cancel(cc, direct) = true; if (cc_exec_waiting(cc, direct) || cc_exec_drain(cc, direct)) { /* * Someone has called callout_drain to kill this * callout. Don't reschedule. */ CTR4(KTR_CALLOUT, "%s %p func %p arg %p", cancelled ? "cancelled" : "failed to cancel", c, c->c_func, c->c_arg); CC_UNLOCK(cc); return (cancelled); } #ifdef SMP if (callout_migrating(c)) { /* * This only occurs when a second callout_reset_sbt_on * is made after a previous one moved it into * deferred migration (below). Note we do *not* change * the prev_cpu even though the previous target may * be different. 
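 * Only the cached migration parameters are refreshed here; the actual * switch is still performed by softclock_call_cc() once the running * instance completes.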
*/ cc_migration_cpu(cc, direct) = cpu; cc_migration_time(cc, direct) = to_sbt; cc_migration_prec(cc, direct) = precision; cc_migration_func(cc, direct) = ftn; cc_migration_arg(cc, direct) = arg; cancelled = 1; CC_UNLOCK(cc); return (cancelled); } #endif } if (c->c_iflags & CALLOUT_PENDING) { if ((c->c_iflags & CALLOUT_PROCESSED) == 0) { if (cc_exec_next(cc) == c) cc_exec_next(cc) = LIST_NEXT(c, c_links.le); LIST_REMOVE(c, c_links.le); } else { TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); } cancelled = 1; c->c_iflags &= ~ CALLOUT_PENDING; c->c_flags &= ~ CALLOUT_ACTIVE; } #ifdef SMP /* * If the callout must migrate try to perform it immediately. * If the callout is currently running, just defer the migration * to a more appropriate moment. */ if (c->c_cpu != cpu) { if (cc_exec_curr(cc, direct) == c) { /* * Pending will have been removed since we are * actually executing the callout on another * CPU. That callout should be waiting on the * lock the caller holds. If we set both * active/and/pending after we return and the * lock on the executing callout proceeds, it * will then see pending is true and return. * At the return from the actual callout execution * the migration will occur in softclock_call_cc * and this new callout will be placed on the * new CPU via a call to callout_cpu_switch() which * will get the lock on the right CPU followed * by a call callout_cc_add() which will add it there. * (see above in softclock_call_cc()). */ cc_migration_cpu(cc, direct) = cpu; cc_migration_time(cc, direct) = to_sbt; cc_migration_prec(cc, direct) = precision; cc_migration_func(cc, direct) = ftn; cc_migration_arg(cc, direct) = arg; c->c_iflags |= (CALLOUT_DFRMIGRATION | CALLOUT_PENDING); c->c_flags |= CALLOUT_ACTIVE; CTR6(KTR_CALLOUT, "migration of %p func %p arg %p in %d.%08x to %u deferred", c, c->c_func, c->c_arg, (int)(to_sbt >> 32), (u_int)(to_sbt & 0xffffffff), cpu); CC_UNLOCK(cc); return (cancelled); } cc = callout_cpu_switch(c, cc, cpu); } #endif callout_cc_add(c, cc, to_sbt, precision, ftn, arg, cpu, flags); CTR6(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d.%08x", cancelled ? "re" : "", c, c->c_func, c->c_arg, (int)(to_sbt >> 32), (u_int)(to_sbt & 0xffffffff)); CC_UNLOCK(cc); return (cancelled); } /* * Common idioms that can be optimized in the future. */ int callout_schedule_on(struct callout *c, int to_ticks, int cpu) { return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, cpu); } int callout_schedule(struct callout *c, int to_ticks) { return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, c->c_cpu); } int _callout_stop_safe(struct callout *c, int flags, void (*drain)(void *)) { struct callout_cpu *cc, *old_cc; struct lock_class *class; int direct, sq_locked, use_lock; int cancelled, not_on_a_list; if ((flags & CS_DRAIN) != 0) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, c->c_lock, "calling %s", __func__); /* * Some old subsystems don't hold Giant while running a callout_stop(), * so just discard this check for the moment. 
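 * For Giant we therefore only record whether it happens to be held; for * any other lock class the caller must hold the lock and that is * asserted below.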
*/ if ((flags & CS_DRAIN) == 0 && c->c_lock != NULL) { if (c->c_lock == &Giant.lock_object) use_lock = mtx_owned(&Giant); else { use_lock = 1; class = LOCK_CLASS(c->c_lock); class->lc_assert(c->c_lock, LA_XLOCKED); } } else use_lock = 0; if (c->c_iflags & CALLOUT_DIRECT) { direct = 1; } else { direct = 0; } sq_locked = 0; old_cc = NULL; again: cc = callout_lock(c); if ((c->c_iflags & (CALLOUT_DFRMIGRATION | CALLOUT_PENDING)) == (CALLOUT_DFRMIGRATION | CALLOUT_PENDING) && ((c->c_flags & CALLOUT_ACTIVE) == CALLOUT_ACTIVE)) { /* * Special case where this slipped in while we * were migrating *as* the callout is about to * execute. The caller probably holds the lock * the callout wants. * * Get rid of the migration first. Then set * the flag that tells this code *not* to * try to remove it from any lists (its not * on one yet). When the callout wheel runs, * it will ignore this callout. */ c->c_iflags &= ~CALLOUT_PENDING; c->c_flags &= ~CALLOUT_ACTIVE; not_on_a_list = 1; } else { not_on_a_list = 0; } /* * If the callout was migrating while the callout cpu lock was * dropped, just drop the sleepqueue lock and check the states * again. */ if (sq_locked != 0 && cc != old_cc) { #ifdef SMP CC_UNLOCK(cc); sleepq_release(&cc_exec_waiting(old_cc, direct)); sq_locked = 0; old_cc = NULL; goto again; #else panic("migration should not happen"); #endif } /* * If the callout is running, try to stop it or drain it. */ if (cc_exec_curr(cc, direct) == c) { /* * Succeed we to stop it or not, we must clear the * active flag - this is what API users expect. If we're * draining and the callout is currently executing, first wait * until it finishes. */ if ((flags & CS_DRAIN) == 0) c->c_flags &= ~CALLOUT_ACTIVE; if ((flags & CS_DRAIN) != 0) { /* * The current callout is running (or just * about to run) and blocking is allowed, so * just wait for the current invocation to * finish. */ while (cc_exec_curr(cc, direct) == c) { /* * Use direct calls to sleepqueue interface * instead of cv/msleep in order to avoid * a LOR between cc_lock and sleepqueue * chain spinlocks. This piece of code * emulates a msleep_spin() call actually. * * If we already have the sleepqueue chain * locked, then we can safely block. If we * don't already have it locked, however, * we have to drop the cc_lock to lock * it. This opens several races, so we * restart at the beginning once we have * both locks. If nothing has changed, then * we will end up back here with sq_locked * set. */ if (!sq_locked) { CC_UNLOCK(cc); sleepq_lock( &cc_exec_waiting(cc, direct)); sq_locked = 1; old_cc = cc; goto again; } /* * Migration could be cancelled here, but * as long as it is still not sure when it * will be packed up, just let softclock() * take care of it. */ cc_exec_waiting(cc, direct) = true; DROP_GIANT(); CC_UNLOCK(cc); sleepq_add( &cc_exec_waiting(cc, direct), &cc->cc_lock.lock_object, "codrain", SLEEPQ_SLEEP, 0); sleepq_wait( &cc_exec_waiting(cc, direct), 0); sq_locked = 0; old_cc = NULL; /* Reacquire locks previously released. */ PICKUP_GIANT(); CC_LOCK(cc); } c->c_flags &= ~CALLOUT_ACTIVE; } else if (use_lock && !cc_exec_cancel(cc, direct) && (drain == NULL)) { /* * The current callout is waiting for its * lock which we hold. Cancel the callout * and return. After our caller drops the * lock, the callout will be skipped in * softclock(). This *only* works with a * callout_stop() *not* callout_drain() or * callout_async_drain(). 
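 * The drain variants must see the handler run (or be handed a drain * callback), whereas a plain stop only needs the handler to be skipped, * which the cc_cancel flag guarantees once softclock_call_cc() rechecks * it under c_lock.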
*/ cc_exec_cancel(cc, direct) = true; CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); KASSERT(!cc_cce_migrating(cc, direct), ("callout wrongly scheduled for migration")); if (callout_migrating(c)) { c->c_iflags &= ~CALLOUT_DFRMIGRATION; #ifdef SMP cc_migration_cpu(cc, direct) = CPUBLOCK; cc_migration_time(cc, direct) = 0; cc_migration_prec(cc, direct) = 0; cc_migration_func(cc, direct) = NULL; cc_migration_arg(cc, direct) = NULL; #endif } CC_UNLOCK(cc); KASSERT(!sq_locked, ("sleepqueue chain locked")); return (1); } else if (callout_migrating(c)) { /* * The callout is currently being serviced * and the "next" callout is scheduled at * its completion with a migration. We remove * the migration flag so it *won't* get rescheduled, * but we can't stop the one thats running so * we return 0. */ c->c_iflags &= ~CALLOUT_DFRMIGRATION; #ifdef SMP /* * We can't call cc_cce_cleanup here since * if we do it will remove .ce_curr and * its still running. This will prevent a * reschedule of the callout when the * execution completes. */ cc_migration_cpu(cc, direct) = CPUBLOCK; cc_migration_time(cc, direct) = 0; cc_migration_prec(cc, direct) = 0; cc_migration_func(cc, direct) = NULL; cc_migration_arg(cc, direct) = NULL; #endif CTR3(KTR_CALLOUT, "postponing stop %p func %p arg %p", c, c->c_func, c->c_arg); if (drain) { cc_exec_drain(cc, direct) = drain; } CC_UNLOCK(cc); return ((flags & CS_EXECUTING) != 0); } CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", c, c->c_func, c->c_arg); if (drain) { cc_exec_drain(cc, direct) = drain; } KASSERT(!sq_locked, ("sleepqueue chain still locked")); cancelled = ((flags & CS_EXECUTING) != 0); } else cancelled = 1; if (sq_locked) sleepq_release(&cc_exec_waiting(cc, direct)); if ((c->c_iflags & CALLOUT_PENDING) == 0) { CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", c, c->c_func, c->c_arg); /* * For not scheduled and not executing callout return * negative value. */ if (cc_exec_curr(cc, direct) != c) cancelled = -1; CC_UNLOCK(cc); return (cancelled); } c->c_iflags &= ~CALLOUT_PENDING; c->c_flags &= ~CALLOUT_ACTIVE; CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); if (not_on_a_list == 0) { if ((c->c_iflags & CALLOUT_PROCESSED) == 0) { if (cc_exec_next(cc) == c) cc_exec_next(cc) = LIST_NEXT(c, c_links.le); LIST_REMOVE(c, c_links.le); } else { TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); } } callout_cc_del(c, cc); CC_UNLOCK(cc); return (cancelled); } void callout_init(struct callout *c, int mpsafe) { bzero(c, sizeof *c); if (mpsafe) { c->c_lock = NULL; c->c_iflags = CALLOUT_RETURNUNLOCKED; } else { c->c_lock = &Giant.lock_object; c->c_iflags = 0; } c->c_cpu = timeout_cpu; } void _callout_init_lock(struct callout *c, struct lock_object *lock, int flags) { bzero(c, sizeof *c); c->c_lock = lock; KASSERT((flags & ~(CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK)) == 0, ("callout_init_lock: bad flags %d", flags)); KASSERT(lock != NULL || (flags & CALLOUT_RETURNUNLOCKED) == 0, ("callout_init_lock: CALLOUT_RETURNUNLOCKED with no lock")); KASSERT(lock == NULL || !(LOCK_CLASS(lock)->lc_flags & (LC_SPINLOCK | LC_SLEEPABLE)), ("%s: invalid lock class", __func__)); c->c_iflags = flags & (CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK); c->c_cpu = timeout_cpu; } #ifdef APM_FIXUP_CALLTODO /* * Adjust the kernel calltodo timeout list. This routine is used after * an APM resume to recalculate the calltodo timer list values with the * number of hz's we have been sleeping. 
The next hardclock() will detect * that there are fired timers and run softclock() to execute them. * * Please note, I have not done an exhaustive analysis of what code this * might break. I am motivated to have my select()'s and alarm()'s that * have expired during suspend firing upon resume so that the applications * which set the timer can do the maintanence the timer was for as close * as possible to the originally intended time. Testing this code for a * week showed that resuming from a suspend resulted in 22 to 25 timers * firing, which seemed independent on whether the suspend was 2 hours or * 2 days. Your milage may vary. - Ken Key */ void adjust_timeout_calltodo(struct timeval *time_change) { struct callout *p; unsigned long delta_ticks; /* * How many ticks were we asleep? * (stolen from tvtohz()). */ /* Don't do anything */ if (time_change->tv_sec < 0) return; else if (time_change->tv_sec <= LONG_MAX / 1000000) delta_ticks = howmany(time_change->tv_sec * 1000000 + time_change->tv_usec, tick) + 1; else if (time_change->tv_sec <= LONG_MAX / hz) delta_ticks = time_change->tv_sec * hz + howmany(time_change->tv_usec, tick) + 1; else delta_ticks = LONG_MAX; if (delta_ticks > INT_MAX) delta_ticks = INT_MAX; /* * Now rip through the timer calltodo list looking for timers * to expire. */ /* don't collide with softclock() */ CC_LOCK(cc); for (p = calltodo.c_next; p != NULL; p = p->c_next) { p->c_time -= delta_ticks; /* Break if the timer had more time on it than delta_ticks */ if (p->c_time > 0) break; /* take back the ticks the timer didn't use (p->c_time <= 0) */ delta_ticks = -p->c_time; } CC_UNLOCK(cc); return; } #endif /* APM_FIXUP_CALLTODO */ static int flssbt(sbintime_t sbt) { sbt += (uint64_t)sbt >> 1; if (sizeof(long) >= sizeof(sbintime_t)) return (flsl(sbt)); if (sbt >= SBT_1S) return (flsl(((uint64_t)sbt) >> 32) + 32); return (flsl(sbt)); } /* * Dump immediate statistic snapshot of the scheduled callouts. */ static int sysctl_kern_callout_stat(SYSCTL_HANDLER_ARGS) { struct callout *tmp; struct callout_cpu *cc; struct callout_list *sc; sbintime_t maxpr, maxt, medpr, medt, now, spr, st, t; int ct[64], cpr[64], ccpbk[32]; int error, val, i, count, tcum, pcum, maxc, c, medc; #ifdef SMP int cpu; #endif val = 0; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); count = maxc = 0; st = spr = maxt = maxpr = 0; bzero(ccpbk, sizeof(ccpbk)); bzero(ct, sizeof(ct)); bzero(cpr, sizeof(cpr)); now = sbinuptime(); #ifdef SMP CPU_FOREACH(cpu) { cc = CC_CPU(cpu); #else cc = CC_CPU(timeout_cpu); #endif CC_LOCK(cc); for (i = 0; i < callwheelsize; i++) { sc = &cc->cc_callwheel[i]; c = 0; LIST_FOREACH(tmp, sc, c_links.le) { c++; t = tmp->c_time - now; if (t < 0) t = 0; st += t / SBT_1US; spr += tmp->c_precision / SBT_1US; if (t > maxt) maxt = t; if (tmp->c_precision > maxpr) maxpr = tmp->c_precision; ct[flssbt(t)]++; cpr[flssbt(tmp->c_precision)]++; } if (c > maxc) maxc = c; ccpbk[fls(c + c / 2)]++; count += c; } CC_UNLOCK(cc); #ifdef SMP } #endif for (i = 0, tcum = 0; i < 64 && tcum < count / 2; i++) tcum += ct[i]; medt = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0; for (i = 0, pcum = 0; i < 64 && pcum < count / 2; i++) pcum += cpr[i]; medpr = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0; for (i = 0, c = 0; i < 32 && c < count / 2; i++) c += ccpbk[i]; medc = (i >= 2) ? 
(1 << (i - 2)) : 0; printf("Scheduled callouts statistic snapshot:\n"); printf(" Callouts: %6d Buckets: %6d*%-3d Bucket size: 0.%06ds\n", count, callwheelsize, mp_ncpus, 1000000 >> CC_HASH_SHIFT); printf(" C/Bk: med %5d avg %6d.%06jd max %6d\n", medc, count / callwheelsize / mp_ncpus, (uint64_t)count * 1000000 / callwheelsize / mp_ncpus % 1000000, maxc); printf(" Time: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n", medt / SBT_1S, (medt & 0xffffffff) * 1000000 >> 32, (st / count) / 1000000, (st / count) % 1000000, maxt / SBT_1S, (maxt & 0xffffffff) * 1000000 >> 32); printf(" Prec: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n", medpr / SBT_1S, (medpr & 0xffffffff) * 1000000 >> 32, (spr / count) / 1000000, (spr / count) % 1000000, maxpr / SBT_1S, (maxpr & 0xffffffff) * 1000000 >> 32); printf(" Distribution: \tbuckets\t time\t tcum\t" " prec\t pcum\n"); for (i = 0, tcum = pcum = 0; i < 64; i++) { if (ct[i] == 0 && cpr[i] == 0) continue; t = (i != 0) ? (((sbintime_t)1) << (i - 1)) : 0; tcum += ct[i]; pcum += cpr[i]; printf(" %10jd.%06jds\t 2**%d\t%7d\t%7d\t%7d\t%7d\n", t / SBT_1S, (t & 0xffffffff) * 1000000 >> 32, i - 1 - (32 - CC_HASH_SHIFT), ct[i], tcum, cpr[i], pcum); } return (error); } SYSCTL_PROC(_kern, OID_AUTO, callout_stat, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_callout_stat, "I", "Dump immediate statistic snapshot of the scheduled callouts"); #ifdef DDB static void _show_callout(struct callout *c) { db_printf("callout %p\n", c); #define C_DB_PRINTF(f, e) db_printf(" %s = " f "\n", #e, c->e); db_printf(" &c_links = %p\n", &(c->c_links)); C_DB_PRINTF("%" PRId64, c_time); C_DB_PRINTF("%" PRId64, c_precision); C_DB_PRINTF("%p", c_arg); C_DB_PRINTF("%p", c_func); C_DB_PRINTF("%p", c_lock); C_DB_PRINTF("%#x", c_flags); C_DB_PRINTF("%#x", c_iflags); C_DB_PRINTF("%d", c_cpu); #undef C_DB_PRINTF } DB_SHOW_COMMAND(callout, db_show_callout) { if (!have_addr) { db_printf("usage: show callout \n"); return; } _show_callout((struct callout *)addr); } static void _show_last_callout(int cpu, int direct, const char *dirstr) { struct callout_cpu *cc; void *func, *arg; cc = CC_CPU(cpu); func = cc_exec_last_func(cc, direct); arg = cc_exec_last_arg(cc, direct); db_printf("cpu %d last%s callout function: %p ", cpu, dirstr, func); db_printsym((db_expr_t)func, DB_STGY_ANY); db_printf("\ncpu %d last%s callout argument: %p\n", cpu, dirstr, arg); } DB_SHOW_COMMAND(callout_last, db_show_callout_last) { int cpu, last; if (have_addr) { if (addr < 0 || addr > mp_maxid || CPU_ABSENT(addr)) { db_printf("no such cpu: %d\n", (int)addr); return; } cpu = last = addr; } else { cpu = 0; last = mp_maxid; } while (cpu <= last) { if (!CPU_ABSENT(cpu)) { _show_last_callout(cpu, 0, ""); _show_last_callout(cpu, 1, " direct"); } cpu++; } } #endif /* DDB */ Index: projects/clang900-import/sys/kern/subr_lock.c =================================================================== --- projects/clang900-import/sys/kern/subr_lock.c (revision 352586) +++ projects/clang900-import/sys/kern/subr_lock.c (revision 352587) @@ -1,703 +1,698 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006 John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This module holds the global variables and functions used to maintain * lock_object structures. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_mprof.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include SDT_PROVIDER_DEFINE(lock); SDT_PROBE_DEFINE1(lock, , , starvation, "u_int"); CTASSERT(LOCK_CLASS_MAX == 15); struct lock_class *lock_classes[LOCK_CLASS_MAX + 1] = { &lock_class_mtx_spin, &lock_class_mtx_sleep, &lock_class_sx, &lock_class_rm, &lock_class_rm_sleepable, &lock_class_rw, &lock_class_lockmgr, }; void lock_init(struct lock_object *lock, struct lock_class *class, const char *name, const char *type, int flags) { int i; /* Check for double-init and zero object. */ KASSERT(flags & LO_NEW || !lock_initialized(lock), ("lock \"%s\" %p already initialized", name, lock)); /* Look up lock class to find its index. */ for (i = 0; i < LOCK_CLASS_MAX; i++) if (lock_classes[i] == class) { lock->lo_flags = i << LO_CLASSSHIFT; break; } KASSERT(i < LOCK_CLASS_MAX, ("unknown lock class %p", class)); /* Initialize the lock object. */ lock->lo_name = name; lock->lo_flags |= flags | LO_INITIALIZED; LOCK_LOG_INIT(lock, 0); WITNESS_INIT(lock, (type != NULL) ? 
type : name); } void lock_destroy(struct lock_object *lock) { KASSERT(lock_initialized(lock), ("lock %p is not initialized", lock)); WITNESS_DESTROY(lock); LOCK_LOG_DESTROY(lock, 0); lock->lo_flags &= ~LO_INITIALIZED; } static SYSCTL_NODE(_debug, OID_AUTO, lock, CTLFLAG_RD, NULL, "lock debugging"); static SYSCTL_NODE(_debug_lock, OID_AUTO, delay, CTLFLAG_RD, NULL, "lock delay"); static u_int __read_mostly starvation_limit = 131072; SYSCTL_INT(_debug_lock_delay, OID_AUTO, starvation_limit, CTLFLAG_RW, &starvation_limit, 0, ""); static u_int __read_mostly restrict_starvation = 0; SYSCTL_INT(_debug_lock_delay, OID_AUTO, restrict_starvation, CTLFLAG_RW, &restrict_starvation, 0, ""); void lock_delay(struct lock_delay_arg *la) { struct lock_delay_config *lc = la->config; u_int i; la->delay <<= 1; if (__predict_false(la->delay > lc->max)) la->delay = lc->max; for (i = la->delay; i > 0; i--) cpu_spinwait(); la->spin_cnt += la->delay; if (__predict_false(la->spin_cnt > starvation_limit)) { SDT_PROBE1(lock, , , starvation, la->delay); if (restrict_starvation) la->delay = lc->base; } } static u_int lock_roundup_2(u_int val) { u_int res; for (res = 1; res <= val; res <<= 1) continue; return (res); } void lock_delay_default_init(struct lock_delay_config *lc) { lc->base = 1; lc->max = lock_roundup_2(mp_ncpus) * 256; if (lc->max > 32678) lc->max = 32678; } #ifdef DDB DB_SHOW_COMMAND(lock, db_show_lock) { struct lock_object *lock; struct lock_class *class; if (!have_addr) return; lock = (struct lock_object *)addr; if (LO_CLASSINDEX(lock) > LOCK_CLASS_MAX) { db_printf("Unknown lock class: %d\n", LO_CLASSINDEX(lock)); return; } class = LOCK_CLASS(lock); db_printf(" class: %s\n", class->lc_name); db_printf(" name: %s\n", lock->lo_name); class->lc_ddb_show(lock); } #endif #ifdef LOCK_PROFILING /* * One object per-thread for each lock the thread owns. Tracks individual * lock instances. */ struct lock_profile_object { LIST_ENTRY(lock_profile_object) lpo_link; struct lock_object *lpo_obj; const char *lpo_file; int lpo_line; uint16_t lpo_ref; uint16_t lpo_cnt; uint64_t lpo_acqtime; uint64_t lpo_waittime; u_int lpo_contest_locking; }; /* * One lock_prof for each (file, line, lock object) triple. */ struct lock_prof { SLIST_ENTRY(lock_prof) link; struct lock_class *class; const char *file; const char *name; int line; int ticks; uintmax_t cnt_wait_max; uintmax_t cnt_max; uintmax_t cnt_tot; uintmax_t cnt_wait; uintmax_t cnt_cur; uintmax_t cnt_contest_locking; }; SLIST_HEAD(lphead, lock_prof); #define LPROF_HASH_SIZE 4096 #define LPROF_HASH_MASK (LPROF_HASH_SIZE - 1) #define LPROF_CACHE_SIZE 4096 /* * Array of objects and profs for each type of object for each cpu. Spinlocks * are handled separately because a thread may be preempted and acquire a * spinlock while in the lock profiling code of a non-spinlock. In this way * we only need a critical section to protect the per-cpu lists. */ struct lock_prof_type { struct lphead lpt_lpalloc; struct lpohead lpt_lpoalloc; struct lphead lpt_hash[LPROF_HASH_SIZE]; struct lock_prof lpt_prof[LPROF_CACHE_SIZE]; struct lock_profile_object lpt_objs[LPROF_CACHE_SIZE]; }; struct lock_prof_cpu { struct lock_prof_type lpc_types[2]; /* One for spin one for other. 
*/ }; struct lock_prof_cpu *lp_cpu[MAXCPU]; volatile int __read_mostly lock_prof_enable; static volatile int lock_prof_resetting; #define LPROF_SBUF_SIZE 256 static int lock_prof_rejected; static int lock_prof_skipspin; static int lock_prof_skipcount; #ifndef USE_CPU_NANOSECONDS uint64_t nanoseconds(void) { struct bintime bt; uint64_t ns; binuptime(&bt); /* From bintime2timespec */ ns = bt.sec * (uint64_t)1000000000; ns += ((uint64_t)1000000000 * (uint32_t)(bt.frac >> 32)) >> 32; return (ns); } #endif static void lock_prof_init_type(struct lock_prof_type *type) { int i; SLIST_INIT(&type->lpt_lpalloc); LIST_INIT(&type->lpt_lpoalloc); for (i = 0; i < LPROF_CACHE_SIZE; i++) { SLIST_INSERT_HEAD(&type->lpt_lpalloc, &type->lpt_prof[i], link); LIST_INSERT_HEAD(&type->lpt_lpoalloc, &type->lpt_objs[i], lpo_link); } } static void lock_prof_init(void *arg) { int cpu; - for (cpu = 0; cpu <= mp_maxid; cpu++) { + CPU_FOREACH(cpu) { lp_cpu[cpu] = malloc(sizeof(*lp_cpu[cpu]), M_DEVBUF, M_WAITOK | M_ZERO); lock_prof_init_type(&lp_cpu[cpu]->lpc_types[0]); lock_prof_init_type(&lp_cpu[cpu]->lpc_types[1]); } } SYSINIT(lockprof, SI_SUB_SMP, SI_ORDER_ANY, lock_prof_init, NULL); static void lock_prof_reset_wait(void) { /* * Spin relinquishing our cpu so that quiesce_all_cpus may * complete. */ while (lock_prof_resetting) sched_relinquish(curthread); } static void lock_prof_reset(void) { struct lock_prof_cpu *lpc; int enabled, i, cpu; /* * We not only race with acquiring and releasing locks but also * thread exit. To be certain that threads exit without valid head * pointers they must see resetting set before enabled is cleared. * Otherwise a lock may not be removed from a per-thread list due * to disabled being set but not wait for reset() to remove it below. */ atomic_store_rel_int(&lock_prof_resetting, 1); enabled = lock_prof_enable; lock_prof_enable = 0; quiesce_all_cpus("profreset", 0); /* * Some objects may have migrated between CPUs. Clear all links * before we zero the structures. Some items may still be linked * into per-thread lists as well. */ - for (cpu = 0; cpu <= mp_maxid; cpu++) { + CPU_FOREACH(cpu) { lpc = lp_cpu[cpu]; for (i = 0; i < LPROF_CACHE_SIZE; i++) { LIST_REMOVE(&lpc->lpc_types[0].lpt_objs[i], lpo_link); LIST_REMOVE(&lpc->lpc_types[1].lpt_objs[i], lpo_link); } } - for (cpu = 0; cpu <= mp_maxid; cpu++) { + CPU_FOREACH(cpu) { lpc = lp_cpu[cpu]; bzero(lpc, sizeof(*lpc)); lock_prof_init_type(&lpc->lpc_types[0]); lock_prof_init_type(&lpc->lpc_types[1]); } atomic_store_rel_int(&lock_prof_resetting, 0); lock_prof_enable = enabled; } static void lock_prof_output(struct lock_prof *lp, struct sbuf *sb) { const char *p; for (p = lp->file; p != NULL && strncmp(p, "../", 3) == 0; p += 3); sbuf_printf(sb, "%8ju %9ju %11ju %11ju %11ju %6ju %6ju %2ju %6ju %s:%d (%s:%s)\n", lp->cnt_max / 1000, lp->cnt_wait_max / 1000, lp->cnt_tot / 1000, lp->cnt_wait / 1000, lp->cnt_cur, lp->cnt_cur == 0 ? (uintmax_t)0 : lp->cnt_tot / (lp->cnt_cur * 1000), lp->cnt_cur == 0 ? 
(uintmax_t)0 : lp->cnt_wait / (lp->cnt_cur * 1000), (uintmax_t)0, lp->cnt_contest_locking, p, lp->line, lp->class->lc_name, lp->name); } static void lock_prof_sum(struct lock_prof *match, struct lock_prof *dst, int hash, int spin, int t) { struct lock_prof_type *type; struct lock_prof *l; int cpu; dst->file = match->file; dst->line = match->line; dst->class = match->class; dst->name = match->name; - for (cpu = 0; cpu <= mp_maxid; cpu++) { - if (lp_cpu[cpu] == NULL) - continue; + CPU_FOREACH(cpu) { type = &lp_cpu[cpu]->lpc_types[spin]; SLIST_FOREACH(l, &type->lpt_hash[hash], link) { if (l->ticks == t) continue; if (l->file != match->file || l->line != match->line || l->name != match->name) continue; l->ticks = t; if (l->cnt_max > dst->cnt_max) dst->cnt_max = l->cnt_max; if (l->cnt_wait_max > dst->cnt_wait_max) dst->cnt_wait_max = l->cnt_wait_max; dst->cnt_tot += l->cnt_tot; dst->cnt_wait += l->cnt_wait; dst->cnt_cur += l->cnt_cur; dst->cnt_contest_locking += l->cnt_contest_locking; } } - } static void lock_prof_type_stats(struct lock_prof_type *type, struct sbuf *sb, int spin, int t) { struct lock_prof *l; int i; for (i = 0; i < LPROF_HASH_SIZE; ++i) { SLIST_FOREACH(l, &type->lpt_hash[i], link) { struct lock_prof lp = {}; if (l->ticks == t) continue; lock_prof_sum(l, &lp, i, spin, t); lock_prof_output(&lp, sb); } } } static int dump_lock_prof_stats(SYSCTL_HANDLER_ARGS) { struct sbuf *sb; int error, cpu, t; int enabled; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, LPROF_SBUF_SIZE, req); sbuf_printf(sb, "\n%8s %9s %11s %11s %11s %6s %6s %2s %6s %s\n", "max", "wait_max", "total", "wait_total", "count", "avg", "wait_avg", "cnt_hold", "cnt_lock", "name"); enabled = lock_prof_enable; lock_prof_enable = 0; quiesce_all_cpus("profstat", 0); t = ticks; - for (cpu = 0; cpu <= mp_maxid; cpu++) { - if (lp_cpu[cpu] == NULL) - continue; + CPU_FOREACH(cpu) { lock_prof_type_stats(&lp_cpu[cpu]->lpc_types[0], sb, 0, t); lock_prof_type_stats(&lp_cpu[cpu]->lpc_types[1], sb, 1, t); } lock_prof_enable = enabled; error = sbuf_finish(sb); /* Output a trailing NUL. 
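 * String sysctl buffers are expected to be NUL-terminated and the * drained sbuf contents are not, so one extra byte is pushed out here.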
*/ if (error == 0) error = SYSCTL_OUT(req, "", 1); sbuf_delete(sb); return (error); } static int enable_lock_prof(SYSCTL_HANDLER_ARGS) { int error, v; v = lock_prof_enable; error = sysctl_handle_int(oidp, &v, v, req); if (error) return (error); if (req->newptr == NULL) return (error); if (v == lock_prof_enable) return (0); if (v == 1) lock_prof_reset(); lock_prof_enable = !!v; return (0); } static int reset_lock_prof_stats(SYSCTL_HANDLER_ARGS) { int error, v; v = 0; error = sysctl_handle_int(oidp, &v, 0, req); if (error) return (error); if (req->newptr == NULL) return (error); if (v == 0) return (0); lock_prof_reset(); return (0); } static struct lock_prof * lock_profile_lookup(struct lock_object *lo, int spin, const char *file, int line) { const char *unknown = "(unknown)"; struct lock_prof_type *type; struct lock_prof *lp; struct lphead *head; const char *p; u_int hash; p = file; if (p == NULL || *p == '\0') p = unknown; hash = (uintptr_t)lo->lo_name * 31 + (uintptr_t)p * 31 + line; hash &= LPROF_HASH_MASK; type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin]; head = &type->lpt_hash[hash]; SLIST_FOREACH(lp, head, link) { if (lp->line == line && lp->file == p && lp->name == lo->lo_name) return (lp); } lp = SLIST_FIRST(&type->lpt_lpalloc); if (lp == NULL) { lock_prof_rejected++; return (lp); } SLIST_REMOVE_HEAD(&type->lpt_lpalloc, link); lp->file = p; lp->line = line; lp->class = LOCK_CLASS(lo); lp->name = lo->lo_name; SLIST_INSERT_HEAD(&type->lpt_hash[hash], lp, link); return (lp); } static struct lock_profile_object * lock_profile_object_lookup(struct lock_object *lo, int spin, const char *file, int line) { struct lock_profile_object *l; struct lock_prof_type *type; struct lpohead *head; head = &curthread->td_lprof[spin]; LIST_FOREACH(l, head, lpo_link) if (l->lpo_obj == lo && l->lpo_file == file && l->lpo_line == line) return (l); type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin]; l = LIST_FIRST(&type->lpt_lpoalloc); if (l == NULL) { lock_prof_rejected++; return (NULL); } LIST_REMOVE(l, lpo_link); l->lpo_obj = lo; l->lpo_file = file; l->lpo_line = line; l->lpo_cnt = 0; LIST_INSERT_HEAD(head, l, lpo_link); return (l); } void lock_profile_obtain_lock_success(struct lock_object *lo, int contested, uint64_t waittime, const char *file, int line) { static int lock_prof_count; struct lock_profile_object *l; int spin; if (SCHEDULER_STOPPED()) return; /* don't reset the timer when/if recursing */ if (!lock_prof_enable || (lo->lo_flags & LO_NOPROFILE)) return; if (lock_prof_skipcount && (++lock_prof_count % lock_prof_skipcount) != 0) return; spin = (LOCK_CLASS(lo)->lc_flags & LC_SPINLOCK) ? 1 : 0; if (spin && lock_prof_skipspin == 1) return; critical_enter(); /* Recheck enabled now that we're in a critical section. */ if (lock_prof_enable == 0) goto out; l = lock_profile_object_lookup(lo, spin, file, line); if (l == NULL) goto out; l->lpo_cnt++; if (++l->lpo_ref > 1) goto out; l->lpo_contest_locking = contested; l->lpo_acqtime = nanoseconds(); if (waittime && (l->lpo_acqtime > waittime)) l->lpo_waittime = l->lpo_acqtime - waittime; else l->lpo_waittime = 0; out: critical_exit(); } void lock_profile_thread_exit(struct thread *td) { #ifdef INVARIANTS struct lock_profile_object *l; MPASS(curthread->td_critnest == 0); #endif /* * If lock profiling was disabled we have to wait for reset to * clear our pointers before we can exit safely. 
*/ lock_prof_reset_wait(); #ifdef INVARIANTS LIST_FOREACH(l, &td->td_lprof[0], lpo_link) printf("thread still holds lock acquired at %s:%d\n", l->lpo_file, l->lpo_line); LIST_FOREACH(l, &td->td_lprof[1], lpo_link) printf("thread still holds lock acquired at %s:%d\n", l->lpo_file, l->lpo_line); #endif MPASS(LIST_FIRST(&td->td_lprof[0]) == NULL); MPASS(LIST_FIRST(&td->td_lprof[1]) == NULL); } void lock_profile_release_lock(struct lock_object *lo) { struct lock_profile_object *l; struct lock_prof_type *type; struct lock_prof *lp; uint64_t curtime, holdtime; struct lpohead *head; int spin; if (SCHEDULER_STOPPED()) return; if (lo->lo_flags & LO_NOPROFILE) return; spin = (LOCK_CLASS(lo)->lc_flags & LC_SPINLOCK) ? 1 : 0; head = &curthread->td_lprof[spin]; if (LIST_FIRST(head) == NULL) return; critical_enter(); /* Recheck enabled now that we're in a critical section. */ if (lock_prof_enable == 0 && lock_prof_resetting == 1) goto out; /* * If lock profiling is not enabled we still want to remove the * lpo from our queue. */ LIST_FOREACH(l, head, lpo_link) if (l->lpo_obj == lo) break; if (l == NULL) goto out; if (--l->lpo_ref > 0) goto out; lp = lock_profile_lookup(lo, spin, l->lpo_file, l->lpo_line); if (lp == NULL) goto release; curtime = nanoseconds(); if (curtime < l->lpo_acqtime) goto release; holdtime = curtime - l->lpo_acqtime; /* * Record if the lock has been held longer now than ever * before. */ if (holdtime > lp->cnt_max) lp->cnt_max = holdtime; if (l->lpo_waittime > lp->cnt_wait_max) lp->cnt_wait_max = l->lpo_waittime; lp->cnt_tot += holdtime; lp->cnt_wait += l->lpo_waittime; lp->cnt_contest_locking += l->lpo_contest_locking; lp->cnt_cur += l->lpo_cnt; release: LIST_REMOVE(l, lpo_link); type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin]; LIST_INSERT_HEAD(&type->lpt_lpoalloc, l, lpo_link); out: critical_exit(); } static SYSCTL_NODE(_debug_lock, OID_AUTO, prof, CTLFLAG_RD, NULL, "lock profiling"); SYSCTL_INT(_debug_lock_prof, OID_AUTO, skipspin, CTLFLAG_RW, &lock_prof_skipspin, 0, "Skip profiling on spinlocks."); SYSCTL_INT(_debug_lock_prof, OID_AUTO, skipcount, CTLFLAG_RW, &lock_prof_skipcount, 0, "Sample approximately every N lock acquisitions."); SYSCTL_INT(_debug_lock_prof, OID_AUTO, rejected, CTLFLAG_RD, &lock_prof_rejected, 0, "Number of rejected profiling records"); SYSCTL_PROC(_debug_lock_prof, OID_AUTO, stats, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, dump_lock_prof_stats, "A", "Lock profiling statistics"); SYSCTL_PROC(_debug_lock_prof, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_RW, NULL, 0, reset_lock_prof_stats, "I", "Reset lock profiling statistics"); SYSCTL_PROC(_debug_lock_prof, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW, NULL, 0, enable_lock_prof, "I", "Enable lock profiling"); #endif Index: projects/clang900-import/sys/kern/uipc_ktls.c =================================================================== --- projects/clang900-import/sys/kern/uipc_ktls.c (revision 352586) +++ projects/clang900-import/sys/kern/uipc_ktls.c (revision 352587) @@ -1,1450 +1,1450 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2014-2019 Netflix Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__) #include #endif #include #ifdef RSS #include #include #endif #if defined(INET) || defined(INET6) #include #include #endif #include #include #include #include #include #include struct ktls_wq { struct mtx mtx; STAILQ_HEAD(, mbuf_ext_pgs) head; bool running; } __aligned(CACHE_LINE_SIZE); static struct ktls_wq *ktls_wq; static struct proc *ktls_proc; LIST_HEAD(, ktls_crypto_backend) ktls_backends; static struct rmlock ktls_backends_lock; static uma_zone_t ktls_session_zone; static uint16_t ktls_cpuid_lookup[MAXCPU]; SYSCTL_NODE(_kern_ipc, OID_AUTO, tls, CTLFLAG_RW, 0, "Kernel TLS offload"); SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, stats, CTLFLAG_RW, 0, "Kernel TLS offload stats"); static int ktls_allow_unload; SYSCTL_INT(_kern_ipc_tls, OID_AUTO, allow_unload, CTLFLAG_RDTUN, &ktls_allow_unload, 0, "Allow software crypto modules to unload"); #ifdef RSS static int ktls_bind_threads = 1; #else static int ktls_bind_threads; #endif SYSCTL_INT(_kern_ipc_tls, OID_AUTO, bind_threads, CTLFLAG_RDTUN, &ktls_bind_threads, 0, "Bind crypto threads to cores or domains at boot"); static u_int ktls_maxlen = 16384; SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, maxlen, CTLFLAG_RWTUN, &ktls_maxlen, 0, "Maximum TLS record size"); static int ktls_number_threads; SYSCTL_INT(_kern_ipc_tls_stats, OID_AUTO, threads, CTLFLAG_RD, &ktls_number_threads, 0, "Number of TLS threads in thread-pool"); static bool ktls_offload_enable; SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, enable, CTLFLAG_RW, &ktls_offload_enable, 0, "Enable support for kernel TLS offload"); static bool ktls_cbc_enable = true; SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, cbc_enable, CTLFLAG_RW, &ktls_cbc_enable, 1, "Enable Support of AES-CBC crypto for kernel TLS"); static counter_u64_t ktls_tasks_active; SYSCTL_COUNTER_U64(_kern_ipc_tls, OID_AUTO, tasks_active, CTLFLAG_RD, &ktls_tasks_active, "Number of active tasks"); static counter_u64_t ktls_cnt_on; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, so_inqueue, CTLFLAG_RD, &ktls_cnt_on, "Number of TLS records in queue to tasks for SW crypto"); static counter_u64_t ktls_offload_total; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, offload_total, CTLFLAG_RD, &ktls_offload_total, "Total successful TLS setups (parameters set)"); static counter_u64_t 
ktls_offload_enable_calls; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, enable_calls, CTLFLAG_RD, &ktls_offload_enable_calls, "Total number of TLS enable calls made"); static counter_u64_t ktls_offload_active; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, active, CTLFLAG_RD, &ktls_offload_active, "Total Active TLS sessions"); static counter_u64_t ktls_offload_failed_crypto; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, failed_crypto, CTLFLAG_RD, &ktls_offload_failed_crypto, "Total TLS crypto failures"); static counter_u64_t ktls_switch_to_ifnet; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_ifnet, CTLFLAG_RD, &ktls_switch_to_ifnet, "TLS sessions switched from SW to ifnet"); static counter_u64_t ktls_switch_to_sw; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_sw, CTLFLAG_RD, &ktls_switch_to_sw, "TLS sessions switched from ifnet to SW"); static counter_u64_t ktls_switch_failed; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_failed, CTLFLAG_RD, &ktls_switch_failed, "TLS sessions unable to switch between SW and ifnet"); SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, sw, CTLFLAG_RD, 0, "Software TLS session stats"); SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, ifnet, CTLFLAG_RD, 0, "Hardware (ifnet) TLS session stats"); static counter_u64_t ktls_sw_cbc; SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, cbc, CTLFLAG_RD, &ktls_sw_cbc, "Active number of software TLS sessions using AES-CBC"); static counter_u64_t ktls_sw_gcm; SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, gcm, CTLFLAG_RD, &ktls_sw_gcm, "Active number of software TLS sessions using AES-GCM"); static counter_u64_t ktls_ifnet_cbc; SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, cbc, CTLFLAG_RD, &ktls_ifnet_cbc, "Active number of ifnet TLS sessions using AES-CBC"); static counter_u64_t ktls_ifnet_gcm; SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, gcm, CTLFLAG_RD, &ktls_ifnet_gcm, "Active number of ifnet TLS sessions using AES-GCM"); static counter_u64_t ktls_ifnet_reset; SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset, CTLFLAG_RD, &ktls_ifnet_reset, "TLS sessions updated to a new ifnet send tag"); static counter_u64_t ktls_ifnet_reset_dropped; SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_dropped, CTLFLAG_RD, &ktls_ifnet_reset_dropped, "TLS sessions dropped after failing to update ifnet send tag"); static counter_u64_t ktls_ifnet_reset_failed; SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_failed, CTLFLAG_RD, &ktls_ifnet_reset_failed, "TLS sessions that failed to allocate a new ifnet send tag"); static int ktls_ifnet_permitted; SYSCTL_UINT(_kern_ipc_tls_ifnet, OID_AUTO, permitted, CTLFLAG_RWTUN, &ktls_ifnet_permitted, 1, "Whether to permit hardware (ifnet) TLS sessions"); static MALLOC_DEFINE(M_KTLS, "ktls", "Kernel TLS"); static void ktls_cleanup(struct ktls_session *tls); #if defined(INET) || defined(INET6) static void ktls_reset_send_tag(void *context, int pending); #endif static void ktls_work_thread(void *ctx); int ktls_crypto_backend_register(struct ktls_crypto_backend *be) { struct ktls_crypto_backend *curr_be, *tmp; if (be->api_version != KTLS_API_VERSION) { printf("KTLS: API version mismatch (%d vs %d) for %s\n", be->api_version, KTLS_API_VERSION, be->name); return (EINVAL); } rm_wlock(&ktls_backends_lock); printf("KTLS: Registering crypto method %s with prio %d\n", be->name, be->prio); if (LIST_EMPTY(&ktls_backends)) { LIST_INSERT_HEAD(&ktls_backends, be, next); } else { LIST_FOREACH_SAFE(curr_be, &ktls_backends, next, tmp) { if (curr_be->prio < be->prio) { 
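/* * Keep the backend list sorted by descending priority: the new * backend goes in front of the first existing one with a lower prio. */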
LIST_INSERT_BEFORE(curr_be, be, next); break; } if (LIST_NEXT(curr_be, next) == NULL) { LIST_INSERT_AFTER(curr_be, be, next); break; } } } rm_wunlock(&ktls_backends_lock); return (0); } int ktls_crypto_backend_deregister(struct ktls_crypto_backend *be) { struct ktls_crypto_backend *tmp; /* * Don't error if the backend isn't registered. This permits * MOD_UNLOAD handlers to use this function unconditionally. */ rm_wlock(&ktls_backends_lock); LIST_FOREACH(tmp, &ktls_backends, next) { if (tmp == be) break; } if (tmp == NULL) { rm_wunlock(&ktls_backends_lock); return (0); } if (!ktls_allow_unload) { rm_wunlock(&ktls_backends_lock); printf( "KTLS: Deregistering crypto method %s is not supported\n", be->name); return (EBUSY); } if (be->use_count) { rm_wunlock(&ktls_backends_lock); return (EBUSY); } LIST_REMOVE(be, next); rm_wunlock(&ktls_backends_lock); return (0); } #if defined(INET) || defined(INET6) static uint16_t ktls_get_cpu(struct socket *so) { struct inpcb *inp; uint16_t cpuid; inp = sotoinpcb(so); #ifdef RSS cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); if (cpuid != NETISR_CPUID_NONE) return (cpuid); #endif /* * Just use the flowid to shard connections in a repeatable * fashion. Note that some crypto backends rely on the * serialization provided by having the same connection use * the same queue. */ cpuid = ktls_cpuid_lookup[inp->inp_flowid % ktls_number_threads]; return (cpuid); } #endif static void ktls_init(void *dummy __unused) { struct thread *td; struct pcpu *pc; cpuset_t mask; int error, i; ktls_tasks_active = counter_u64_alloc(M_WAITOK); ktls_cnt_on = counter_u64_alloc(M_WAITOK); ktls_offload_total = counter_u64_alloc(M_WAITOK); ktls_offload_enable_calls = counter_u64_alloc(M_WAITOK); ktls_offload_active = counter_u64_alloc(M_WAITOK); ktls_offload_failed_crypto = counter_u64_alloc(M_WAITOK); ktls_switch_to_ifnet = counter_u64_alloc(M_WAITOK); ktls_switch_to_sw = counter_u64_alloc(M_WAITOK); ktls_switch_failed = counter_u64_alloc(M_WAITOK); ktls_sw_cbc = counter_u64_alloc(M_WAITOK); ktls_sw_gcm = counter_u64_alloc(M_WAITOK); ktls_ifnet_cbc = counter_u64_alloc(M_WAITOK); ktls_ifnet_gcm = counter_u64_alloc(M_WAITOK); ktls_ifnet_reset = counter_u64_alloc(M_WAITOK); ktls_ifnet_reset_dropped = counter_u64_alloc(M_WAITOK); ktls_ifnet_reset_failed = counter_u64_alloc(M_WAITOK); rm_init(&ktls_backends_lock, "ktls backends"); LIST_INIT(&ktls_backends); ktls_wq = malloc(sizeof(*ktls_wq) * (mp_maxid + 1), M_KTLS, M_WAITOK | M_ZERO); ktls_session_zone = uma_zcreate("ktls_session", sizeof(struct ktls_session), #ifdef INVARIANTS trash_ctor, trash_dtor, trash_init, trash_fini, #else NULL, NULL, NULL, NULL, #endif UMA_ALIGN_CACHE, 0); /* * Initialize the workqueues to run the TLS work. We create a * work queue for each CPU. */ CPU_FOREACH(i) { STAILQ_INIT(&ktls_wq[i].head); mtx_init(&ktls_wq[i].mtx, "ktls work queue", NULL, MTX_DEF); error = kproc_kthread_add(ktls_work_thread, &ktls_wq[i], - &ktls_proc, &td, 0, 0, "KTLS", "ktls_thr_%d", i); + &ktls_proc, &td, 0, 0, "KTLS", "thr_%d", i); if (error) panic("Can't add KTLS thread %d error %d", i, error); /* * Bind threads to cores. If ktls_bind_threads is > * 1, then we bind to the NUMA domain. 
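[Editor's illustrative sketch, not part of the diff: how ktls_get_cpu() above maps a connection onto one of the per-CPU work queues in the non-RSS case. The names and the fixed-size table are hypothetical; the point is that the mapping depends only on the connection's flowid, so a given connection is always serviced by the same worker thread, which is the serialization the software crypto backends rely on.]

#include <stdint.h>

#define EXAMPLE_MAXCPU          128

static uint16_t example_cpuid_lookup[EXAMPLE_MAXCPU];  /* filled at init time */
static int      example_nthreads;                      /* worker thread count, > 0 */

/* Repeatable flowid -> worker CPU mapping, in the spirit of ktls_get_cpu(). */
static uint16_t
example_pick_worker(uint32_t flowid)
{
        return (example_cpuid_lookup[flowid % example_nthreads]);
}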
*/ if (ktls_bind_threads) { if (ktls_bind_threads > 1) { pc = pcpu_find(i); CPU_COPY(&cpuset_domain[pc->pc_domain], &mask); } else { CPU_SETOF(i, &mask); } error = cpuset_setthread(td->td_tid, &mask); if (error) panic( "Unable to bind KTLS thread for CPU %d error %d", i, error); } ktls_cpuid_lookup[ktls_number_threads] = i; ktls_number_threads++; } printf("KTLS: Initialized %d threads\n", ktls_number_threads); } SYSINIT(ktls, SI_SUB_SMP + 1, SI_ORDER_ANY, ktls_init, NULL); #if defined(INET) || defined(INET6) static int ktls_create_session(struct socket *so, struct tls_enable *en, struct ktls_session **tlsp) { struct ktls_session *tls; int error; /* Only TLS 1.0 - 1.2 are supported. */ if (en->tls_vmajor != TLS_MAJOR_VER_ONE) return (EINVAL); if (en->tls_vminor < TLS_MINOR_VER_ZERO || en->tls_vminor > TLS_MINOR_VER_TWO) return (EINVAL); if (en->auth_key_len < 0 || en->auth_key_len > TLS_MAX_PARAM_SIZE) return (EINVAL); if (en->cipher_key_len < 0 || en->cipher_key_len > TLS_MAX_PARAM_SIZE) return (EINVAL); if (en->iv_len < 0 || en->iv_len > TLS_MAX_PARAM_SIZE) return (EINVAL); /* All supported algorithms require a cipher key. */ if (en->cipher_key_len == 0) return (EINVAL); /* No flags are currently supported. */ if (en->flags != 0) return (EINVAL); /* Common checks for supported algorithms. */ switch (en->cipher_algorithm) { case CRYPTO_AES_NIST_GCM_16: /* * auth_algorithm isn't used, but permit GMAC values * for compatibility. */ switch (en->auth_algorithm) { case 0: case CRYPTO_AES_128_NIST_GMAC: case CRYPTO_AES_192_NIST_GMAC: case CRYPTO_AES_256_NIST_GMAC: break; default: return (EINVAL); } if (en->auth_key_len != 0) return (EINVAL); if (en->iv_len != TLS_AEAD_GCM_LEN) return (EINVAL); break; case CRYPTO_AES_CBC: switch (en->auth_algorithm) { case CRYPTO_SHA1_HMAC: /* * TLS 1.0 requires an implicit IV. TLS 1.1+ * all use explicit IVs. */ if (en->tls_vminor == TLS_MINOR_VER_ZERO) { if (en->iv_len != TLS_CBC_IMPLICIT_IV_LEN) return (EINVAL); break; } /* FALLTHROUGH */ case CRYPTO_SHA2_256_HMAC: case CRYPTO_SHA2_384_HMAC: /* Ignore any supplied IV. */ en->iv_len = 0; break; default: return (EINVAL); } if (en->auth_key_len == 0) return (EINVAL); break; default: return (EINVAL); } tls = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO); counter_u64_add(ktls_offload_active, 1); refcount_init(&tls->refcount, 1); TASK_INIT(&tls->reset_tag_task, 0, ktls_reset_send_tag, tls); tls->wq_index = ktls_get_cpu(so); tls->params.cipher_algorithm = en->cipher_algorithm; tls->params.auth_algorithm = en->auth_algorithm; tls->params.tls_vmajor = en->tls_vmajor; tls->params.tls_vminor = en->tls_vminor; tls->params.flags = en->flags; tls->params.max_frame_len = min(TLS_MAX_MSG_SIZE_V10_2, ktls_maxlen); /* Set the header and trailer lengths. */ tls->params.tls_hlen = sizeof(struct tls_record_layer); switch (en->cipher_algorithm) { case CRYPTO_AES_NIST_GCM_16: tls->params.tls_hlen += 8; tls->params.tls_tlen = AES_GMAC_HASH_LEN; tls->params.tls_bs = 1; break; case CRYPTO_AES_CBC: switch (en->auth_algorithm) { case CRYPTO_SHA1_HMAC: if (en->tls_vminor == TLS_MINOR_VER_ZERO) { /* Implicit IV, no nonce. 
*/ } else { tls->params.tls_hlen += AES_BLOCK_LEN; } tls->params.tls_tlen = AES_BLOCK_LEN + SHA1_HASH_LEN; break; case CRYPTO_SHA2_256_HMAC: tls->params.tls_hlen += AES_BLOCK_LEN; tls->params.tls_tlen = AES_BLOCK_LEN + SHA2_256_HASH_LEN; break; case CRYPTO_SHA2_384_HMAC: tls->params.tls_hlen += AES_BLOCK_LEN; tls->params.tls_tlen = AES_BLOCK_LEN + SHA2_384_HASH_LEN; break; default: panic("invalid hmac"); } tls->params.tls_bs = AES_BLOCK_LEN; break; default: panic("invalid cipher"); } KASSERT(tls->params.tls_hlen <= MBUF_PEXT_HDR_LEN, ("TLS header length too long: %d", tls->params.tls_hlen)); KASSERT(tls->params.tls_tlen <= MBUF_PEXT_TRAIL_LEN, ("TLS trailer length too long: %d", tls->params.tls_tlen)); if (en->auth_key_len != 0) { tls->params.auth_key_len = en->auth_key_len; tls->params.auth_key = malloc(en->auth_key_len, M_KTLS, M_WAITOK); error = copyin(en->auth_key, tls->params.auth_key, en->auth_key_len); if (error) goto out; } tls->params.cipher_key_len = en->cipher_key_len; tls->params.cipher_key = malloc(en->cipher_key_len, M_KTLS, M_WAITOK); error = copyin(en->cipher_key, tls->params.cipher_key, en->cipher_key_len); if (error) goto out; /* * This holds the implicit portion of the nonce for GCM and * the initial implicit IV for TLS 1.0. The explicit portions * of the IV are generated in ktls_frame() and ktls_seq(). */ if (en->iv_len != 0) { MPASS(en->iv_len <= sizeof(tls->params.iv)); tls->params.iv_len = en->iv_len; error = copyin(en->iv, tls->params.iv, en->iv_len); if (error) goto out; } *tlsp = tls; return (0); out: ktls_cleanup(tls); return (error); } static struct ktls_session * ktls_clone_session(struct ktls_session *tls) { struct ktls_session *tls_new; tls_new = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO); counter_u64_add(ktls_offload_active, 1); refcount_init(&tls_new->refcount, 1); /* Copy fields from existing session. */ tls_new->params = tls->params; tls_new->wq_index = tls->wq_index; /* Deep copy keys. 
*/ if (tls_new->params.auth_key != NULL) { tls_new->params.auth_key = malloc(tls->params.auth_key_len, M_KTLS, M_WAITOK); memcpy(tls_new->params.auth_key, tls->params.auth_key, tls->params.auth_key_len); } tls_new->params.cipher_key = malloc(tls->params.cipher_key_len, M_KTLS, M_WAITOK); memcpy(tls_new->params.cipher_key, tls->params.cipher_key, tls->params.cipher_key_len); return (tls_new); } #endif static void ktls_cleanup(struct ktls_session *tls) { counter_u64_add(ktls_offload_active, -1); if (tls->free != NULL) { MPASS(tls->be != NULL); switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_sw_cbc, -1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_sw_gcm, -1); break; } tls->free(tls); } else if (tls->snd_tag != NULL) { switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_ifnet_cbc, -1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_ifnet_gcm, -1); break; } m_snd_tag_rele(tls->snd_tag); } if (tls->params.auth_key != NULL) { explicit_bzero(tls->params.auth_key, tls->params.auth_key_len); free(tls->params.auth_key, M_KTLS); tls->params.auth_key = NULL; tls->params.auth_key_len = 0; } if (tls->params.cipher_key != NULL) { explicit_bzero(tls->params.cipher_key, tls->params.cipher_key_len); free(tls->params.cipher_key, M_KTLS); tls->params.cipher_key = NULL; tls->params.cipher_key_len = 0; } explicit_bzero(tls->params.iv, sizeof(tls->params.iv)); } #if defined(INET) || defined(INET6) /* * Common code used when first enabling ifnet TLS on a connection or * when allocating a new ifnet TLS session due to a routing change. * This function allocates a new TLS send tag on whatever interface * the connection is currently routed over. */ static int ktls_alloc_snd_tag(struct inpcb *inp, struct ktls_session *tls, bool force, struct m_snd_tag **mstp) { union if_snd_tag_alloc_params params; struct ifnet *ifp; struct rtentry *rt; struct tcpcb *tp; int error; INP_RLOCK(inp); if (inp->inp_flags2 & INP_FREED) { INP_RUNLOCK(inp); return (ECONNRESET); } if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_RUNLOCK(inp); return (ECONNRESET); } if (inp->inp_socket == NULL) { INP_RUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); /* * Check administrative controls on ifnet TLS to determine if * ifnet TLS should be denied. * * - Always permit 'force' requests. * - ktls_ifnet_permitted == 0: always deny. */ if (!force && ktls_ifnet_permitted == 0) { INP_RUNLOCK(inp); return (ENXIO); } /* * XXX: Use the cached route in the inpcb to find the * interface. This should perhaps instead use * rtalloc1_fib(dst, 0, 0, fibnum). Since KTLS is only * enabled after a connection has completed key negotiation in * userland, the cached route will be present in practice. 
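[Editor's illustrative sketch, not part of the diff: the key-release idiom used by ktls_cleanup() above, shown stand-alone with plain malloc/free. explicit_bzero() is used instead of memset() so the scrub of key material cannot be optimized away; on FreeBSD it is declared in strings.h. The struct and helper names are hypothetical.]

#include <stdlib.h>
#include <strings.h>            /* explicit_bzero() on FreeBSD */

struct example_keys {
        void    *cipher_key;
        size_t   cipher_key_len;
};

static void
example_scrub_keys(struct example_keys *k)
{
        if (k->cipher_key != NULL) {
                /* Zero the secret before returning the memory. */
                explicit_bzero(k->cipher_key, k->cipher_key_len);
                free(k->cipher_key);
                k->cipher_key = NULL;
                k->cipher_key_len = 0;
        }
}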
*/ rt = inp->inp_route.ro_rt; if (rt == NULL || rt->rt_ifp == NULL) { INP_RUNLOCK(inp); return (ENXIO); } ifp = rt->rt_ifp; if_ref(ifp); params.hdr.type = IF_SND_TAG_TYPE_TLS; params.hdr.flowid = inp->inp_flowid; params.hdr.flowtype = inp->inp_flowtype; params.tls.inp = inp; params.tls.tls = tls; INP_RUNLOCK(inp); if (ifp->if_snd_tag_alloc == NULL) { error = EOPNOTSUPP; goto out; } if ((ifp->if_capenable & IFCAP_NOMAP) == 0) { error = EOPNOTSUPP; goto out; } if (inp->inp_vflag & INP_IPV6) { if ((ifp->if_capenable & IFCAP_TXTLS6) == 0) { error = EOPNOTSUPP; goto out; } } else { if ((ifp->if_capenable & IFCAP_TXTLS4) == 0) { error = EOPNOTSUPP; goto out; } } error = ifp->if_snd_tag_alloc(ifp, ¶ms, mstp); out: if_rele(ifp); return (error); } static int ktls_try_ifnet(struct socket *so, struct ktls_session *tls, bool force) { struct m_snd_tag *mst; int error; error = ktls_alloc_snd_tag(so->so_pcb, tls, force, &mst); if (error == 0) { tls->snd_tag = mst; switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_ifnet_cbc, 1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_ifnet_gcm, 1); break; } } return (error); } static int ktls_try_sw(struct socket *so, struct ktls_session *tls) { struct rm_priotracker prio; struct ktls_crypto_backend *be; /* * Choose the best software crypto backend. Backends are * stored in sorted priority order (larget value == most * important at the head of the list), so this just stops on * the first backend that claims the session by returning * success. */ if (ktls_allow_unload) rm_rlock(&ktls_backends_lock, &prio); LIST_FOREACH(be, &ktls_backends, next) { if (be->try(so, tls) == 0) break; KASSERT(tls->cipher == NULL, ("ktls backend leaked a cipher pointer")); } if (be != NULL) { if (ktls_allow_unload) be->use_count++; tls->be = be; } if (ktls_allow_unload) rm_runlock(&ktls_backends_lock, &prio); if (be == NULL) return (EOPNOTSUPP); switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_sw_cbc, 1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_sw_gcm, 1); break; } return (0); } int ktls_enable_tx(struct socket *so, struct tls_enable *en) { struct ktls_session *tls; int error; if (!ktls_offload_enable) return (ENOTSUP); counter_u64_add(ktls_offload_enable_calls, 1); /* * This should always be true since only the TCP socket option * invokes this function. */ if (so->so_proto->pr_protocol != IPPROTO_TCP) return (EINVAL); /* * XXX: Don't overwrite existing sessions. We should permit * this to support rekeying in the future. */ if (so->so_snd.sb_tls_info != NULL) return (EALREADY); if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable) return (ENOTSUP); /* TLS requires ext pgs */ if (mb_use_ext_pgs == 0) return (ENXIO); error = ktls_create_session(so, en, &tls); if (error) return (error); /* Prefer ifnet TLS over software TLS. 
*/ error = ktls_try_ifnet(so, tls, false); if (error) error = ktls_try_sw(so, tls); if (error) { ktls_cleanup(tls); return (error); } error = sblock(&so->so_snd, SBL_WAIT); if (error) { ktls_cleanup(tls); return (error); } SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_tls_info = tls; if (tls->sw_encrypt == NULL) so->so_snd.sb_flags |= SB_TLS_IFNET; SOCKBUF_UNLOCK(&so->so_snd); sbunlock(&so->so_snd); counter_u64_add(ktls_offload_total, 1); return (0); } int ktls_get_tx_mode(struct socket *so) { struct ktls_session *tls; struct inpcb *inp; int mode; inp = so->so_pcb; INP_WLOCK_ASSERT(inp); SOCKBUF_LOCK(&so->so_snd); tls = so->so_snd.sb_tls_info; if (tls == NULL) mode = TCP_TLS_MODE_NONE; else if (tls->sw_encrypt != NULL) mode = TCP_TLS_MODE_SW; else mode = TCP_TLS_MODE_IFNET; SOCKBUF_UNLOCK(&so->so_snd); return (mode); } /* * Switch between SW and ifnet TLS sessions as requested. */ int ktls_set_tx_mode(struct socket *so, int mode) { struct ktls_session *tls, *tls_new; struct inpcb *inp; int error; MPASS(mode == TCP_TLS_MODE_SW || mode == TCP_TLS_MODE_IFNET); inp = so->so_pcb; INP_WLOCK_ASSERT(inp); SOCKBUF_LOCK(&so->so_snd); tls = so->so_snd.sb_tls_info; if (tls == NULL) { SOCKBUF_UNLOCK(&so->so_snd); return (0); } if ((tls->sw_encrypt != NULL && mode == TCP_TLS_MODE_SW) || (tls->sw_encrypt == NULL && mode == TCP_TLS_MODE_IFNET)) { SOCKBUF_UNLOCK(&so->so_snd); return (0); } tls = ktls_hold(tls); SOCKBUF_UNLOCK(&so->so_snd); INP_WUNLOCK(inp); tls_new = ktls_clone_session(tls); if (mode == TCP_TLS_MODE_IFNET) error = ktls_try_ifnet(so, tls_new, true); else error = ktls_try_sw(so, tls_new); if (error) { counter_u64_add(ktls_switch_failed, 1); ktls_free(tls_new); ktls_free(tls); INP_WLOCK(inp); return (error); } error = sblock(&so->so_snd, SBL_WAIT); if (error) { counter_u64_add(ktls_switch_failed, 1); ktls_free(tls_new); ktls_free(tls); INP_WLOCK(inp); return (error); } /* * If we raced with another session change, keep the existing * session. */ if (tls != so->so_snd.sb_tls_info) { counter_u64_add(ktls_switch_failed, 1); sbunlock(&so->so_snd); ktls_free(tls_new); ktls_free(tls); INP_WLOCK(inp); return (EBUSY); } SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_tls_info = tls_new; if (tls_new->sw_encrypt == NULL) so->so_snd.sb_flags |= SB_TLS_IFNET; SOCKBUF_UNLOCK(&so->so_snd); sbunlock(&so->so_snd); /* * Drop two references on 'tls'. The first is for the * ktls_hold() above. The second drops the reference from the * socket buffer. */ KASSERT(tls->refcount >= 2, ("too few references on old session")); ktls_free(tls); ktls_free(tls); if (mode == TCP_TLS_MODE_IFNET) counter_u64_add(ktls_switch_to_ifnet, 1); else counter_u64_add(ktls_switch_to_sw, 1); INP_WLOCK(inp); return (0); } /* * Try to allocate a new TLS send tag. This task is scheduled when * ip_output detects a route change while trying to transmit a packet * holding a TLS record. If a new tag is allocated, replace the tag * in the TLS session. Subsequent packets on the connection will use * the new tag. If a new tag cannot be allocated, drop the * connection. */ static void ktls_reset_send_tag(void *context, int pending) { struct epoch_tracker et; struct ktls_session *tls; struct m_snd_tag *old, *new; struct inpcb *inp; struct tcpcb *tp; int error; MPASS(pending == 1); tls = context; inp = tls->inp; /* * Free the old tag first before allocating a new one. * ip[6]_output_send() will treat a NULL send tag the same as * an ifp mismatch and drop packets until a new tag is * allocated. 
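[Editor's illustrative sketch, not part of the diff: the lost-race check at the heart of ktls_set_tx_mode() above, reduced to its essentials. The socket buffer lock is dropped while the replacement session is built, so before installing it the old pointer must be re-checked under the lock; if another switch won the race, this one backs out. All names are hypothetical and locking appears only as comments.]

#include <errno.h>

struct example_session { int refs; };
struct example_sb { struct example_session *tls_info; };

/* Returns 0 on success, EBUSY if another session switch won the race. */
static int
example_install(struct example_sb *sb, struct example_session *oldtls,
    struct example_session *newtls)
{
        /* sb lock would be taken here */
        if (sb->tls_info != oldtls) {
                /* lost the race: unlock; caller frees 'newtls' and its hold on 'oldtls' */
                return (EBUSY);
        }
        sb->tls_info = newtls;
        /* unlock; caller now drops both references on 'oldtls' */
        return (0);
}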
* * Write-lock the INP when changing tls->snd_tag since * ip[6]_output_send() holds a read-lock when reading the * pointer. */ INP_WLOCK(inp); old = tls->snd_tag; tls->snd_tag = NULL; INP_WUNLOCK(inp); if (old != NULL) m_snd_tag_rele(old); error = ktls_alloc_snd_tag(inp, tls, true, &new); if (error == 0) { INP_WLOCK(inp); tls->snd_tag = new; mtx_pool_lock(mtxpool_sleep, tls); tls->reset_pending = false; mtx_pool_unlock(mtxpool_sleep, tls); if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); counter_u64_add(ktls_ifnet_reset, 1); /* * XXX: Should we kick tcp_output explicitly now that * the send tag is fixed or just rely on timers? */ } else { INP_INFO_RLOCK_ET(&V_tcbinfo, et); INP_WLOCK(inp); if (!in_pcbrele_wlocked(inp)) { if (!(inp->inp_flags & INP_TIMEWAIT) && !(inp->inp_flags & INP_DROPPED)) { tp = intotcpcb(inp); tp = tcp_drop(tp, ECONNABORTED); if (tp != NULL) INP_WUNLOCK(inp); counter_u64_add(ktls_ifnet_reset_dropped, 1); } else INP_WUNLOCK(inp); } INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); counter_u64_add(ktls_ifnet_reset_failed, 1); /* * Leave reset_pending true to avoid future tasks while * the socket goes away. */ } ktls_free(tls); } int ktls_output_eagain(struct inpcb *inp, struct ktls_session *tls) { if (inp == NULL) return (ENOBUFS); INP_LOCK_ASSERT(inp); /* * See if we should schedule a task to update the send tag for * this session. */ mtx_pool_lock(mtxpool_sleep, tls); if (!tls->reset_pending) { (void) ktls_hold(tls); in_pcbref(inp); tls->inp = inp; tls->reset_pending = true; taskqueue_enqueue(taskqueue_thread, &tls->reset_tag_task); } mtx_pool_unlock(mtxpool_sleep, tls); return (ENOBUFS); } #endif void ktls_destroy(struct ktls_session *tls) { struct rm_priotracker prio; ktls_cleanup(tls); if (tls->be != NULL && ktls_allow_unload) { rm_rlock(&ktls_backends_lock, &prio); tls->be->use_count--; rm_runlock(&ktls_backends_lock, &prio); } uma_zfree(ktls_session_zone, tls); } void ktls_seq(struct sockbuf *sb, struct mbuf *m) { struct mbuf_ext_pgs *pgs; struct tls_record_layer *tlshdr; uint64_t seqno; for (; m != NULL; m = m->m_next) { KASSERT((m->m_flags & M_NOMAP) != 0, ("ktls_seq: mapped mbuf %p", m)); pgs = m->m_ext.ext_pgs; pgs->seqno = sb->sb_tls_seqno; /* * Store the sequence number in the TLS header as the * explicit part of the IV for GCM. */ if (pgs->tls->params.cipher_algorithm == CRYPTO_AES_NIST_GCM_16) { tlshdr = (void *)pgs->hdr; seqno = htobe64(pgs->seqno); memcpy(tlshdr + 1, &seqno, sizeof(seqno)); } sb->sb_tls_seqno++; } } /* * Add TLS framing (headers and trailers) to a chain of mbufs. Each * mbuf in the chain must be an unmapped mbuf. The payload of the * mbuf must be populated with the payload of each TLS record. * * The record_type argument specifies the TLS record type used when * populating the TLS header. * * The enq_count argument on return is set to the number of pages of * payload data for this entire chain that need to be encrypted via SW * encryption. The returned value should be passed to ktls_enqueue * when scheduling encryption of this chain of mbufs. */ int ktls_frame(struct mbuf *top, struct ktls_session *tls, int *enq_cnt, uint8_t record_type) { struct tls_record_layer *tlshdr; struct mbuf *m; struct mbuf_ext_pgs *pgs; uint16_t tls_len; int maxlen; maxlen = tls->params.max_frame_len; *enq_cnt = 0; for (m = top; m != NULL; m = m->m_next) { /* * All mbufs in the chain should be non-empty TLS * records whose payload does not exceed the maximum * frame length. 
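[Editor's illustrative sketch, not part of the diff: what ktls_seq() above does for AES-GCM records. The 64-bit record sequence number, in network byte order, is written immediately after the fixed 5-byte TLS record header (type, version, length), which is where TLS 1.2 AES-GCM carries the explicit part of the nonce. Buffer layout and names are hypothetical.]

#include <stdint.h>
#include <string.h>
#include <sys/endian.h>         /* htobe64() on FreeBSD */

#define EXAMPLE_TLS_HDR_LEN     5       /* type + version + length */

/* Store the record sequence number as the explicit GCM nonce. */
static void
example_set_gcm_nonce(uint8_t *record, uint64_t seqno)
{
        uint64_t be = htobe64(seqno);

        memcpy(record + EXAMPLE_TLS_HDR_LEN, &be, sizeof(be));
}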
*/ if (m->m_len > maxlen || m->m_len == 0) return (EINVAL); tls_len = m->m_len; /* * TLS frames require unmapped mbufs to store session * info. */ KASSERT((m->m_flags & M_NOMAP) != 0, ("ktls_frame: mapped mbuf %p (top = %p)\n", m, top)); pgs = m->m_ext.ext_pgs; /* Save a reference to the session. */ pgs->tls = ktls_hold(tls); pgs->hdr_len = tls->params.tls_hlen; pgs->trail_len = tls->params.tls_tlen; if (tls->params.cipher_algorithm == CRYPTO_AES_CBC) { int bs, delta; /* * AES-CBC pads messages to a multiple of the * block size. Note that the padding is * applied after the digest and the encryption * is done on the "plaintext || mac || padding". * At least one byte of padding is always * present. * * Compute the final trailer length assuming * at most one block of padding. * tls->params.sb_tls_tlen is the maximum * possible trailer length (padding + digest). * delta holds the number of excess padding * bytes if the maximum were used. Those * extra bytes are removed. */ bs = tls->params.tls_bs; delta = (tls_len + tls->params.tls_tlen) & (bs - 1); pgs->trail_len -= delta; } m->m_len += pgs->hdr_len + pgs->trail_len; /* Populate the TLS header. */ tlshdr = (void *)pgs->hdr; tlshdr->tls_vmajor = tls->params.tls_vmajor; tlshdr->tls_vminor = tls->params.tls_vminor; tlshdr->tls_type = record_type; tlshdr->tls_length = htons(m->m_len - sizeof(*tlshdr)); /* * For GCM, the sequence number is stored in the * header by ktls_seq(). For CBC, a random nonce is * inserted for TLS 1.1+. */ if (tls->params.cipher_algorithm == CRYPTO_AES_CBC && tls->params.tls_vminor >= TLS_MINOR_VER_ONE) arc4rand(tlshdr + 1, AES_BLOCK_LEN, 0); /* * When using SW encryption, mark the mbuf not ready. * It will be marked ready via sbready() after the * record has been encrypted. * * When using ifnet TLS, unencrypted TLS records are * sent down the stack to the NIC. */ if (tls->sw_encrypt != NULL) { m->m_flags |= M_NOTREADY; pgs->nrdy = pgs->npgs; *enq_cnt += pgs->npgs; } } return (0); } void ktls_enqueue_to_free(struct mbuf_ext_pgs *pgs) { struct ktls_wq *wq; bool running; /* Mark it for freeing. */ pgs->mbuf = NULL; wq = &ktls_wq[pgs->tls->wq_index]; mtx_lock(&wq->mtx); STAILQ_INSERT_TAIL(&wq->head, pgs, stailq); running = wq->running; mtx_unlock(&wq->mtx); if (!running) wakeup(wq); } void ktls_enqueue(struct mbuf *m, struct socket *so, int page_count) { struct mbuf_ext_pgs *pgs; struct ktls_wq *wq; bool running; KASSERT(((m->m_flags & (M_NOMAP | M_NOTREADY)) == (M_NOMAP | M_NOTREADY)), ("ktls_enqueue: %p not unready & nomap mbuf\n", m)); KASSERT(page_count != 0, ("enqueueing TLS mbuf with zero page count")); pgs = m->m_ext.ext_pgs; KASSERT(pgs->tls->sw_encrypt != NULL, ("ifnet TLS mbuf")); pgs->enc_cnt = page_count; pgs->mbuf = m; /* * Save a pointer to the socket. The caller is responsible * for taking an additional reference via soref(). 
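[Editor's worked example, not part of the diff: the AES-CBC trailer computation in ktls_frame() above, assuming HMAC-SHA1 (20-byte digest) and a 16-byte AES block. tls_tlen starts at the maximum possible trailer, one full block of padding plus the digest, and delta removes the excess padding bytes. Names are hypothetical.]

#include <assert.h>

#define EXAMPLE_AES_BLOCK_LEN   16
#define EXAMPLE_SHA1_HASH_LEN   20
/* Maximum possible trailer: one full block of padding plus the digest. */
#define EXAMPLE_MAX_TRAILER     (EXAMPLE_AES_BLOCK_LEN + EXAMPLE_SHA1_HASH_LEN)

static int
example_cbc_trailer_len(int payload_len)
{
        int delta;

        /* Excess padding bytes if the full padding block were used. */
        delta = (payload_len + EXAMPLE_MAX_TRAILER) &
            (EXAMPLE_AES_BLOCK_LEN - 1);
        return (EXAMPLE_MAX_TRAILER - delta);
}

static void
example_check(void)
{
        /* 100 + 20 = 120 bytes of plaintext||mac, padded to 128: 8 pad bytes. */
        assert(example_cbc_trailer_len(100) == 28);
        /* 108 + 20 = 128 is already a block multiple: a full 16-byte pad block. */
        assert(example_cbc_trailer_len(108) == 36);
}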
*/ pgs->so = so; wq = &ktls_wq[pgs->tls->wq_index]; mtx_lock(&wq->mtx); STAILQ_INSERT_TAIL(&wq->head, pgs, stailq); running = wq->running; mtx_unlock(&wq->mtx); if (!running) wakeup(wq); counter_u64_add(ktls_cnt_on, 1); } static __noinline void ktls_encrypt(struct mbuf_ext_pgs *pgs) { struct ktls_session *tls; struct socket *so; struct mbuf *m, *top; vm_paddr_t parray[1 + btoc(TLS_MAX_MSG_SIZE_V10_2)]; struct iovec src_iov[1 + btoc(TLS_MAX_MSG_SIZE_V10_2)]; struct iovec dst_iov[1 + btoc(TLS_MAX_MSG_SIZE_V10_2)]; vm_page_t pg; int error, i, len, npages, off, total_pages; bool is_anon; so = pgs->so; tls = pgs->tls; top = pgs->mbuf; KASSERT(tls != NULL, ("tls = NULL, top = %p, pgs = %p\n", top, pgs)); KASSERT(so != NULL, ("so = NULL, top = %p, pgs = %p\n", top, pgs)); #ifdef INVARIANTS pgs->so = NULL; pgs->mbuf = NULL; #endif total_pages = pgs->enc_cnt; npages = 0; /* * Encrypt the TLS records in the chain of mbufs starting with * 'top'. 'total_pages' gives us a total count of pages and is * used to know when we have finished encrypting the TLS * records originally queued with 'top'. * * NB: These mbufs are queued in the socket buffer and * 'm_next' is traversing the mbufs in the socket buffer. The * socket buffer lock is not held while traversing this chain. * Since the mbufs are all marked M_NOTREADY their 'm_next' * pointers should be stable. However, the 'm_next' of the * last mbuf encrypted is not necessarily NULL. It can point * to other mbufs appended while 'top' was on the TLS work * queue. * * Each mbuf holds an entire TLS record. */ error = 0; for (m = top; npages != total_pages; m = m->m_next) { pgs = m->m_ext.ext_pgs; KASSERT(pgs->tls == tls, ("different TLS sessions in a single mbuf chain: %p vs %p", tls, pgs->tls)); KASSERT((m->m_flags & (M_NOMAP | M_NOTREADY)) == (M_NOMAP | M_NOTREADY), ("%p not unready & nomap mbuf (top = %p)\n", m, top)); KASSERT(npages + pgs->npgs <= total_pages, ("page count mismatch: top %p, total_pages %d, m %p", top, total_pages, m)); /* * Generate source and destination ivoecs to pass to * the SW encryption backend. For writable mbufs, the * destination iovec is a copy of the source and * encryption is done in place. For file-backed mbufs * (from sendfile), anonymous wired pages are * allocated and assigned to the destination iovec. */ is_anon = M_WRITABLE(m); off = pgs->first_pg_off; for (i = 0; i < pgs->npgs; i++, off = 0) { len = mbuf_ext_pg_len(pgs, i, off); src_iov[i].iov_len = len; src_iov[i].iov_base = (char *)(void *)PHYS_TO_DMAP(pgs->pa[i]) + off; if (is_anon) { dst_iov[i].iov_base = src_iov[i].iov_base; dst_iov[i].iov_len = src_iov[i].iov_len; continue; } retry_page: pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP | VM_ALLOC_WIRED); if (pg == NULL) { vm_wait(NULL); goto retry_page; } parray[i] = VM_PAGE_TO_PHYS(pg); dst_iov[i].iov_base = (char *)(void *)PHYS_TO_DMAP(parray[i]) + off; dst_iov[i].iov_len = len; } npages += i; error = (*tls->sw_encrypt)(tls, (const struct tls_record_layer *)pgs->hdr, pgs->trail, src_iov, dst_iov, i, pgs->seqno); if (error) { counter_u64_add(ktls_offload_failed_crypto, 1); break; } /* * For file-backed mbufs, release the file-backed * pages and replace them in the ext_pgs array with * the anonymous wired pages allocated above. */ if (!is_anon) { /* Free the old pages. */ m->m_ext.ext_free(m); /* Replace them with the new pages. */ for (i = 0; i < pgs->npgs; i++) pgs->pa[i] = parray[i]; /* Use the basic free routine. 
*/ m->m_ext.ext_free = mb_free_mext_pgs; } /* * Drop a reference to the session now that it is no * longer needed. Existing code depends on encrypted * records having no associated session vs * yet-to-be-encrypted records having an associated * session. */ pgs->tls = NULL; ktls_free(tls); } CURVNET_SET(so->so_vnet); if (error == 0) { (void)(*so->so_proto->pr_usrreqs->pru_ready)(so, top, npages); } else { so->so_proto->pr_usrreqs->pru_abort(so); so->so_error = EIO; mb_free_notready(top, total_pages); } SOCK_LOCK(so); sorele(so); CURVNET_RESTORE(); } static void ktls_work_thread(void *ctx) { struct ktls_wq *wq = ctx; struct mbuf_ext_pgs *p, *n; struct ktls_session *tls; STAILQ_HEAD(, mbuf_ext_pgs) local_head; #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__) fpu_kern_thread(0); #endif for (;;) { mtx_lock(&wq->mtx); while (STAILQ_EMPTY(&wq->head)) { wq->running = false; mtx_sleep(wq, &wq->mtx, 0, "-", 0); wq->running = true; } STAILQ_INIT(&local_head); STAILQ_CONCAT(&local_head, &wq->head); mtx_unlock(&wq->mtx); STAILQ_FOREACH_SAFE(p, &local_head, stailq, n) { if (p->mbuf != NULL) { ktls_encrypt(p); counter_u64_add(ktls_cnt_on, -1); } else { tls = p->tls; ktls_free(tls); uma_zfree(zone_extpgs, p); } } } } Index: projects/clang900-import/sys/kern/vfs_mount.c =================================================================== --- projects/clang900-import/sys/kern/vfs_mount.c (revision 352586) +++ projects/clang900-import/sys/kern/vfs_mount.c (revision 352587) @@ -1,2282 +1,2282 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1999-2004 Poul-Henning Kamp * Copyright (c) 1999 Michael Smith * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define VFS_MOUNTARG_SIZE_MAX (1024 * 64) static int vfs_domount(struct thread *td, const char *fstype, char *fspath, uint64_t fsflags, struct vfsoptlist **optlist); static void free_mntarg(struct mntarg *ma); static int usermount = 0; SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, "Unprivileged users may mount and unmount file systems"); static bool default_autoro = false; SYSCTL_BOOL(_vfs, OID_AUTO, default_autoro, CTLFLAG_RW, &default_autoro, 0, "Retry failed r/w mount as r/o if no explicit ro/rw option is specified"); MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure"); MALLOC_DEFINE(M_STATFS, "statfs", "statfs structure"); static uma_zone_t mount_zone; /* List of mounted filesystems. */ struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* For any iteration/modification of mountlist */ struct mtx mountlist_mtx; MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF); EVENTHANDLER_LIST_DEFINE(vfs_mounted); EVENTHANDLER_LIST_DEFINE(vfs_unmounted); /* * Global opts, taken by all filesystems */ static const char *global_opts[] = { "errmsg", "fstype", "fspath", "ro", "rw", "nosuid", "noexec", NULL }; static int mount_init(void *mem, int size, int flags) { struct mount *mp; mp = (struct mount *)mem; mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF); mtx_init(&mp->mnt_listmtx, "struct mount vlist mtx", NULL, MTX_DEF); lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0); mp->mnt_thread_in_ops_pcpu = uma_zalloc_pcpu(pcpu_zone_int, M_WAITOK | M_ZERO); mp->mnt_ref_pcpu = uma_zalloc_pcpu(pcpu_zone_int, M_WAITOK | M_ZERO); mp->mnt_lockref_pcpu = uma_zalloc_pcpu(pcpu_zone_int, M_WAITOK | M_ZERO); mp->mnt_writeopcount_pcpu = uma_zalloc_pcpu(pcpu_zone_int, M_WAITOK | M_ZERO); mp->mnt_ref = 0; mp->mnt_vfs_ops = 1; return (0); } static void mount_fini(void *mem, int size) { struct mount *mp; mp = (struct mount *)mem; uma_zfree_pcpu(pcpu_zone_int, mp->mnt_writeopcount_pcpu); uma_zfree_pcpu(pcpu_zone_int, mp->mnt_lockref_pcpu); uma_zfree_pcpu(pcpu_zone_int, mp->mnt_ref_pcpu); uma_zfree_pcpu(pcpu_zone_int, mp->mnt_thread_in_ops_pcpu); lockdestroy(&mp->mnt_explock); mtx_destroy(&mp->mnt_listmtx); mtx_destroy(&mp->mnt_mtx); } static void vfs_mount_init(void *dummy __unused) { mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount), NULL, - NULL, mount_init, mount_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + NULL, mount_init, mount_fini, UMA_ALIGN_CACHE, UMA_ZONE_NOFREE); } SYSINIT(vfs_mount, SI_SUB_VFS, SI_ORDER_ANY, vfs_mount_init, NULL); /* * --------------------------------------------------------------------- * Functions for building and sanitizing the mount options */ /* Remove one mount option. */ static void vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt) { TAILQ_REMOVE(opts, opt, link); free(opt->name, M_MOUNT); if (opt->value != NULL) free(opt->value, M_MOUNT); free(opt, M_MOUNT); } /* Release all resources related to the mount options. 
*/ void vfs_freeopts(struct vfsoptlist *opts) { struct vfsopt *opt; while (!TAILQ_EMPTY(opts)) { opt = TAILQ_FIRST(opts); vfs_freeopt(opts, opt); } free(opts, M_MOUNT); } void vfs_deleteopt(struct vfsoptlist *opts, const char *name) { struct vfsopt *opt, *temp; if (opts == NULL) return; TAILQ_FOREACH_SAFE(opt, opts, link, temp) { if (strcmp(opt->name, name) == 0) vfs_freeopt(opts, opt); } } static int vfs_isopt_ro(const char *opt) { if (strcmp(opt, "ro") == 0 || strcmp(opt, "rdonly") == 0 || strcmp(opt, "norw") == 0) return (1); return (0); } static int vfs_isopt_rw(const char *opt) { if (strcmp(opt, "rw") == 0 || strcmp(opt, "noro") == 0) return (1); return (0); } /* * Check if options are equal (with or without the "no" prefix). */ static int vfs_equalopts(const char *opt1, const char *opt2) { char *p; /* "opt" vs. "opt" or "noopt" vs. "noopt" */ if (strcmp(opt1, opt2) == 0) return (1); /* "noopt" vs. "opt" */ if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0) return (1); /* "opt" vs. "noopt" */ if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0) return (1); while ((p = strchr(opt1, '.')) != NULL && !strncmp(opt1, opt2, ++p - opt1)) { opt2 += p - opt1; opt1 = p; /* "foo.noopt" vs. "foo.opt" */ if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0) return (1); /* "foo.opt" vs. "foo.noopt" */ if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0) return (1); } /* "ro" / "rdonly" / "norw" / "rw" / "noro" */ if ((vfs_isopt_ro(opt1) || vfs_isopt_rw(opt1)) && (vfs_isopt_ro(opt2) || vfs_isopt_rw(opt2))) return (1); return (0); } /* * If a mount option is specified several times, * (with or without the "no" prefix) only keep * the last occurrence of it. */ static void vfs_sanitizeopts(struct vfsoptlist *opts) { struct vfsopt *opt, *opt2, *tmp; TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) { opt2 = TAILQ_PREV(opt, vfsoptlist, link); while (opt2 != NULL) { if (vfs_equalopts(opt->name, opt2->name)) { tmp = TAILQ_PREV(opt2, vfsoptlist, link); vfs_freeopt(opts, opt2); opt2 = tmp; } else { opt2 = TAILQ_PREV(opt2, vfsoptlist, link); } } } } /* * Build a linked list of mount options from a struct uio. */ int vfs_buildopts(struct uio *auio, struct vfsoptlist **options) { struct vfsoptlist *opts; struct vfsopt *opt; size_t memused, namelen, optlen; unsigned int i, iovcnt; int error; opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); TAILQ_INIT(opts); memused = 0; iovcnt = auio->uio_iovcnt; for (i = 0; i < iovcnt; i += 2) { namelen = auio->uio_iov[i].iov_len; optlen = auio->uio_iov[i + 1].iov_len; memused += sizeof(struct vfsopt) + optlen + namelen; /* * Avoid consuming too much memory, and attempts to overflow * memused. */ if (memused > VFS_MOUNTARG_SIZE_MAX || optlen > VFS_MOUNTARG_SIZE_MAX || namelen > VFS_MOUNTARG_SIZE_MAX) { error = EINVAL; goto bad; } opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); opt->name = malloc(namelen, M_MOUNT, M_WAITOK); opt->value = NULL; opt->len = 0; opt->pos = i / 2; opt->seen = 0; /* * Do this early, so jumps to "bad" will free the current * option. */ TAILQ_INSERT_TAIL(opts, opt, link); if (auio->uio_segflg == UIO_SYSSPACE) { bcopy(auio->uio_iov[i].iov_base, opt->name, namelen); } else { error = copyin(auio->uio_iov[i].iov_base, opt->name, namelen); if (error) goto bad; } /* Ensure names are null-terminated strings. 
*/ if (namelen == 0 || opt->name[namelen - 1] != '\0') { error = EINVAL; goto bad; } if (optlen != 0) { opt->len = optlen; opt->value = malloc(optlen, M_MOUNT, M_WAITOK); if (auio->uio_segflg == UIO_SYSSPACE) { bcopy(auio->uio_iov[i + 1].iov_base, opt->value, optlen); } else { error = copyin(auio->uio_iov[i + 1].iov_base, opt->value, optlen); if (error) goto bad; } } } vfs_sanitizeopts(opts); *options = opts; return (0); bad: vfs_freeopts(opts); return (error); } /* * Merge the old mount options with the new ones passed * in the MNT_UPDATE case. * * XXX: This function will keep a "nofoo" option in the new * options. E.g, if the option's canonical name is "foo", * "nofoo" ends up in the mount point's active options. */ static void vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *oldopts) { struct vfsopt *opt, *new; TAILQ_FOREACH(opt, oldopts, link) { new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); new->name = strdup(opt->name, M_MOUNT); if (opt->len != 0) { new->value = malloc(opt->len, M_MOUNT, M_WAITOK); bcopy(opt->value, new->value, opt->len); } else new->value = NULL; new->len = opt->len; new->seen = opt->seen; TAILQ_INSERT_HEAD(toopts, new, link); } vfs_sanitizeopts(toopts); } /* * Mount a filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct nmount_args { struct iovec *iovp; unsigned int iovcnt; int flags; }; #endif int sys_nmount(struct thread *td, struct nmount_args *uap) { struct uio *auio; int error; u_int iovcnt; uint64_t flags; /* * Mount flags are now 64-bits. On 32-bit archtectures only * 32-bits are passed in, but from here on everything handles * 64-bit flags correctly. */ flags = uap->flags; AUDIT_ARG_FFLAGS(flags); CTR4(KTR_VFS, "%s: iovp %p with iovcnt %d and flags %d", __func__, uap->iovp, uap->iovcnt, flags); /* * Filter out MNT_ROOTFS. We do not want clients of nmount() in * userspace to set this flag, but we must filter it out if we want * MNT_UPDATE on the root file system to work. * MNT_ROOTFS should only be set by the kernel when mounting its * root file system. */ flags &= ~MNT_ROOTFS; iovcnt = uap->iovcnt; /* * Check that we have an even number of iovec's * and that we have at least two options. */ if ((iovcnt & 1) || (iovcnt < 4)) { CTR2(KTR_VFS, "%s: failed for invalid iovcnt %d", __func__, uap->iovcnt); return (EINVAL); } error = copyinuio(uap->iovp, iovcnt, &auio); if (error) { CTR2(KTR_VFS, "%s: failed for invalid uio op with %d errno", __func__, error); return (error); } error = vfs_donmount(td, flags, auio); free(auio, M_IOV); return (error); } /* * --------------------------------------------------------------------- * Various utility functions */ void vfs_ref(struct mount *mp) { CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if (vfs_op_thread_enter(mp)) { vfs_mp_count_add_pcpu(mp, ref, 1); vfs_op_thread_exit(mp); return; } MNT_ILOCK(mp); MNT_REF(mp); MNT_IUNLOCK(mp); } void vfs_rel(struct mount *mp) { CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if (vfs_op_thread_enter(mp)) { vfs_mp_count_sub_pcpu(mp, ref, 1); vfs_op_thread_exit(mp); return; } MNT_ILOCK(mp); MNT_REL(mp); MNT_IUNLOCK(mp); } /* * Allocate and initialize the mount point struct. 
*/ struct mount * vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath, struct ucred *cred) { struct mount *mp; mp = uma_zalloc(mount_zone, M_WAITOK); bzero(&mp->mnt_startzero, __rangeof(struct mount, mnt_startzero, mnt_endzero)); TAILQ_INIT(&mp->mnt_nvnodelist); mp->mnt_nvnodelistsize = 0; TAILQ_INIT(&mp->mnt_activevnodelist); mp->mnt_activevnodelistsize = 0; TAILQ_INIT(&mp->mnt_tmpfreevnodelist); mp->mnt_tmpfreevnodelistsize = 0; if (mp->mnt_ref != 0 || mp->mnt_lockref != 0 || mp->mnt_writeopcount != 0) panic("%s: non-zero counters on new mp %p\n", __func__, mp); if (mp->mnt_vfs_ops != 1) panic("%s: vfs_ops should be 1 but %d found\n", __func__, mp->mnt_vfs_ops); (void) vfs_busy(mp, MBF_NOWAIT); atomic_add_acq_int(&vfsp->vfc_refcount, 1); mp->mnt_op = vfsp->vfc_vfsops; mp->mnt_vfc = vfsp; mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_gen++; strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); mp->mnt_vnodecovered = vp; mp->mnt_cred = crdup(cred); mp->mnt_stat.f_owner = cred->cr_uid; strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN); mp->mnt_iosize_max = DFLTPHYS; #ifdef MAC mac_mount_init(mp); mac_mount_create(cred, mp); #endif arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0); TAILQ_INIT(&mp->mnt_uppers); return (mp); } /* * Destroy the mount struct previously allocated by vfs_mount_alloc(). */ void vfs_mount_destroy(struct mount *mp) { if (mp->mnt_vfs_ops == 0) panic("%s: entered with zero vfs_ops\n", __func__); vfs_assert_mount_counters(mp); MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_REFEXPIRE; if (mp->mnt_kern_flag & MNTK_MWAIT) { mp->mnt_kern_flag &= ~MNTK_MWAIT; wakeup(mp); } while (mp->mnt_ref) msleep(mp, MNT_MTX(mp), PVFS, "mntref", 0); KASSERT(mp->mnt_ref == 0, ("%s: invalid refcount in the drain path @ %s:%d", __func__, __FILE__, __LINE__)); if (mp->mnt_writeopcount != 0) panic("vfs_mount_destroy: nonzero writeopcount"); if (mp->mnt_secondary_writes != 0) panic("vfs_mount_destroy: nonzero secondary_writes"); atomic_subtract_rel_int(&mp->mnt_vfc->vfc_refcount, 1); if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) { struct vnode *vp; TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) vn_printf(vp, "dangling vnode "); panic("unmount: dangling vnode"); } KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers")); if (mp->mnt_nvnodelistsize != 0) panic("vfs_mount_destroy: nonzero nvnodelistsize"); if (mp->mnt_activevnodelistsize != 0) panic("vfs_mount_destroy: nonzero activevnodelistsize"); if (mp->mnt_tmpfreevnodelistsize != 0) panic("vfs_mount_destroy: nonzero tmpfreevnodelistsize"); if (mp->mnt_lockref != 0) panic("vfs_mount_destroy: nonzero lock refcount"); MNT_IUNLOCK(mp); if (mp->mnt_vfs_ops != 1) panic("%s: vfs_ops should be 1 but %d found\n", __func__, mp->mnt_vfs_ops); if (mp->mnt_vnodecovered != NULL) vrele(mp->mnt_vnodecovered); #ifdef MAC mac_mount_destroy(mp); #endif if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); crfree(mp->mnt_cred); uma_zfree(mount_zone, mp); } static bool vfs_should_downgrade_to_ro_mount(uint64_t fsflags, int error) { /* This is an upgrade of an exisiting mount. */ if ((fsflags & MNT_UPDATE) != 0) return (false); /* This is already an R/O mount. */ if ((fsflags & MNT_RDONLY) != 0) return (false); switch (error) { case ENODEV: /* generic, geom, ... */ case EACCES: /* cam/scsi, ... */ case EROFS: /* md, mmcsd, ... */ /* * These errors can be returned by the storage layer to signal * that the media is read-only. No harm in the R/O mount * attempt if the error was returned for some other reason. 
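[Editor's illustrative sketch, not part of the diff: the read-only fallback wired around vfs_should_downgrade_to_ro_mount(). When the caller did not explicitly ask for ro or rw and the first attempt fails with an error that typically indicates read-only media (ENODEV, EACCES, EROFS), the mount is retried once with MNT_RDONLY set; the real retry sits in vfs_donmount() further below. Names and the flag value are hypothetical.]

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>

#define EXAMPLE_MNT_RDONLY      0x0001ULL

/* One R/O retry when the first attempt looks like read-only media. */
static int
example_mount_with_autoro(int (*attempt)(uint64_t flags), uint64_t flags,
    bool autoro)
{
        int error;

        error = attempt(flags);
        if (autoro && (flags & EXAMPLE_MNT_RDONLY) == 0 &&
            (error == ENODEV || error == EACCES || error == EROFS))
                error = attempt(flags | EXAMPLE_MNT_RDONLY);
        return (error);
}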
*/ return (true); default: return (false); } } int vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions) { struct vfsoptlist *optlist; struct vfsopt *opt, *tmp_opt; char *fstype, *fspath, *errmsg; int error, fstypelen, fspathlen, errmsg_len, errmsg_pos; bool autoro; errmsg = fspath = NULL; errmsg_len = fspathlen = 0; errmsg_pos = -1; autoro = default_autoro; error = vfs_buildopts(fsoptions, &optlist); if (error) return (error); if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0) errmsg_pos = vfs_getopt_pos(optlist, "errmsg"); /* * We need these two options before the others, * and they are mandatory for any filesystem. * Ensure they are NUL terminated as well. */ fstypelen = 0; error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen); if (error || fstype[fstypelen - 1] != '\0') { error = EINVAL; if (errmsg != NULL) strncpy(errmsg, "Invalid fstype", errmsg_len); goto bail; } fspathlen = 0; error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen); if (error || fspath[fspathlen - 1] != '\0') { error = EINVAL; if (errmsg != NULL) strncpy(errmsg, "Invalid fspath", errmsg_len); goto bail; } /* * We need to see if we have the "update" option * before we call vfs_domount(), since vfs_domount() has special * logic based on MNT_UPDATE. This is very important * when we want to update the root filesystem. */ TAILQ_FOREACH_SAFE(opt, optlist, link, tmp_opt) { if (strcmp(opt->name, "update") == 0) { fsflags |= MNT_UPDATE; vfs_freeopt(optlist, opt); } else if (strcmp(opt->name, "async") == 0) fsflags |= MNT_ASYNC; else if (strcmp(opt->name, "force") == 0) { fsflags |= MNT_FORCE; vfs_freeopt(optlist, opt); } else if (strcmp(opt->name, "reload") == 0) { fsflags |= MNT_RELOAD; vfs_freeopt(optlist, opt); } else if (strcmp(opt->name, "multilabel") == 0) fsflags |= MNT_MULTILABEL; else if (strcmp(opt->name, "noasync") == 0) fsflags &= ~MNT_ASYNC; else if (strcmp(opt->name, "noatime") == 0) fsflags |= MNT_NOATIME; else if (strcmp(opt->name, "atime") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoatime", M_MOUNT); } else if (strcmp(opt->name, "noclusterr") == 0) fsflags |= MNT_NOCLUSTERR; else if (strcmp(opt->name, "clusterr") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoclusterr", M_MOUNT); } else if (strcmp(opt->name, "noclusterw") == 0) fsflags |= MNT_NOCLUSTERW; else if (strcmp(opt->name, "clusterw") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoclusterw", M_MOUNT); } else if (strcmp(opt->name, "noexec") == 0) fsflags |= MNT_NOEXEC; else if (strcmp(opt->name, "exec") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoexec", M_MOUNT); } else if (strcmp(opt->name, "nosuid") == 0) fsflags |= MNT_NOSUID; else if (strcmp(opt->name, "suid") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonosuid", M_MOUNT); } else if (strcmp(opt->name, "nosymfollow") == 0) fsflags |= MNT_NOSYMFOLLOW; else if (strcmp(opt->name, "symfollow") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonosymfollow", M_MOUNT); } else if (strcmp(opt->name, "noro") == 0) { fsflags &= ~MNT_RDONLY; autoro = false; } else if (strcmp(opt->name, "rw") == 0) { fsflags &= ~MNT_RDONLY; autoro = false; } else if (strcmp(opt->name, "ro") == 0) { fsflags |= MNT_RDONLY; autoro = false; } else if (strcmp(opt->name, "rdonly") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("ro", M_MOUNT); fsflags |= MNT_RDONLY; autoro = false; } else if (strcmp(opt->name, "autoro") == 0) { vfs_freeopt(optlist, opt); autoro = true; } else if (strcmp(opt->name, 
"suiddir") == 0) fsflags |= MNT_SUIDDIR; else if (strcmp(opt->name, "sync") == 0) fsflags |= MNT_SYNCHRONOUS; else if (strcmp(opt->name, "union") == 0) fsflags |= MNT_UNION; else if (strcmp(opt->name, "automounted") == 0) { fsflags |= MNT_AUTOMOUNTED; vfs_freeopt(optlist, opt); } } /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if (fstypelen > MFSNAMELEN || fspathlen > MNAMELEN) { error = ENAMETOOLONG; goto bail; } error = vfs_domount(td, fstype, fspath, fsflags, &optlist); /* * See if we can mount in the read-only mode if the error code suggests * that it could be possible and the mount options allow for that. * Never try it if "[no]{ro|rw}" has been explicitly requested and not * overridden by "autoro". */ if (autoro && vfs_should_downgrade_to_ro_mount(fsflags, error)) { printf("%s: R/W mount failed, possibly R/O media," " trying R/O mount\n", __func__); fsflags |= MNT_RDONLY; error = vfs_domount(td, fstype, fspath, fsflags, &optlist); } bail: /* copyout the errmsg */ if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt) && errmsg_len > 0 && errmsg != NULL) { if (fsoptions->uio_segflg == UIO_SYSSPACE) { bcopy(errmsg, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); } else { copyout(errmsg, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); } } if (optlist != NULL) vfs_freeopts(optlist); return (error); } /* * Old mount API. */ #ifndef _SYS_SYSPROTO_H_ struct mount_args { char *type; char *path; int flags; caddr_t data; }; #endif /* ARGSUSED */ int sys_mount(struct thread *td, struct mount_args *uap) { char *fstype; struct vfsconf *vfsp = NULL; struct mntarg *ma = NULL; uint64_t flags; int error; /* * Mount flags are now 64-bits. On 32-bit architectures only * 32-bits are passed in, but from here on everything handles * 64-bit flags correctly. */ flags = uap->flags; AUDIT_ARG_FFLAGS(flags); /* * Filter out MNT_ROOTFS. We do not want clients of mount() in * userspace to set this flag, but we must filter it out if we want * MNT_UPDATE on the root file system to work. * MNT_ROOTFS should only be set by the kernel when mounting its * root file system. */ flags &= ~MNT_ROOTFS; fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL); if (error) { free(fstype, M_TEMP); return (error); } AUDIT_ARG_TEXT(fstype); vfsp = vfs_byname_kld(fstype, td, &error); free(fstype, M_TEMP); if (vfsp == NULL) return (ENOENT); if (((vfsp->vfc_flags & VFCF_SBDRY) != 0 && vfsp->vfc_vfsops_sd->vfs_cmount == NULL) || ((vfsp->vfc_flags & VFCF_SBDRY) == 0 && vfsp->vfc_vfsops->vfs_cmount == NULL)) return (EOPNOTSUPP); ma = mount_argsu(ma, "fstype", uap->type, MFSNAMELEN); ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN); ma = mount_argb(ma, flags & MNT_RDONLY, "noro"); ma = mount_argb(ma, !(flags & MNT_NOSUID), "nosuid"); ma = mount_argb(ma, !(flags & MNT_NOEXEC), "noexec"); if ((vfsp->vfc_flags & VFCF_SBDRY) != 0) return (vfsp->vfc_vfsops_sd->vfs_cmount(ma, uap->data, flags)); return (vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, flags)); } /* * vfs_domount_first(): first file system mount (not update) */ static int vfs_domount_first( struct thread *td, /* Calling thread. */ struct vfsconf *vfsp, /* File system type. */ char *fspath, /* Mount path. */ struct vnode *vp, /* Vnode to be covered. */ uint64_t fsflags, /* Flags common to all filesystems. 
*/ struct vfsoptlist **optlist /* Options local to the filesystem. */ ) { struct vattr va; struct mount *mp; struct vnode *newdp; int error, error1; ASSERT_VOP_ELOCKED(vp, __func__); KASSERT((fsflags & MNT_UPDATE) == 0, ("MNT_UPDATE shouldn't be here")); /* * If the jail of the calling thread lacks permission for this type of * file system, deny immediately. */ if (jailed(td->td_ucred) && !prison_allow(td->td_ucred, vfsp->vfc_prison_flag)) { vput(vp); return (EPERM); } /* * If the user is not root, ensure that they own the directory * onto which we are attempting to mount. */ error = VOP_GETATTR(vp, &va, td->td_ucred); if (error == 0 && va.va_uid != td->td_ucred->cr_uid) error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN); if (error == 0) error = vinvalbuf(vp, V_SAVE, 0, 0); if (error == 0 && vp->v_type != VDIR) error = ENOTDIR; if (error == 0) { VI_LOCK(vp); if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL) vp->v_iflag |= VI_MOUNT; else error = EBUSY; VI_UNLOCK(vp); } if (error != 0) { vput(vp); return (error); } VOP_UNLOCK(vp, 0); /* Allocate and initialize the filesystem. */ mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred); /* XXXMAC: pass to vfs_mount_alloc? */ mp->mnt_optnew = *optlist; /* Set the mount level flags. */ mp->mnt_flag = (fsflags & (MNT_UPDATEMASK | MNT_ROOTFS | MNT_RDONLY)); /* * Mount the filesystem. * XXX The final recipients of VFS_MOUNT just overwrite the ndp they * get. No freeing of cn_pnbuf. */ error1 = 0; if ((error = VFS_MOUNT(mp)) != 0 || (error1 = VFS_STATFS(mp, &mp->mnt_stat)) != 0 || (error1 = VFS_ROOT(mp, LK_EXCLUSIVE, &newdp)) != 0) { if (error1 != 0) { error = error1; if ((error1 = VFS_UNMOUNT(mp, 0)) != 0) printf("VFS_UNMOUNT returned %d\n", error1); } vfs_unbusy(mp); mp->mnt_vnodecovered = NULL; vfs_mount_destroy(mp); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vrele(vp); return (error); } VOP_UNLOCK(newdp, 0); if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); mp->mnt_opt = mp->mnt_optnew; *optlist = NULL; /* * Prevent external consumers of mount options from reading mnt_optnew. */ mp->mnt_optnew = NULL; MNT_ILOCK(mp); if ((mp->mnt_flag & MNT_ASYNC) != 0 && (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) mp->mnt_kern_flag |= MNTK_ASYNC; else mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); cache_purge(vp); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vp->v_mountedhere = mp; /* Place the new filesystem at the end of the mount list. */ mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); vfs_event_signal(NULL, VQ_MOUNT, 0); vn_lock(newdp, LK_EXCLUSIVE | LK_RETRY); VOP_UNLOCK(vp, 0); EVENTHANDLER_DIRECT_INVOKE(vfs_mounted, mp, newdp, td); VOP_UNLOCK(newdp, 0); mountcheckdirs(vp, newdp); vrele(newdp); if ((mp->mnt_flag & MNT_RDONLY) == 0) vfs_allocate_syncvnode(mp); vfs_op_exit(mp); vfs_unbusy(mp); return (0); } /* * vfs_domount_update(): update of mounted file system */ static int vfs_domount_update( struct thread *td, /* Calling thread. */ struct vnode *vp, /* Mount point vnode. */ uint64_t fsflags, /* Flags common to all filesystems. */ struct vfsoptlist **optlist /* Options local to the filesystem. 
*/ ) { struct export_args export; void *bufp; struct mount *mp; int error, export_error, len; uint64_t flag; ASSERT_VOP_ELOCKED(vp, __func__); KASSERT((fsflags & MNT_UPDATE) != 0, ("MNT_UPDATE should be here")); mp = vp->v_mount; if ((vp->v_vflag & VV_ROOT) == 0) { if (vfs_copyopt(*optlist, "export", &export, sizeof(export)) == 0) error = EXDEV; else error = EINVAL; vput(vp); return (error); } /* * We only allow the filesystem to be reloaded if it * is currently mounted read-only. */ flag = mp->mnt_flag; if ((fsflags & MNT_RELOAD) != 0 && (flag & MNT_RDONLY) == 0) { vput(vp); return (EOPNOTSUPP); /* Needs translation */ } /* * Only privileged root, or (if MNT_USER is set) the user that * did the original mount is permitted to update it. */ error = vfs_suser(mp, td); if (error != 0) { vput(vp); return (error); } if (vfs_busy(mp, MBF_NOWAIT)) { vput(vp); return (EBUSY); } VI_LOCK(vp); if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) { VI_UNLOCK(vp); vfs_unbusy(mp); vput(vp); return (EBUSY); } vp->v_iflag |= VI_MOUNT; VI_UNLOCK(vp); VOP_UNLOCK(vp, 0); vfs_op_enter(mp); MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { MNT_IUNLOCK(mp); error = EBUSY; goto end; } mp->mnt_flag &= ~MNT_UPDATEMASK; mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT | MNT_ROOTFS | MNT_UPDATEMASK | MNT_RDONLY); if ((mp->mnt_flag & MNT_ASYNC) == 0) mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); mp->mnt_optnew = *optlist; vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt); /* * Mount the filesystem. * XXX The final recipients of VFS_MOUNT just overwrite the ndp they * get. No freeing of cn_pnbuf. */ error = VFS_MOUNT(mp); export_error = 0; /* Process the export option. */ if (error == 0 && vfs_getopt(mp->mnt_optnew, "export", &bufp, &len) == 0) { /* Assume that there is only 1 ABI for each length. */ switch (len) { case (sizeof(struct oexport_args)): bzero(&export, sizeof(export)); /* FALLTHROUGH */ case (sizeof(export)): bcopy(bufp, &export, len); export_error = vfs_export(mp, &export); break; default: export_error = EINVAL; break; } } MNT_ILOCK(mp); if (error == 0) { mp->mnt_flag &= ~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT); } else { /* * If we fail, restore old mount flags. MNT_QUOTA is special, * because it is not part of MNT_UPDATEMASK, but it could have * changed in the meantime if quotactl(2) was called. * All in all we want current value of MNT_QUOTA, not the old * one. */ mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA); } if ((mp->mnt_flag & MNT_ASYNC) != 0 && (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) mp->mnt_kern_flag |= MNTK_ASYNC; else mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); if (error != 0) goto end; if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); mp->mnt_opt = mp->mnt_optnew; *optlist = NULL; (void)VFS_STATFS(mp, &mp->mnt_stat); /* * Prevent external consumers of mount options from reading * mnt_optnew. */ mp->mnt_optnew = NULL; if ((mp->mnt_flag & MNT_RDONLY) == 0) vfs_allocate_syncvnode(mp); else vfs_deallocate_syncvnode(mp); end: vfs_op_exit(mp); vfs_unbusy(mp); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vrele(vp); return (error != 0 ? error : export_error); } /* * vfs_domount(): actually attempt a filesystem mount. */ static int vfs_domount( struct thread *td, /* Calling thread. */ const char *fstype, /* Filesystem type. */ char *fspath, /* Mount path. */ uint64_t fsflags, /* Flags common to all filesystems. */ struct vfsoptlist **optlist /* Options local to the filesystem. 
*/ ) { struct vfsconf *vfsp; struct nameidata nd; struct vnode *vp; char *pathbuf; int error; /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN) return (ENAMETOOLONG); if (jailed(td->td_ucred) || usermount == 0) { if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0) return (error); } /* * Do not allow NFS export or MNT_SUIDDIR by unprivileged users. */ if (fsflags & MNT_EXPORTED) { error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED); if (error) return (error); } if (fsflags & MNT_SUIDDIR) { error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR); if (error) return (error); } /* * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users. */ if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) { if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0) fsflags |= MNT_NOSUID | MNT_USER; } /* Load KLDs before we lock the covered vnode to avoid reversals. */ vfsp = NULL; if ((fsflags & MNT_UPDATE) == 0) { /* Don't try to load KLDs if we're mounting the root. */ if (fsflags & MNT_ROOTFS) vfsp = vfs_byname(fstype); else vfsp = vfs_byname_kld(fstype, td, &error); if (vfsp == NULL) return (ENODEV); } /* * Get vnode to be covered or mount point's vnode in case of MNT_UPDATE. */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, fspath, td); error = namei(&nd); if (error != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if ((fsflags & MNT_UPDATE) == 0) { pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); strcpy(pathbuf, fspath); error = vn_path_to_global_path(td, vp, pathbuf, MNAMELEN); /* debug.disablefullpath == 1 results in ENODEV */ if (error == 0 || error == ENODEV) { error = vfs_domount_first(td, vfsp, pathbuf, vp, fsflags, optlist); } free(pathbuf, M_TEMP); } else error = vfs_domount_update(td, vp, fsflags, optlist); return (error); } /* * Unmount a filesystem. * * Note: unmount takes a path to the vnode mounted on as argument, not * special file (as before). */ #ifndef _SYS_SYSPROTO_H_ struct unmount_args { char *path; int flags; }; #endif /* ARGSUSED */ int sys_unmount(struct thread *td, struct unmount_args *uap) { struct nameidata nd; struct mount *mp; char *pathbuf; int error, id0, id1; AUDIT_ARG_VALUE(uap->flags); if (jailed(td->td_ucred) || usermount == 0) { error = priv_check(td, PRIV_VFS_UNMOUNT); if (error) return (error); } pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(uap->path, pathbuf, MNAMELEN, NULL); if (error) { free(pathbuf, M_TEMP); return (error); } if (uap->flags & MNT_BYFSID) { AUDIT_ARG_TEXT(pathbuf); /* Decode the filesystem ID. */ if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) { free(pathbuf, M_TEMP); return (EINVAL); } mtx_lock(&mountlist_mtx); TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == id0 && mp->mnt_stat.f_fsid.val[1] == id1) { vfs_ref(mp); break; } } mtx_unlock(&mountlist_mtx); } else { /* * Try to find global path for path argument. 
*/ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, pathbuf, td); if (namei(&nd) == 0) { NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_path_to_global_path(td, nd.ni_vp, pathbuf, MNAMELEN); if (error == 0 || error == ENODEV) vput(nd.ni_vp); } mtx_lock(&mountlist_mtx); TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0) { vfs_ref(mp); break; } } mtx_unlock(&mountlist_mtx); } free(pathbuf, M_TEMP); if (mp == NULL) { /* * Previously we returned ENOENT for a nonexistent path and * EINVAL for a non-mountpoint. We cannot tell these apart * now, so in the !MNT_BYFSID case return the more likely * EINVAL for compatibility. */ return ((uap->flags & MNT_BYFSID) ? ENOENT : EINVAL); } /* * Don't allow unmounting the root filesystem. */ if (mp->mnt_flag & MNT_ROOTFS) { vfs_rel(mp); return (EINVAL); } error = dounmount(mp, uap->flags, td); return (error); } /* * Return error if any of the vnodes, ignoring the root vnode * and the syncer vnode, have non-zero usecount. * * This function is purely advisory - it can return false positives * and negatives. */ static int vfs_check_usecounts(struct mount *mp) { struct vnode *vp, *mvp; MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { if ((vp->v_vflag & VV_ROOT) == 0 && vp->v_type != VNON && vp->v_usecount != 0) { VI_UNLOCK(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (EBUSY); } VI_UNLOCK(vp); } return (0); } static void dounmount_cleanup(struct mount *mp, struct vnode *coveredvp, int mntkflags) { mtx_assert(MNT_MTX(mp), MA_OWNED); mp->mnt_kern_flag &= ~mntkflags; if ((mp->mnt_kern_flag & MNTK_MWAIT) != 0) { mp->mnt_kern_flag &= ~MNTK_MWAIT; wakeup(mp); } vfs_op_exit_locked(mp); MNT_IUNLOCK(mp); if (coveredvp != NULL) { VOP_UNLOCK(coveredvp, 0); vdrop(coveredvp); } vn_finished_write(mp); } /* * There are various reference counters associated with the mount point. * Normally it is permitted to modify them without taking the mnt ilock, * but this behavior can be temporarily disabled if stable value is needed * or callers are expected to block (e.g. to not allow new users during * forced unmount). */ void vfs_op_enter(struct mount *mp) { int cpu; MNT_ILOCK(mp); mp->mnt_vfs_ops++; if (mp->mnt_vfs_ops > 1) { MNT_IUNLOCK(mp); return; } /* * Paired with a fence in vfs_op_thread_enter(). See the comment * above it for details. */ atomic_thread_fence_seq_cst(); vfs_op_barrier_wait(mp); /* * Paired with a fence in vfs_op_thread_exit(). */ atomic_thread_fence_acq(); CPU_FOREACH(cpu) { mp->mnt_ref += zpcpu_replace_cpu(mp->mnt_ref_pcpu, 0, cpu); mp->mnt_lockref += zpcpu_replace_cpu(mp->mnt_lockref_pcpu, 0, cpu); mp->mnt_writeopcount += zpcpu_replace_cpu(mp->mnt_writeopcount_pcpu, 0, cpu); } MNT_IUNLOCK(mp); vfs_assert_mount_counters(mp); } void vfs_op_exit_locked(struct mount *mp) { mtx_assert(MNT_MTX(mp), MA_OWNED); if (mp->mnt_vfs_ops <= 0) panic("%s: invalid vfs_ops count %d for mp %p\n", __func__, mp->mnt_vfs_ops, mp); mp->mnt_vfs_ops--; } void vfs_op_exit(struct mount *mp) { MNT_ILOCK(mp); vfs_op_exit_locked(mp); MNT_IUNLOCK(mp); } /* * It is assumed the caller already posted at least an acquire barrier. 
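 *
 * Informal sketch of the pairing this relies on (illustration only):
 *
 *	vfs_op_enter(mp);	(mnt_vfs_ops++, per-CPU counts folded in)
 *	... unmount / remount work ...
 *	vfs_op_exit(mp);	(mnt_vfs_ops--)
 *
 * While mnt_vfs_ops > 0 the per-CPU fast path stays disabled and the
 * central mnt_ref/mnt_lockref/mnt_writeopcount values are authoritative;
 * vfs_op_barrier_wait() spins until every CPU has left its
 * mnt_thread_in_ops_pcpu window.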
*/ void vfs_op_barrier_wait(struct mount *mp) { int *in_op; int cpu; CPU_FOREACH(cpu) { in_op = zpcpu_get_cpu(mp->mnt_thread_in_ops_pcpu, cpu); while (atomic_load_int(in_op)) cpu_spinwait(); } } #ifdef DIAGNOSTIC void vfs_assert_mount_counters(struct mount *mp) { int cpu; if (mp->mnt_vfs_ops == 0) return; CPU_FOREACH(cpu) { if (*(int *)zpcpu_get_cpu(mp->mnt_ref_pcpu, cpu) != 0 || *(int *)zpcpu_get_cpu(mp->mnt_lockref_pcpu, cpu) != 0 || *(int *)zpcpu_get_cpu(mp->mnt_writeopcount_pcpu, cpu) != 0) vfs_dump_mount_counters(mp); } } void vfs_dump_mount_counters(struct mount *mp) { int cpu, *count; int ref, lockref, writeopcount; printf("%s: mp %p vfs_ops %d\n", __func__, mp, mp->mnt_vfs_ops); printf(" ref : "); ref = mp->mnt_ref; CPU_FOREACH(cpu) { count = zpcpu_get_cpu(mp->mnt_ref_pcpu, cpu); printf("%d ", *count); ref += *count; } printf("\n"); printf(" lockref : "); lockref = mp->mnt_lockref; CPU_FOREACH(cpu) { count = zpcpu_get_cpu(mp->mnt_lockref_pcpu, cpu); printf("%d ", *count); lockref += *count; } printf("\n"); printf("writeopcount: "); writeopcount = mp->mnt_writeopcount; CPU_FOREACH(cpu) { count = zpcpu_get_cpu(mp->mnt_writeopcount_pcpu, cpu); printf("%d ", *count); writeopcount += *count; } printf("\n"); printf("counter struct total\n"); printf("ref %-5d %-5d\n", mp->mnt_ref, ref); printf("lockref %-5d %-5d\n", mp->mnt_lockref, lockref); printf("writeopcount %-5d %-5d\n", mp->mnt_writeopcount, writeopcount); panic("invalid counts on struct mount"); } #endif int vfs_mount_fetch_counter(struct mount *mp, enum mount_counter which) { int *base, *pcpu; int cpu, sum; switch (which) { case MNT_COUNT_REF: base = &mp->mnt_ref; pcpu = mp->mnt_ref_pcpu; break; case MNT_COUNT_LOCKREF: base = &mp->mnt_lockref; pcpu = mp->mnt_lockref_pcpu; break; case MNT_COUNT_WRITEOPCOUNT: base = &mp->mnt_writeopcount; pcpu = mp->mnt_writeopcount_pcpu; break; } sum = *base; CPU_FOREACH(cpu) { sum += *(int *)zpcpu_get_cpu(pcpu, cpu); } return (sum); } /* * Do the actual filesystem unmount. */ int dounmount(struct mount *mp, int flags, struct thread *td) { struct vnode *coveredvp; int error; uint64_t async_flag; int mnt_gen_r; if ((coveredvp = mp->mnt_vnodecovered) != NULL) { mnt_gen_r = mp->mnt_gen; VI_LOCK(coveredvp); vholdl(coveredvp); vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY); /* * Check for mp being unmounted while waiting for the * covered vnode lock. */ if (coveredvp->v_mountedhere != mp || coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) { VOP_UNLOCK(coveredvp, 0); vdrop(coveredvp); vfs_rel(mp); return (EBUSY); } } /* * Only privileged root, or (if MNT_USER is set) the user that did the * original mount is permitted to unmount this filesystem. */ error = vfs_suser(mp, td); if (error != 0) { if (coveredvp != NULL) { VOP_UNLOCK(coveredvp, 0); vdrop(coveredvp); } vfs_rel(mp); return (error); } vfs_op_enter(mp); vn_start_write(NULL, &mp, V_WAIT | V_MNTREF); MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 || (mp->mnt_flag & MNT_UPDATE) != 0 || !TAILQ_EMPTY(&mp->mnt_uppers)) { dounmount_cleanup(mp, coveredvp, 0); return (EBUSY); } mp->mnt_kern_flag |= MNTK_UNMOUNT; if (flags & MNT_NONBUSY) { MNT_IUNLOCK(mp); error = vfs_check_usecounts(mp); MNT_ILOCK(mp); if (error != 0) { dounmount_cleanup(mp, coveredvp, MNTK_UNMOUNT); return (error); } } /* Allow filesystems to detect that a forced unmount is in progress. */ if (flags & MNT_FORCE) { mp->mnt_kern_flag |= MNTK_UNMOUNTF; MNT_IUNLOCK(mp); /* * Must be done after setting MNTK_UNMOUNTF and before * waiting for mnt_lockref to become 0. 
*/ VFS_PURGE(mp); MNT_ILOCK(mp); } error = 0; if (mp->mnt_lockref) { mp->mnt_kern_flag |= MNTK_DRAINING; error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS, "mount drain", 0); } MNT_IUNLOCK(mp); KASSERT(mp->mnt_lockref == 0, ("%s: invalid lock refcount in the drain path @ %s:%d", __func__, __FILE__, __LINE__)); KASSERT(error == 0, ("%s: invalid return value for msleep in the drain path @ %s:%d", __func__, __FILE__, __LINE__)); if (mp->mnt_flag & MNT_EXPUBLIC) vfs_setpublicfs(NULL, NULL, NULL); /* * From now, we can claim that the use reference on the * coveredvp is ours, and the ref can be released only by * successfull unmount by us, or left for later unmount * attempt. The previously acquired hold reference is no * longer needed to protect the vnode from reuse. */ if (coveredvp != NULL) vdrop(coveredvp); vfs_msync(mp, MNT_WAIT); MNT_ILOCK(mp); async_flag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); cache_purgevfs(mp, false); /* remove cache entries for this file sys */ vfs_deallocate_syncvnode(mp); if ((mp->mnt_flag & MNT_RDONLY) != 0 || (flags & MNT_FORCE) != 0 || (error = VFS_SYNC(mp, MNT_WAIT)) == 0) error = VFS_UNMOUNT(mp, flags); vn_finished_write(mp); /* * If we failed to flush the dirty blocks for this mount point, * undo all the cdir/rdir and rootvnode changes we made above. * Unless we failed to do so because the device is reporting that * it doesn't exist anymore. */ if (error && error != ENXIO) { MNT_ILOCK(mp); if ((mp->mnt_flag & MNT_RDONLY) == 0) { MNT_IUNLOCK(mp); vfs_allocate_syncvnode(mp); MNT_ILOCK(mp); } mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF); mp->mnt_flag |= async_flag; if ((mp->mnt_flag & MNT_ASYNC) != 0 && (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) mp->mnt_kern_flag |= MNTK_ASYNC; if (mp->mnt_kern_flag & MNTK_MWAIT) { mp->mnt_kern_flag &= ~MNTK_MWAIT; wakeup(mp); } vfs_op_exit_locked(mp); MNT_IUNLOCK(mp); if (coveredvp) VOP_UNLOCK(coveredvp, 0); return (error); } mtx_lock(&mountlist_mtx); TAILQ_REMOVE(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); EVENTHANDLER_DIRECT_INVOKE(vfs_unmounted, mp, td); if (coveredvp != NULL) { coveredvp->v_mountedhere = NULL; VOP_UNLOCK(coveredvp, 0); } vfs_event_signal(NULL, VQ_UNMOUNT, 0); if (rootvnode != NULL && mp == rootvnode->v_mount) { vrele(rootvnode); rootvnode = NULL; } if (mp == rootdevmp) rootdevmp = NULL; vfs_mount_destroy(mp); return (0); } /* * Report errors during filesystem mounting. */ void vfs_mount_error(struct mount *mp, const char *fmt, ...) { struct vfsoptlist *moptlist = mp->mnt_optnew; va_list ap; int error, len; char *errmsg; error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len); if (error || errmsg == NULL || len <= 0) return; va_start(ap, fmt); vsnprintf(errmsg, (size_t)len, fmt, ap); va_end(ap); } void vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...) { va_list ap; int error, len; char *errmsg; error = vfs_getopt(opts, "errmsg", (void **)&errmsg, &len); if (error || errmsg == NULL || len <= 0) return; va_start(ap, fmt); vsnprintf(errmsg, (size_t)len, fmt, ap); va_end(ap); } /* * --------------------------------------------------------------------- * Functions for querying mount options/arguments from filesystems. 
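 *
 * A typical consumer is a filesystem's VFS_MOUNT() routine; an
 * illustrative, hypothetical sequence using the helpers below:
 *
 *	static const char *my_opts[] = { "from", "export", "ro", NULL };
 *
 *	if (vfs_filteropt(mp->mnt_optnew, my_opts) != 0)
 *		return (EINVAL);
 *	from = vfs_getopts(mp->mnt_optnew, "from", &error);
 *	vfs_flagopt(mp->mnt_optnew, "ro", &flags, MNT_RDONLY);
 *
 * The query helpers mark each option they find as seen; vfs_filteropt()
 * reports unknown options through the "errmsg" option when present.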
*/ /* * Check that no unknown options are given */ int vfs_filteropt(struct vfsoptlist *opts, const char **legal) { struct vfsopt *opt; char errmsg[255]; const char **t, *p, *q; int ret = 0; TAILQ_FOREACH(opt, opts, link) { p = opt->name; q = NULL; if (p[0] == 'n' && p[1] == 'o') q = p + 2; for(t = global_opts; *t != NULL; t++) { if (strcmp(*t, p) == 0) break; if (q != NULL) { if (strcmp(*t, q) == 0) break; } } if (*t != NULL) continue; for(t = legal; *t != NULL; t++) { if (strcmp(*t, p) == 0) break; if (q != NULL) { if (strcmp(*t, q) == 0) break; } } if (*t != NULL) continue; snprintf(errmsg, sizeof(errmsg), "mount option <%s> is unknown", p); ret = EINVAL; } if (ret != 0) { TAILQ_FOREACH(opt, opts, link) { if (strcmp(opt->name, "errmsg") == 0) { strncpy((char *)opt->value, errmsg, opt->len); break; } } if (opt == NULL) printf("%s\n", errmsg); } return (ret); } /* * Get a mount option by its name. * * Return 0 if the option was found, ENOENT otherwise. * If len is non-NULL it will be filled with the length * of the option. If buf is non-NULL, it will be filled * with the address of the option. */ int vfs_getopt(struct vfsoptlist *opts, const char *name, void **buf, int *len) { struct vfsopt *opt; KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; if (len != NULL) *len = opt->len; if (buf != NULL) *buf = opt->value; return (0); } } return (ENOENT); } int vfs_getopt_pos(struct vfsoptlist *opts, const char *name) { struct vfsopt *opt; if (opts == NULL) return (-1); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; return (opt->pos); } } return (-1); } int vfs_getopt_size(struct vfsoptlist *opts, const char *name, off_t *value) { char *opt_value, *vtp; quad_t iv; int error, opt_len; error = vfs_getopt(opts, name, (void **)&opt_value, &opt_len); if (error != 0) return (error); if (opt_len == 0 || opt_value == NULL) return (EINVAL); if (opt_value[0] == '\0' || opt_value[opt_len - 1] != '\0') return (EINVAL); iv = strtoq(opt_value, &vtp, 0); if (vtp == opt_value || (vtp[0] != '\0' && vtp[1] != '\0')) return (EINVAL); if (iv < 0) return (EINVAL); switch (vtp[0]) { case 't': case 'T': iv *= 1024; /* FALLTHROUGH */ case 'g': case 'G': iv *= 1024; /* FALLTHROUGH */ case 'm': case 'M': iv *= 1024; /* FALLTHROUGH */ case 'k': case 'K': iv *= 1024; case '\0': break; default: return (EINVAL); } *value = iv; return (0); } char * vfs_getopts(struct vfsoptlist *opts, const char *name, int *error) { struct vfsopt *opt; *error = 0; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->len == 0 || ((char *)opt->value)[opt->len - 1] != '\0') { *error = EINVAL; return (NULL); } return (opt->value); } *error = ENOENT; return (NULL); } int vfs_flagopt(struct vfsoptlist *opts, const char *name, uint64_t *w, uint64_t val) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; if (w != NULL) *w |= val; return (1); } } if (w != NULL) *w &= ~val; return (0); } int vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...) 
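/*
 * Scan a named mount option's value with vsscanf() semantics.  Returns the
 * number of conversions performed, or 0 if the option is absent or its
 * value is not a NUL-terminated string.  Illustration only (hypothetical
 * caller):
 *
 *	if (vfs_scanopt(mp->mnt_optnew, "timeout", "%d", &secs) != 1)
 *		secs = 30;
 */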
{ va_list ap; struct vfsopt *opt; int ret; KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->len == 0 || opt->value == NULL) return (0); if (((char *)opt->value)[opt->len - 1] != '\0') return (0); va_start(ap, fmt); ret = vsscanf(opt->value, fmt, ap); va_end(ap); return (ret); } return (0); } int vfs_setopt(struct vfsoptlist *opts, const char *name, void *value, int len) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->value == NULL) opt->len = len; else { if (opt->len != len) return (EINVAL); bcopy(value, opt->value, len); } return (0); } return (ENOENT); } int vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value, int len) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->value == NULL) opt->len = len; else { if (opt->len < len) return (EINVAL); opt->len = len; bcopy(value, opt->value, len); } return (0); } return (ENOENT); } int vfs_setopts(struct vfsoptlist *opts, const char *name, const char *value) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->value == NULL) opt->len = strlen(value) + 1; else if (strlcpy(opt->value, value, opt->len) >= opt->len) return (EINVAL); return (0); } return (ENOENT); } /* * Find and copy a mount option. * * The size of the buffer has to be specified * in len, if it is not the same length as the * mount option, EINVAL is returned. * Returns ENOENT if the option is not found. */ int vfs_copyopt(struct vfsoptlist *opts, const char *name, void *dest, int len) { struct vfsopt *opt; KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; if (len != opt->len) return (EINVAL); bcopy(opt->value, dest, opt->len); return (0); } } return (ENOENT); } int __vfs_statfs(struct mount *mp, struct statfs *sbp) { /* * Filesystems only fill in part of the structure for updates, we * have to read the entirety first to get all content. */ memcpy(sbp, &mp->mnt_stat, sizeof(*sbp)); /* * Set these in case the underlying filesystem fails to do so. */ sbp->f_version = STATFS_VERSION; sbp->f_namemax = NAME_MAX; sbp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; return (mp->mnt_op->vfs_statfs(mp, sbp)); } void vfs_mountedfrom(struct mount *mp, const char *from) { bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname); strlcpy(mp->mnt_stat.f_mntfromname, from, sizeof mp->mnt_stat.f_mntfromname); } /* * --------------------------------------------------------------------- * This is the api for building mount args and mounting filesystems from * inside the kernel. * * The API works by accumulation of individual args. First error is * latched. * * XXX: should be documented in new manpage kernel_mount(9) */ /* A memory allocation which must be freed when we are done */ struct mntaarg { SLIST_ENTRY(mntaarg) next; }; /* The header for the mount arguments */ struct mntarg { struct iovec *v; int len; int error; SLIST_HEAD(, mntaarg) list; }; /* * Add a boolean argument. * * flag is the boolean value. * name must start with "no". 
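 *
 * For illustration (not a normative contract): mount_argb(ma, 1, "noatime")
 * records the option as "atime", while mount_argb(ma, 0, "noatime") records
 * it as "noatime", since a true flag skips the leading "no".
 *
 * A hypothetical in-kernel caller of the accumulation API described above
 * might look like (sketch only):
 *
 *	struct mntarg *ma = NULL;
 *
 *	ma = mount_arg(ma, "fstype", "tmpfs", -1);
 *	ma = mount_arg(ma, "fspath", "/mnt", -1);
 *	ma = mount_argb(ma, 1, "noatime");
 *	error = kernel_mount(ma, MNT_RDONLY);
 *
 * kernel_mount() always frees the argument list; the first error latched
 * in the mntarg aborts the actual mount.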
*/ struct mntarg * mount_argb(struct mntarg *ma, int flag, const char *name) { KASSERT(name[0] == 'n' && name[1] == 'o', ("mount_argb(...,%s): name must start with 'no'", name)); return (mount_arg(ma, name + (flag ? 2 : 0), NULL, 0)); } /* * Add an argument printf style */ struct mntarg * mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...) { va_list ap; struct mntaarg *maa; struct sbuf *sb; int len; if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), M_MOUNT, M_WAITOK); ma->v[ma->len].iov_base = (void *)(uintptr_t)name; ma->v[ma->len].iov_len = strlen(name) + 1; ma->len++; sb = sbuf_new_auto(); va_start(ap, fmt); sbuf_vprintf(sb, fmt, ap); va_end(ap); sbuf_finish(sb); len = sbuf_len(sb) + 1; maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INSERT_HEAD(&ma->list, maa, next); bcopy(sbuf_data(sb), maa + 1, len); sbuf_delete(sb); ma->v[ma->len].iov_base = maa + 1; ma->v[ma->len].iov_len = len; ma->len++; return (ma); } /* * Add an argument which is a userland string. */ struct mntarg * mount_argsu(struct mntarg *ma, const char *name, const void *val, int len) { struct mntaarg *maa; char *tbuf; if (val == NULL) return (ma); if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INSERT_HEAD(&ma->list, maa, next); tbuf = (void *)(maa + 1); ma->error = copyinstr(val, tbuf, len, NULL); return (mount_arg(ma, name, tbuf, -1)); } /* * Plain argument. * * If length is -1, treat value as a C string. */ struct mntarg * mount_arg(struct mntarg *ma, const char *name, const void *val, int len) { if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), M_MOUNT, M_WAITOK); ma->v[ma->len].iov_base = (void *)(uintptr_t)name; ma->v[ma->len].iov_len = strlen(name) + 1; ma->len++; ma->v[ma->len].iov_base = (void *)(uintptr_t)val; if (len < 0) ma->v[ma->len].iov_len = strlen(val) + 1; else ma->v[ma->len].iov_len = len; ma->len++; return (ma); } /* * Free a mntarg structure */ static void free_mntarg(struct mntarg *ma) { struct mntaarg *maa; while (!SLIST_EMPTY(&ma->list)) { maa = SLIST_FIRST(&ma->list); SLIST_REMOVE_HEAD(&ma->list, next); free(maa, M_MOUNT); } free(ma->v, M_MOUNT); free(ma, M_MOUNT); } /* * Mount a filesystem */ int kernel_mount(struct mntarg *ma, uint64_t flags) { struct uio auio; int error; KASSERT(ma != NULL, ("kernel_mount NULL ma")); KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v")); KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len)); auio.uio_iov = ma->v; auio.uio_iovcnt = ma->len; auio.uio_segflg = UIO_SYSSPACE; error = ma->error; if (!error) error = vfs_donmount(curthread, flags, &auio); free_mntarg(ma); return (error); } /* * A printflike function to mount a filesystem. */ int kernel_vmount(int flags, ...) { struct mntarg *ma = NULL; va_list ap; const char *cp; const void *vp; int error; va_start(ap, flags); for (;;) { cp = va_arg(ap, const char *); if (cp == NULL) break; vp = va_arg(ap, const void *); ma = mount_arg(ma, cp, vp, (vp != NULL ? 
-1 : 0)); } va_end(ap); error = kernel_mount(ma, flags); return (error); } void vfs_oexport_conv(const struct oexport_args *oexp, struct export_args *exp) { bcopy(oexp, exp, sizeof(*oexp)); exp->ex_numsecflavors = 0; } Index: projects/clang900-import/sys/netinet/sctp_asconf.c =================================================================== --- projects/clang900-import/sys/netinet/sctp_asconf.c (revision 352586) +++ projects/clang900-import/sys/netinet/sctp_asconf.c (revision 352587) @@ -1,3482 +1,3483 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include /* * debug flags: * SCTP_DEBUG_ASCONF1: protocol info, general info and errors * SCTP_DEBUG_ASCONF2: detailed info */ /* * RFC 5061 * * An ASCONF parameter queue exists per asoc which holds the pending address * operations. Lists are updated upon receipt of ASCONF-ACK. * * A restricted_addrs list exists per assoc to hold local addresses that are * not (yet) usable by the assoc as a source address. These addresses are * either pending an ASCONF operation (and exist on the ASCONF parameter * queue), or they are permanently restricted (the peer has returned an * ERROR indication to an ASCONF(ADD), or the peer does not support ASCONF). * * Deleted addresses are always immediately removed from the lists as they will * (shortly) no longer exist in the kernel. We send ASCONFs as a courtesy, * only if allowed. */ /* * ASCONF parameter processing. * response_required: set if a reply is required (eg. SUCCESS_REPORT). * returns a mbuf to an "error" response parameter or NULL/"success" if ok. * FIX: allocating this many mbufs on the fly is pretty inefficient... 
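 *
 * For orientation (informal sketch derived from the handlers below): each
 * request parameter processed here is laid out as
 *
 *	struct sctp_asconf_paramhdr	(param type/length + correlation id)
 *	followed by one address parameter
 *	(struct sctp_ipv4addr_param or struct sctp_ipv6addr_param)
 *
 * and the reply parameter built for it is either a SCTP_SUCCESS_REPORT
 * header echoing the correlation id, or an SCTP_ERROR_CAUSE_IND header
 * wrapping a struct sctp_error_cause plus a copy of the offending TLV.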
*/ static struct mbuf * sctp_asconf_success_response(uint32_t id) { struct mbuf *m_reply = NULL; struct sctp_asconf_paramhdr *aph; m_reply = sctp_get_mbuf_for_msg(sizeof(struct sctp_asconf_paramhdr), 0, M_NOWAIT, 1, MT_DATA); if (m_reply == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "asconf_success_response: couldn't get mbuf!\n"); return (NULL); } aph = mtod(m_reply, struct sctp_asconf_paramhdr *); aph->correlation_id = id; aph->ph.param_type = htons(SCTP_SUCCESS_REPORT); aph->ph.param_length = sizeof(struct sctp_asconf_paramhdr); SCTP_BUF_LEN(m_reply) = aph->ph.param_length; aph->ph.param_length = htons(aph->ph.param_length); return (m_reply); } static struct mbuf * sctp_asconf_error_response(uint32_t id, uint16_t cause, uint8_t *error_tlv, uint16_t tlv_length) { struct mbuf *m_reply = NULL; struct sctp_asconf_paramhdr *aph; struct sctp_error_cause *error; uint8_t *tlv; m_reply = sctp_get_mbuf_for_msg((sizeof(struct sctp_asconf_paramhdr) + tlv_length + sizeof(struct sctp_error_cause)), 0, M_NOWAIT, 1, MT_DATA); if (m_reply == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "asconf_error_response: couldn't get mbuf!\n"); return (NULL); } aph = mtod(m_reply, struct sctp_asconf_paramhdr *); error = (struct sctp_error_cause *)(aph + 1); aph->correlation_id = id; aph->ph.param_type = htons(SCTP_ERROR_CAUSE_IND); error->code = htons(cause); error->length = tlv_length + sizeof(struct sctp_error_cause); aph->ph.param_length = error->length + sizeof(struct sctp_asconf_paramhdr); if (aph->ph.param_length > MLEN) { SCTPDBG(SCTP_DEBUG_ASCONF1, "asconf_error_response: tlv_length (%xh) too big\n", tlv_length); sctp_m_freem(m_reply); /* discard */ return (NULL); } if (error_tlv != NULL) { tlv = (uint8_t *)(error + 1); memcpy(tlv, error_tlv, tlv_length); } SCTP_BUF_LEN(m_reply) = aph->ph.param_length; error->length = htons(error->length); aph->ph.param_length = htons(aph->ph.param_length); return (m_reply); } static struct mbuf * sctp_process_asconf_add_ip(struct sockaddr *src, struct sctp_asconf_paramhdr *aph, struct sctp_tcb *stcb, int send_hb, int response_required) { struct sctp_nets *net; struct mbuf *m_reply = NULL; union sctp_sockstore store; struct sctp_paramhdr *ph; uint16_t param_type, aparam_length; #if defined(INET) || defined(INET6) uint16_t param_length; #endif struct sockaddr *sa; int zero_address = 0; int bad_address = 0; #ifdef INET struct sockaddr_in *sin; struct sctp_ipv4addr_param *v4addr; #endif #ifdef INET6 struct sockaddr_in6 *sin6; struct sctp_ipv6addr_param *v6addr; #endif aparam_length = ntohs(aph->ph.param_length); ph = (struct sctp_paramhdr *)(aph + 1); param_type = ntohs(ph->param_type); #if defined(INET) || defined(INET6) param_length = ntohs(ph->param_length); #endif sa = &store.sa; switch (param_type) { #ifdef INET case SCTP_IPV4_ADDRESS: if (param_length != sizeof(struct sctp_ipv4addr_param)) { /* invalid param size */ return (NULL); } v4addr = (struct sctp_ipv4addr_param *)ph; sin = &store.sin; memset(sin, 0, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(struct sockaddr_in); sin->sin_port = stcb->rport; sin->sin_addr.s_addr = v4addr->addr; if ((sin->sin_addr.s_addr == INADDR_BROADCAST) || IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { bad_address = 1; } if (sin->sin_addr.s_addr == INADDR_ANY) zero_address = 1; SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_add_ip: adding "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); break; #endif #ifdef INET6 case SCTP_IPV6_ADDRESS: if (param_length != sizeof(struct sctp_ipv6addr_param)) { /* invalid param size */ return (NULL); } v6addr = 
(struct sctp_ipv6addr_param *)ph; sin6 = &store.sin6; memset(sin6, 0, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(struct sockaddr_in6); sin6->sin6_port = stcb->rport; memcpy((caddr_t)&sin6->sin6_addr, v6addr->addr, sizeof(struct in6_addr)); if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { bad_address = 1; } if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) zero_address = 1; SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_add_ip: adding "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); break; #endif default: m_reply = sctp_asconf_error_response(aph->correlation_id, SCTP_CAUSE_INVALID_PARAM, (uint8_t *)aph, aparam_length); return (m_reply); } /* end switch */ /* if 0.0.0.0/::0, add the source address instead */ if (zero_address && SCTP_BASE_SYSCTL(sctp_nat_friendly)) { sa = src; SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_add_ip: using source addr "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, src); } /* add the address */ if (bad_address) { m_reply = sctp_asconf_error_response(aph->correlation_id, SCTP_CAUSE_INVALID_PARAM, (uint8_t *)aph, aparam_length); } else if (sctp_add_remote_addr(stcb, sa, &net, stcb->asoc.port, SCTP_DONOT_SETSCOPE, SCTP_ADDR_DYNAMIC_ADDED) != 0) { SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_add_ip: error adding address\n"); m_reply = sctp_asconf_error_response(aph->correlation_id, SCTP_CAUSE_RESOURCE_SHORTAGE, (uint8_t *)aph, aparam_length); } else { /* notify upper layer */ sctp_ulp_notify(SCTP_NOTIFY_ASCONF_ADD_IP, stcb, 0, sa, SCTP_SO_NOT_LOCKED); if (response_required) { m_reply = sctp_asconf_success_response(aph->correlation_id); } sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, stcb->sctp_ep, stcb, net); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net); if (send_hb) { sctp_send_hb(stcb, net, SCTP_SO_NOT_LOCKED); } } return (m_reply); } static int sctp_asconf_del_remote_addrs_except(struct sctp_tcb *stcb, struct sockaddr *src) { struct sctp_nets *src_net, *net; /* make sure the source address exists as a destination net */ src_net = sctp_findnet(stcb, src); if (src_net == NULL) { /* not found */ return (-1); } /* delete all destination addresses except the source */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (net != src_net) { /* delete this address */ sctp_remove_net(stcb, net); SCTPDBG(SCTP_DEBUG_ASCONF1, "asconf_del_remote_addrs_except: deleting "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, (struct sockaddr *)&net->ro._l_addr); /* notify upper layer */ sctp_ulp_notify(SCTP_NOTIFY_ASCONF_DELETE_IP, stcb, 0, (struct sockaddr *)&net->ro._l_addr, SCTP_SO_NOT_LOCKED); } } return (0); } static struct mbuf * sctp_process_asconf_delete_ip(struct sockaddr *src, struct sctp_asconf_paramhdr *aph, struct sctp_tcb *stcb, int response_required) { struct mbuf *m_reply = NULL; union sctp_sockstore store; struct sctp_paramhdr *ph; uint16_t param_type, aparam_length; #if defined(INET) || defined(INET6) uint16_t param_length; #endif struct sockaddr *sa; int zero_address = 0; int result; #ifdef INET struct sockaddr_in *sin; struct sctp_ipv4addr_param *v4addr; #endif #ifdef INET6 struct sockaddr_in6 *sin6; struct sctp_ipv6addr_param *v6addr; #endif aparam_length = ntohs(aph->ph.param_length); ph = (struct sctp_paramhdr *)(aph + 1); param_type = ntohs(ph->param_type); #if defined(INET) || defined(INET6) param_length = ntohs(ph->param_length); #endif sa = &store.sa; switch (param_type) { #ifdef INET case SCTP_IPV4_ADDRESS: if (param_length != sizeof(struct sctp_ipv4addr_param)) { /* invalid param size */ return (NULL); } v4addr = (struct sctp_ipv4addr_param 
*)ph; sin = &store.sin; memset(sin, 0, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(struct sockaddr_in); sin->sin_port = stcb->rport; sin->sin_addr.s_addr = v4addr->addr; if (sin->sin_addr.s_addr == INADDR_ANY) zero_address = 1; SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_delete_ip: deleting "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); break; #endif #ifdef INET6 case SCTP_IPV6_ADDRESS: if (param_length != sizeof(struct sctp_ipv6addr_param)) { /* invalid param size */ return (NULL); } v6addr = (struct sctp_ipv6addr_param *)ph; sin6 = &store.sin6; memset(sin6, 0, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(struct sockaddr_in6); sin6->sin6_port = stcb->rport; memcpy(&sin6->sin6_addr, v6addr->addr, sizeof(struct in6_addr)); if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) zero_address = 1; SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_delete_ip: deleting "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); break; #endif default: m_reply = sctp_asconf_error_response(aph->correlation_id, SCTP_CAUSE_UNRESOLVABLE_ADDR, (uint8_t *)aph, aparam_length); return (m_reply); } /* make sure the source address is not being deleted */ if (sctp_cmpaddr(sa, src)) { /* trying to delete the source address! */ SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_delete_ip: tried to delete source addr\n"); m_reply = sctp_asconf_error_response(aph->correlation_id, SCTP_CAUSE_DELETING_SRC_ADDR, (uint8_t *)aph, aparam_length); return (m_reply); } /* if deleting 0.0.0.0/::0, delete all addresses except src addr */ if (zero_address && SCTP_BASE_SYSCTL(sctp_nat_friendly)) { result = sctp_asconf_del_remote_addrs_except(stcb, src); if (result) { /* src address did not exist? */ SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_delete_ip: src addr does not exist?\n"); /* what error to reply with?? 
*/ m_reply = sctp_asconf_error_response(aph->correlation_id, SCTP_CAUSE_REQUEST_REFUSED, (uint8_t *)aph, aparam_length); } else if (response_required) { m_reply = sctp_asconf_success_response(aph->correlation_id); } return (m_reply); } /* delete the address */ result = sctp_del_remote_addr(stcb, sa); /* * note if result == -2, the address doesn't exist in the asoc but * since it's being deleted anyways, we just ack the delete -- but * this probably means something has already gone awry */ if (result == -1) { /* only one address in the asoc */ SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_delete_ip: tried to delete last IP addr!\n"); m_reply = sctp_asconf_error_response(aph->correlation_id, SCTP_CAUSE_DELETING_LAST_ADDR, (uint8_t *)aph, aparam_length); } else { if (response_required) { m_reply = sctp_asconf_success_response(aph->correlation_id); } /* notify upper layer */ sctp_ulp_notify(SCTP_NOTIFY_ASCONF_DELETE_IP, stcb, 0, sa, SCTP_SO_NOT_LOCKED); } return (m_reply); } static struct mbuf * sctp_process_asconf_set_primary(struct sockaddr *src, struct sctp_asconf_paramhdr *aph, struct sctp_tcb *stcb, int response_required) { struct mbuf *m_reply = NULL; union sctp_sockstore store; struct sctp_paramhdr *ph; uint16_t param_type, aparam_length; #if defined(INET) || defined(INET6) uint16_t param_length; #endif struct sockaddr *sa; int zero_address = 0; #ifdef INET struct sockaddr_in *sin; struct sctp_ipv4addr_param *v4addr; #endif #ifdef INET6 struct sockaddr_in6 *sin6; struct sctp_ipv6addr_param *v6addr; #endif aparam_length = ntohs(aph->ph.param_length); ph = (struct sctp_paramhdr *)(aph + 1); param_type = ntohs(ph->param_type); #if defined(INET) || defined(INET6) param_length = ntohs(ph->param_length); #endif sa = &store.sa; switch (param_type) { #ifdef INET case SCTP_IPV4_ADDRESS: if (param_length != sizeof(struct sctp_ipv4addr_param)) { /* invalid param size */ return (NULL); } v4addr = (struct sctp_ipv4addr_param *)ph; sin = &store.sin; memset(sin, 0, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(struct sockaddr_in); sin->sin_addr.s_addr = v4addr->addr; if (sin->sin_addr.s_addr == INADDR_ANY) zero_address = 1; SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_set_primary: "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); break; #endif #ifdef INET6 case SCTP_IPV6_ADDRESS: if (param_length != sizeof(struct sctp_ipv6addr_param)) { /* invalid param size */ return (NULL); } v6addr = (struct sctp_ipv6addr_param *)ph; sin6 = &store.sin6; memset(sin6, 0, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(struct sockaddr_in6); memcpy((caddr_t)&sin6->sin6_addr, v6addr->addr, sizeof(struct in6_addr)); if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) zero_address = 1; SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_set_primary: "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); break; #endif default: m_reply = sctp_asconf_error_response(aph->correlation_id, SCTP_CAUSE_UNRESOLVABLE_ADDR, (uint8_t *)aph, aparam_length); return (m_reply); } /* if 0.0.0.0/::0, use the source address instead */ if (zero_address && SCTP_BASE_SYSCTL(sctp_nat_friendly)) { sa = src; SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_set_primary: using source addr "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, src); } /* set the primary address */ if (sctp_set_primary_addr(stcb, sa, NULL) == 0) { SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_set_primary: primary address set\n"); /* notify upper layer */ sctp_ulp_notify(SCTP_NOTIFY_ASCONF_SET_PRIMARY, stcb, 0, sa, SCTP_SO_NOT_LOCKED); if ((stcb->asoc.primary_destination->dest_state & 
SCTP_ADDR_REACHABLE) && (!(stcb->asoc.primary_destination->dest_state & SCTP_ADDR_PF)) && (stcb->asoc.alternate)) { sctp_free_remote_addr(stcb->asoc.alternate); stcb->asoc.alternate = NULL; } if (response_required) { m_reply = sctp_asconf_success_response(aph->correlation_id); } /* * Mobility adaptation. Ideally, when the reception of SET * PRIMARY with DELETE IP ADDRESS of the previous primary * destination, unacknowledged DATA are retransmitted * immediately to the new primary destination for seamless * handover. If the destination is UNCONFIRMED and marked to * REQ_PRIM, The retransmission occur when reception of the * HEARTBEAT-ACK. (See sctp_handle_heartbeat_ack in * sctp_input.c) Also, when change of the primary * destination, it is better that all subsequent new DATA * containing already queued DATA are transmitted to the new * primary destination. (by micchie) */ if ((sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_BASE) || sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_FASTHANDOFF)) && sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_PRIM_DELETED) && (stcb->asoc.primary_destination->dest_state & SCTP_ADDR_UNCONFIRMED) == 0) { sctp_timer_stop(SCTP_TIMER_TYPE_PRIM_DELETED, stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_ASCONF + SCTP_LOC_1); if (sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_FASTHANDOFF)) { sctp_assoc_immediate_retrans(stcb, stcb->asoc.primary_destination); } if (sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_BASE)) { sctp_move_chunks_from_net(stcb, stcb->asoc.deleted_primary); } sctp_delete_prim_timer(stcb->sctp_ep, stcb, stcb->asoc.deleted_primary); } } else { /* couldn't set the requested primary address! */ SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_set_primary: set primary failed!\n"); /* must have been an invalid address, so report */ m_reply = sctp_asconf_error_response(aph->correlation_id, SCTP_CAUSE_UNRESOLVABLE_ADDR, (uint8_t *)aph, aparam_length); } return (m_reply); } /* * handles an ASCONF chunk. * if all parameters are processed ok, send a plain (empty) ASCONF-ACK */ void sctp_handle_asconf(struct mbuf *m, unsigned int offset, struct sockaddr *src, struct sctp_asconf_chunk *cp, struct sctp_tcb *stcb, int first) { struct sctp_association *asoc; uint32_t serial_num; struct mbuf *n, *m_ack, *m_result, *m_tail; struct sctp_asconf_ack_chunk *ack_cp; struct sctp_asconf_paramhdr *aph; struct sctp_ipv6addr_param *p_addr; unsigned int asconf_limit, cnt; int error = 0; /* did an error occur? 
*/ /* asconf param buffer */ uint8_t aparam_buf[SCTP_PARAM_BUFFER_SIZE]; struct sctp_asconf_ack *ack, *ack_next; /* verify minimum length */ if (ntohs(cp->ch.chunk_length) < sizeof(struct sctp_asconf_chunk)) { SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: chunk too small = %xh\n", ntohs(cp->ch.chunk_length)); return; } asoc = &stcb->asoc; serial_num = ntohl(cp->serial_number); if (SCTP_TSN_GE(asoc->asconf_seq_in, serial_num)) { /* got a duplicate ASCONF */ SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: got duplicate serial number = %xh\n", serial_num); return; } else if (serial_num != (asoc->asconf_seq_in + 1)) { SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: incorrect serial number = %xh (expected next = %xh)\n", serial_num, asoc->asconf_seq_in + 1); return; } /* it's the expected "next" sequence number, so process it */ asoc->asconf_seq_in = serial_num; /* update sequence */ /* get length of all the param's in the ASCONF */ asconf_limit = offset + ntohs(cp->ch.chunk_length); SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: asconf_limit=%u, sequence=%xh\n", asconf_limit, serial_num); if (first) { /* delete old cache */ SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: Now processing first ASCONF. Try to delete old cache\n"); TAILQ_FOREACH_SAFE(ack, &asoc->asconf_ack_sent, next, ack_next) { if (ack->serial_number == serial_num) break; SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: delete old(%u) < first(%u)\n", ack->serial_number, serial_num); TAILQ_REMOVE(&asoc->asconf_ack_sent, ack, next); if (ack->data != NULL) { sctp_m_freem(ack->data); } SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asconf_ack), ack); } } m_ack = sctp_get_mbuf_for_msg(sizeof(struct sctp_asconf_ack_chunk), 0, M_NOWAIT, 1, MT_DATA); if (m_ack == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: couldn't get mbuf!\n"); return; } m_tail = m_ack; /* current reply chain's tail */ /* fill in ASCONF-ACK header */ ack_cp = mtod(m_ack, struct sctp_asconf_ack_chunk *); ack_cp->ch.chunk_type = SCTP_ASCONF_ACK; ack_cp->ch.chunk_flags = 0; ack_cp->serial_number = htonl(serial_num); /* set initial lengths (eg. just an ASCONF-ACK), ntohx at the end! */ SCTP_BUF_LEN(m_ack) = sizeof(struct sctp_asconf_ack_chunk); ack_cp->ch.chunk_length = sizeof(struct sctp_asconf_ack_chunk); /* skip the lookup address parameter */ offset += sizeof(struct sctp_asconf_chunk); p_addr = (struct sctp_ipv6addr_param *)sctp_m_getptr(m, offset, sizeof(struct sctp_paramhdr), (uint8_t *)&aparam_buf); if (p_addr == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: couldn't get lookup addr!\n"); /* respond with a missing/invalid mandatory parameter error */ sctp_m_freem(m_ack); return; } /* param_length is already validated in process_control... */ offset += ntohs(p_addr->ph.param_length); /* skip lookup addr */ /* get pointer to first asconf param in ASCONF */ aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(m, offset, sizeof(struct sctp_asconf_paramhdr), (uint8_t *)&aparam_buf); if (aph == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "Empty ASCONF received?\n"); goto send_reply; } /* process through all parameters */ cnt = 0; while (aph != NULL) { unsigned int param_length, param_type; param_type = ntohs(aph->ph.param_type); param_length = ntohs(aph->ph.param_length); if (offset + param_length > asconf_limit) { /* parameter goes beyond end of chunk! 
*/ sctp_m_freem(m_ack); return; } m_result = NULL; if (param_length > sizeof(aparam_buf)) { SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: param length (%u) larger than buffer size!\n", param_length); sctp_m_freem(m_ack); return; } if (param_length <= sizeof(struct sctp_paramhdr)) { SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: param length (%u) too short\n", param_length); sctp_m_freem(m_ack); + return; } /* get the entire parameter */ aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(m, offset, param_length, aparam_buf); if (aph == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: couldn't get entire param\n"); sctp_m_freem(m_ack); return; } switch (param_type) { case SCTP_ADD_IP_ADDRESS: m_result = sctp_process_asconf_add_ip(src, aph, stcb, (cnt < SCTP_BASE_SYSCTL(sctp_hb_maxburst)), error); cnt++; break; case SCTP_DEL_IP_ADDRESS: m_result = sctp_process_asconf_delete_ip(src, aph, stcb, error); break; case SCTP_ERROR_CAUSE_IND: /* not valid in an ASCONF chunk */ break; case SCTP_SET_PRIM_ADDR: m_result = sctp_process_asconf_set_primary(src, aph, stcb, error); break; case SCTP_NAT_VTAGS: SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: sees a NAT VTAG state parameter\n"); break; case SCTP_SUCCESS_REPORT: /* not valid in an ASCONF chunk */ break; case SCTP_ULP_ADAPTATION: /* FIX */ break; default: if ((param_type & 0x8000) == 0) { /* Been told to STOP at this param */ asconf_limit = offset; /* * FIX FIX - We need to call * sctp_arethere_unrecognized_parameters() * to get a operr and send it for any * param's with the 0x4000 bit set OR do it * here ourselves... note we still must STOP * if the 0x8000 bit is clear. */ } /* unknown/invalid param type */ break; } /* switch */ /* add any (error) result to the reply mbuf chain */ if (m_result != NULL) { SCTP_BUF_NEXT(m_tail) = m_result; m_tail = m_result; /* update lengths, make sure it's aligned too */ SCTP_BUF_LEN(m_result) = SCTP_SIZE32(SCTP_BUF_LEN(m_result)); ack_cp->ch.chunk_length += SCTP_BUF_LEN(m_result); /* set flag to force success reports */ error = 1; } offset += SCTP_SIZE32(param_length); /* update remaining ASCONF message length to process */ if (offset >= asconf_limit) { /* no more data in the mbuf chain */ break; } /* get pointer to next asconf param */ aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(m, offset, sizeof(struct sctp_asconf_paramhdr), (uint8_t *)&aparam_buf); if (aph == NULL) { /* can't get an asconf paramhdr */ SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: can't get asconf param hdr!\n"); /* FIX ME - add error here... 
*/ } } send_reply: ack_cp->ch.chunk_length = htons(ack_cp->ch.chunk_length); /* save the ASCONF-ACK reply */ ack = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_asconf_ack), struct sctp_asconf_ack); if (ack == NULL) { sctp_m_freem(m_ack); return; } ack->serial_number = serial_num; ack->last_sent_to = NULL; ack->data = m_ack; ack->len = 0; for (n = m_ack; n != NULL; n = SCTP_BUF_NEXT(n)) { ack->len += SCTP_BUF_LEN(n); } TAILQ_INSERT_TAIL(&stcb->asoc.asconf_ack_sent, ack, next); /* see if last_control_chunk_from is set properly (use IP src addr) */ if (stcb->asoc.last_control_chunk_from == NULL) { /* * this could happen if the source address was just newly * added */ SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: looking up net for IP source address\n"); SCTPDBG(SCTP_DEBUG_ASCONF1, "Looking for IP source: "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, src); /* look up the from address */ stcb->asoc.last_control_chunk_from = sctp_findnet(stcb, src); #ifdef SCTP_DEBUG if (stcb->asoc.last_control_chunk_from == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: IP source address not found?!\n"); } #endif } } /* * does the address match? returns 0 if not, 1 if so */ static uint32_t sctp_asconf_addr_match(struct sctp_asconf_addr *aa, struct sockaddr *sa) { switch (sa->sa_family) { #ifdef INET6 case AF_INET6: { /* XXX scopeid */ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa; if ((aa->ap.addrp.ph.param_type == SCTP_IPV6_ADDRESS) && (memcmp(&aa->ap.addrp.addr, &sin6->sin6_addr, sizeof(struct in6_addr)) == 0)) { return (1); } break; } #endif #ifdef INET case AF_INET: { struct sockaddr_in *sin = (struct sockaddr_in *)sa; if ((aa->ap.addrp.ph.param_type == SCTP_IPV4_ADDRESS) && (memcmp(&aa->ap.addrp.addr, &sin->sin_addr, sizeof(struct in_addr)) == 0)) { return (1); } break; } #endif default: break; } return (0); } /* * does the address match? returns 0 if not, 1 if so */ static uint32_t sctp_addr_match(struct sctp_paramhdr *ph, struct sockaddr *sa) { #if defined(INET) || defined(INET6) uint16_t param_type, param_length; param_type = ntohs(ph->param_type); param_length = ntohs(ph->param_length); #endif switch (sa->sa_family) { #ifdef INET6 case AF_INET6: { /* XXX scopeid */ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa; struct sctp_ipv6addr_param *v6addr; v6addr = (struct sctp_ipv6addr_param *)ph; if ((param_type == SCTP_IPV6_ADDRESS) && (param_length == sizeof(struct sctp_ipv6addr_param)) && (memcmp(&v6addr->addr, &sin6->sin6_addr, sizeof(struct in6_addr)) == 0)) { return (1); } break; } #endif #ifdef INET case AF_INET: { struct sockaddr_in *sin = (struct sockaddr_in *)sa; struct sctp_ipv4addr_param *v4addr; v4addr = (struct sctp_ipv4addr_param *)ph; if ((param_type == SCTP_IPV4_ADDRESS) && (param_length == sizeof(struct sctp_ipv4addr_param)) && (memcmp(&v4addr->addr, &sin->sin_addr, sizeof(struct in_addr)) == 0)) { return (1); } break; } #endif default: break; } return (0); } /* * Cleanup for non-responded/OP ERR'd ASCONF */ void sctp_asconf_cleanup(struct sctp_tcb *stcb, struct sctp_nets *net) { /* * clear out any existing asconfs going out */ sctp_timer_stop(SCTP_TIMER_TYPE_ASCONF, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_ASCONF + SCTP_LOC_2); stcb->asoc.asconf_seq_out_acked = stcb->asoc.asconf_seq_out; /* remove the old ASCONF on our outbound queue */ sctp_toss_old_asconf(stcb); } /* * cleanup any cached source addresses that may be topologically * incorrect after a new address has been added to this interface. 
*/ static void sctp_asconf_nets_cleanup(struct sctp_tcb *stcb, struct sctp_ifn *ifn) { struct sctp_nets *net; /* * Ideally, we want to only clear cached routes and source addresses * that are topologically incorrect. But since there is no easy way * to know whether the newly added address on the ifn would cause a * routing change (i.e. a new egress interface would be chosen) * without doing a new routing lookup and source address selection, * we will (for now) just flush any cached route using a different * ifn (and cached source addrs) and let output re-choose them * during the next send on that net. */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { /* * clear any cached route (and cached source address) if the * route's interface is NOT the same as the address change. * If it's the same interface, just clear the cached source * address. */ if (SCTP_ROUTE_HAS_VALID_IFN(&net->ro) && ((ifn == NULL) || (SCTP_GET_IF_INDEX_FROM_ROUTE(&net->ro) != ifn->ifn_index))) { /* clear any cached route */ RTFREE(net->ro.ro_rt); net->ro.ro_rt = NULL; } /* clear any cached source address */ if (net->src_addr_selected) { sctp_free_ifa(net->ro._s_addr); net->ro._s_addr = NULL; net->src_addr_selected = 0; } } } void sctp_assoc_immediate_retrans(struct sctp_tcb *stcb, struct sctp_nets *dstnet) { int error; if (dstnet->dest_state & SCTP_ADDR_UNCONFIRMED) { return; } if (stcb->asoc.deleted_primary == NULL) { return; } if (!TAILQ_EMPTY(&stcb->asoc.sent_queue)) { SCTPDBG(SCTP_DEBUG_ASCONF1, "assoc_immediate_retrans: Deleted primary is "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, &stcb->asoc.deleted_primary->ro._l_addr.sa); SCTPDBG(SCTP_DEBUG_ASCONF1, "Current Primary is "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, &stcb->asoc.primary_destination->ro._l_addr.sa); sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, stcb->asoc.deleted_primary, SCTP_FROM_SCTP_ASCONF + SCTP_LOC_3); stcb->asoc.num_send_timers_up--; if (stcb->asoc.num_send_timers_up < 0) { stcb->asoc.num_send_timers_up = 0; } SCTP_TCB_LOCK_ASSERT(stcb); error = sctp_t3rxt_timer(stcb->sctp_ep, stcb, stcb->asoc.deleted_primary); if (error) { SCTP_INP_DECR_REF(stcb->sctp_ep); return; } SCTP_TCB_LOCK_ASSERT(stcb); #ifdef SCTP_AUDITING_ENABLED sctp_auditing(4, stcb->sctp_ep, stcb, stcb->asoc.deleted_primary); #endif sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_NOT_LOCKED); if ((stcb->asoc.num_send_timers_up == 0) && (stcb->asoc.sent_queue_cnt > 0)) { struct sctp_tmit_chunk *chk; chk = TAILQ_FIRST(&stcb->asoc.sent_queue); sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, chk->whoTo); } } return; } static int sctp_asconf_queue_mgmt(struct sctp_tcb *, struct sctp_ifa *, uint16_t); void sctp_net_immediate_retrans(struct sctp_tcb *stcb, struct sctp_nets *net) { struct sctp_tmit_chunk *chk; SCTPDBG(SCTP_DEBUG_ASCONF1, "net_immediate_retrans: RTO is %d\n", net->RTO); sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_ASCONF + SCTP_LOC_4); stcb->asoc.cc_functions.sctp_set_initial_cc_param(stcb, net); net->error_count = 0; TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) { if (chk->whoTo == net) { if (chk->sent < SCTP_DATAGRAM_RESEND) { chk->sent = SCTP_DATAGRAM_RESEND; sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); sctp_flight_size_decrease(chk); sctp_total_flight_decrease(stcb, chk); net->marked_retrans++; stcb->asoc.marked_retrans++; } } } if (net->marked_retrans) { sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_NOT_LOCKED); } } static void sctp_path_check_and_react(struct 
sctp_tcb *stcb, struct sctp_ifa *newifa) { struct sctp_nets *net; int addrnum, changed; /* * If number of local valid addresses is 1, the valid address is * probably newly added address. Several valid addresses in this * association. A source address may not be changed. Additionally, * they can be configured on a same interface as "alias" addresses. * (by micchie) */ addrnum = sctp_local_addr_count(stcb); SCTPDBG(SCTP_DEBUG_ASCONF1, "p_check_react(): %d local addresses\n", addrnum); if (addrnum == 1) { TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { /* clear any cached route and source address */ if (net->ro.ro_rt) { RTFREE(net->ro.ro_rt); net->ro.ro_rt = NULL; } if (net->src_addr_selected) { sctp_free_ifa(net->ro._s_addr); net->ro._s_addr = NULL; net->src_addr_selected = 0; } /* Retransmit unacknowledged DATA chunks immediately */ if (sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_FASTHANDOFF)) { sctp_net_immediate_retrans(stcb, net); } /* also, SET PRIMARY is maybe already sent */ } return; } /* Multiple local addresses exsist in the association. */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { /* clear any cached route and source address */ if (net->ro.ro_rt) { RTFREE(net->ro.ro_rt); net->ro.ro_rt = NULL; } if (net->src_addr_selected) { sctp_free_ifa(net->ro._s_addr); net->ro._s_addr = NULL; net->src_addr_selected = 0; } /* * Check if the nexthop is corresponding to the new address. * If the new address is corresponding to the current * nexthop, the path will be changed. If the new address is * NOT corresponding to the current nexthop, the path will * not be changed. */ SCTP_RTALLOC((sctp_route_t *)&net->ro, stcb->sctp_ep->def_vrf_id, stcb->sctp_ep->fibnum); if (net->ro.ro_rt == NULL) continue; changed = 0; switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: if (sctp_v4src_match_nexthop(newifa, (sctp_route_t *)&net->ro)) { changed = 1; } break; #endif #ifdef INET6 case AF_INET6: if (sctp_v6src_match_nexthop( &newifa->address.sin6, (sctp_route_t *)&net->ro)) { changed = 1; } break; #endif default: break; } /* * if the newly added address does not relate routing * information, we skip. */ if (changed == 0) continue; /* Retransmit unacknowledged DATA chunks immediately */ if (sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_FASTHANDOFF)) { sctp_net_immediate_retrans(stcb, net); } /* Send SET PRIMARY for this new address */ if (net == stcb->asoc.primary_destination) { (void)sctp_asconf_queue_mgmt(stcb, newifa, SCTP_SET_PRIM_ADDR); } } } /* * process an ADD/DELETE IP ack from peer. * addr: corresponding sctp_ifa to the address being added/deleted. * type: SCTP_ADD_IP_ADDRESS or SCTP_DEL_IP_ADDRESS. * flag: 1=success, 0=failure. */ static void sctp_asconf_addr_mgmt_ack(struct sctp_tcb *stcb, struct sctp_ifa *addr, uint32_t flag) { /* * do the necessary asoc list work- if we get a failure indication, * leave the address on the assoc's restricted list. If we get a * success indication, remove the address from the restricted list. */ /* * Note: this will only occur for ADD_IP_ADDRESS, since * DEL_IP_ADDRESS is never actually added to the list... 
*/ if (flag) { /* success case, so remove from the restricted list */ sctp_del_local_addr_restricted(stcb, addr); if (sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_BASE) || sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_FASTHANDOFF)) { sctp_path_check_and_react(stcb, addr); return; } /* clear any cached/topologically incorrect source addresses */ sctp_asconf_nets_cleanup(stcb, addr->ifn_p); } /* else, leave it on the list */ } /* * add an asconf add/delete/set primary IP address parameter to the queue. * type = SCTP_ADD_IP_ADDRESS, SCTP_DEL_IP_ADDRESS, SCTP_SET_PRIM_ADDR. * returns 0 if queued, -1 if not queued/removed. * NOTE: if adding, but a delete for the same address is already scheduled * (and not yet sent out), simply remove it from queue. Same for deleting * an address already scheduled for add. If a duplicate operation is found, * ignore the new one. */ static int sctp_asconf_queue_mgmt(struct sctp_tcb *stcb, struct sctp_ifa *ifa, uint16_t type) { struct sctp_asconf_addr *aa, *aa_next; /* make sure the request isn't already in the queue */ TAILQ_FOREACH_SAFE(aa, &stcb->asoc.asconf_queue, next, aa_next) { /* address match? */ if (sctp_asconf_addr_match(aa, &ifa->address.sa) == 0) continue; /* * is the request already in queue but not sent? pass the * request already sent in order to resolve the following * case: 1. arrival of ADD, then sent 2. arrival of DEL. we * can't remove the ADD request already sent 3. arrival of * ADD */ if (aa->ap.aph.ph.param_type == type && aa->sent == 0) { return (-1); } /* is the negative request already in queue, and not sent */ if ((aa->sent == 0) && (type == SCTP_ADD_IP_ADDRESS) && (aa->ap.aph.ph.param_type == SCTP_DEL_IP_ADDRESS)) { /* add requested, delete already queued */ TAILQ_REMOVE(&stcb->asoc.asconf_queue, aa, next); /* remove the ifa from the restricted list */ sctp_del_local_addr_restricted(stcb, ifa); /* free the asconf param */ SCTP_FREE(aa, SCTP_M_ASC_ADDR); SCTPDBG(SCTP_DEBUG_ASCONF2, "asconf_queue_mgmt: add removes queued entry\n"); return (-1); } if ((aa->sent == 0) && (type == SCTP_DEL_IP_ADDRESS) && (aa->ap.aph.ph.param_type == SCTP_ADD_IP_ADDRESS)) { /* delete requested, add already queued */ TAILQ_REMOVE(&stcb->asoc.asconf_queue, aa, next); /* remove the aa->ifa from the restricted list */ sctp_del_local_addr_restricted(stcb, aa->ifa); /* free the asconf param */ SCTP_FREE(aa, SCTP_M_ASC_ADDR); SCTPDBG(SCTP_DEBUG_ASCONF2, "asconf_queue_mgmt: delete removes queued entry\n"); return (-1); } } /* for each aa */ /* adding new request to the queue */ SCTP_MALLOC(aa, struct sctp_asconf_addr *, sizeof(*aa), SCTP_M_ASC_ADDR); if (aa == NULL) { /* didn't get memory */ SCTPDBG(SCTP_DEBUG_ASCONF1, "asconf_queue_mgmt: failed to get memory!\n"); return (-1); } aa->special_del = 0; /* fill in asconf address parameter fields */ /* top level elements are "networked" during send */ aa->ap.aph.ph.param_type = type; aa->ifa = ifa; atomic_add_int(&ifa->refcount, 1); /* correlation_id filled in during send routine later... 
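* (the peer echoes the correlation id back in the ASCONF-ACK; sctp_asconf_find_param() uses it there to match the response to this queued request)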
*/ switch (ifa->address.sa.sa_family) { #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; sin6 = &ifa->address.sin6; aa->ap.addrp.ph.param_type = SCTP_IPV6_ADDRESS; aa->ap.addrp.ph.param_length = (sizeof(struct sctp_ipv6addr_param)); aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_paramhdr) + sizeof(struct sctp_ipv6addr_param); memcpy(&aa->ap.addrp.addr, &sin6->sin6_addr, sizeof(struct in6_addr)); break; } #endif #ifdef INET case AF_INET: { struct sockaddr_in *sin; sin = &ifa->address.sin; aa->ap.addrp.ph.param_type = SCTP_IPV4_ADDRESS; aa->ap.addrp.ph.param_length = (sizeof(struct sctp_ipv4addr_param)); aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_paramhdr) + sizeof(struct sctp_ipv4addr_param); memcpy(&aa->ap.addrp.addr, &sin->sin_addr, sizeof(struct in_addr)); break; } #endif default: /* invalid family! */ SCTP_FREE(aa, SCTP_M_ASC_ADDR); sctp_free_ifa(ifa); return (-1); } aa->sent = 0; /* clear sent flag */ TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next); #ifdef SCTP_DEBUG if (SCTP_BASE_SYSCTL(sctp_debug_on) & SCTP_DEBUG_ASCONF2) { if (type == SCTP_ADD_IP_ADDRESS) { SCTP_PRINTF("asconf_queue_mgmt: inserted asconf ADD_IP_ADDRESS: "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF2, &ifa->address.sa); } else if (type == SCTP_DEL_IP_ADDRESS) { SCTP_PRINTF("asconf_queue_mgmt: appended asconf DEL_IP_ADDRESS: "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF2, &ifa->address.sa); } else { SCTP_PRINTF("asconf_queue_mgmt: appended asconf SET_PRIM_ADDR: "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF2, &ifa->address.sa); } } #endif return (0); } /* * add an asconf operation for the given ifa and type. * type = SCTP_ADD_IP_ADDRESS, SCTP_DEL_IP_ADDRESS, SCTP_SET_PRIM_ADDR. * returns 0 if completed, -1 if not completed, 1 if immediate send is * advisable. */ static int sctp_asconf_queue_add(struct sctp_tcb *stcb, struct sctp_ifa *ifa, uint16_t type) { uint32_t status; int pending_delete_queued = 0; int last; /* see if peer supports ASCONF */ if (stcb->asoc.asconf_supported == 0) { return (-1); } /* * if this is deleting the last address from the assoc, mark it as * pending. */ if ((type == SCTP_DEL_IP_ADDRESS) && !stcb->asoc.asconf_del_pending) { if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { last = (sctp_local_addr_count(stcb) == 0); } else { last = (sctp_local_addr_count(stcb) == 1); } if (last) { /* set the pending delete info only */ stcb->asoc.asconf_del_pending = 1; stcb->asoc.asconf_addr_del_pending = ifa; atomic_add_int(&ifa->refcount, 1); SCTPDBG(SCTP_DEBUG_ASCONF2, "asconf_queue_add: mark delete last address pending\n"); return (-1); } } /* queue an asconf parameter */ status = sctp_asconf_queue_mgmt(stcb, ifa, type); /* * if this is an add, and there is a delete also pending (i.e. the * last local address is being changed), queue the pending delete * too. */ if ((type == SCTP_ADD_IP_ADDRESS) && stcb->asoc.asconf_del_pending && (status == 0)) { /* queue in the pending delete */ if (sctp_asconf_queue_mgmt(stcb, stcb->asoc.asconf_addr_del_pending, SCTP_DEL_IP_ADDRESS) == 0) { SCTPDBG(SCTP_DEBUG_ASCONF2, "asconf_queue_add: queuing pending delete\n"); pending_delete_queued = 1; /* clear out the pending delete info */ stcb->asoc.asconf_del_pending = 0; sctp_free_ifa(stcb->asoc.asconf_addr_del_pending); stcb->asoc.asconf_addr_del_pending = NULL; } } if (pending_delete_queued) { struct sctp_nets *net; /* * since we know that the only/last address is now being * changed in this case, reset the cwnd/rto on all nets to * start as a new address and path. 
Also clear the error * counts to give the assoc the best chance to complete the * address change. */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { stcb->asoc.cc_functions.sctp_set_initial_cc_param(stcb, net); net->RTO = 0; net->error_count = 0; } stcb->asoc.overall_error_count = 0; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_ASCONF, __LINE__); } /* queue in an advisory set primary too */ (void)sctp_asconf_queue_mgmt(stcb, ifa, SCTP_SET_PRIM_ADDR); /* let caller know we should send this out immediately */ status = 1; } return (status); } /*- * add an asconf delete IP address parameter to the queue by sockaddr and * possibly with no sctp_ifa available. This is only called by the routine * that checks the addresses in an INIT-ACK against the current address list. * returns 0 if completed, non-zero if not completed. * NOTE: if an add is already scheduled (and not yet sent out), simply * remove it from queue. If a duplicate operation is found, ignore the * new one. */ static int sctp_asconf_queue_sa_delete(struct sctp_tcb *stcb, struct sockaddr *sa) { struct sctp_ifa *ifa; struct sctp_asconf_addr *aa, *aa_next; if (stcb == NULL) { return (-1); } /* see if peer supports ASCONF */ if (stcb->asoc.asconf_supported == 0) { return (-1); } /* make sure the request isn't already in the queue */ TAILQ_FOREACH_SAFE(aa, &stcb->asoc.asconf_queue, next, aa_next) { /* address match? */ if (sctp_asconf_addr_match(aa, sa) == 0) continue; /* is the request already in queue (sent or not) */ if (aa->ap.aph.ph.param_type == SCTP_DEL_IP_ADDRESS) { return (-1); } /* is the negative request already in queue, and not sent */ if (aa->sent == 1) continue; if (aa->ap.aph.ph.param_type == SCTP_ADD_IP_ADDRESS) { /* add already queued, so remove existing entry */ TAILQ_REMOVE(&stcb->asoc.asconf_queue, aa, next); sctp_del_local_addr_restricted(stcb, aa->ifa); /* free the entry */ SCTP_FREE(aa, SCTP_M_ASC_ADDR); return (-1); } } /* for each aa */ /* find any existing ifa-- NOTE ifa CAN be allowed to be NULL */ ifa = sctp_find_ifa_by_addr(sa, stcb->asoc.vrf_id, SCTP_ADDR_NOT_LOCKED); /* adding new request to the queue */ SCTP_MALLOC(aa, struct sctp_asconf_addr *, sizeof(*aa), SCTP_M_ASC_ADDR); if (aa == NULL) { /* didn't get memory */ SCTPDBG(SCTP_DEBUG_ASCONF1, "sctp_asconf_queue_sa_delete: failed to get memory!\n"); return (-1); } aa->special_del = 0; /* fill in asconf address parameter fields */ /* top level elements are "networked" during send */ aa->ap.aph.ph.param_type = SCTP_DEL_IP_ADDRESS; aa->ifa = ifa; if (ifa) atomic_add_int(&ifa->refcount, 1); /* correlation_id filled in during send routine later... 
*/ switch (sa->sa_family) { #ifdef INET6 case AF_INET6: { /* IPv6 address */ struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)sa; aa->ap.addrp.ph.param_type = SCTP_IPV6_ADDRESS; aa->ap.addrp.ph.param_length = (sizeof(struct sctp_ipv6addr_param)); aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_paramhdr) + sizeof(struct sctp_ipv6addr_param); memcpy(&aa->ap.addrp.addr, &sin6->sin6_addr, sizeof(struct in6_addr)); break; } #endif #ifdef INET case AF_INET: { /* IPv4 address */ struct sockaddr_in *sin = (struct sockaddr_in *)sa; aa->ap.addrp.ph.param_type = SCTP_IPV4_ADDRESS; aa->ap.addrp.ph.param_length = (sizeof(struct sctp_ipv4addr_param)); aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_paramhdr) + sizeof(struct sctp_ipv4addr_param); memcpy(&aa->ap.addrp.addr, &sin->sin_addr, sizeof(struct in_addr)); break; } #endif default: /* invalid family! */ SCTP_FREE(aa, SCTP_M_ASC_ADDR); if (ifa) sctp_free_ifa(ifa); return (-1); } aa->sent = 0; /* clear sent flag */ /* delete goes to the back of the queue */ TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next); /* sa_ignore MEMLEAK {memory is put on the tailq} */ return (0); } /* * find a specific asconf param on our "sent" queue */ static struct sctp_asconf_addr * sctp_asconf_find_param(struct sctp_tcb *stcb, uint32_t correlation_id) { struct sctp_asconf_addr *aa; TAILQ_FOREACH(aa, &stcb->asoc.asconf_queue, next) { if (aa->ap.aph.correlation_id == correlation_id && aa->sent == 1) { /* found it */ return (aa); } } /* didn't find it */ return (NULL); } /* * process an SCTP_ERROR_CAUSE_IND for a ASCONF-ACK parameter and do * notifications based on the error response */ static void sctp_asconf_process_error(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_asconf_paramhdr *aph) { struct sctp_error_cause *eh; struct sctp_paramhdr *ph; uint16_t param_type; uint16_t error_code; eh = (struct sctp_error_cause *)(aph + 1); ph = (struct sctp_paramhdr *)(eh + 1); /* validate lengths */ if (htons(eh->length) + sizeof(struct sctp_error_cause) > htons(aph->ph.param_length)) { /* invalid error cause length */ SCTPDBG(SCTP_DEBUG_ASCONF1, "asconf_process_error: cause element too long\n"); return; } if (htons(ph->param_length) + sizeof(struct sctp_paramhdr) > htons(eh->length)) { /* invalid included TLV length */ SCTPDBG(SCTP_DEBUG_ASCONF1, "asconf_process_error: included TLV too long\n"); return; } /* which error code ? */ error_code = ntohs(eh->code); param_type = ntohs(aph->ph.param_type); /* FIX: this should go back up the REMOTE_ERROR ULP notify */ switch (error_code) { case SCTP_CAUSE_RESOURCE_SHORTAGE: /* we allow ourselves to "try again" for this error */ break; default: /* peer can't handle it... */ switch (param_type) { case SCTP_ADD_IP_ADDRESS: case SCTP_DEL_IP_ADDRESS: case SCTP_SET_PRIM_ADDR: break; default: break; } } } /* * process an asconf queue param. * aparam: parameter to process, will be removed from the queue. * flag: 1=success case, 0=failure case */ static void sctp_asconf_process_param_ack(struct sctp_tcb *stcb, struct sctp_asconf_addr *aparam, uint32_t flag) { uint16_t param_type; /* process this param */ param_type = aparam->ap.aph.ph.param_type; switch (param_type) { case SCTP_ADD_IP_ADDRESS: SCTPDBG(SCTP_DEBUG_ASCONF1, "process_param_ack: added IP address\n"); sctp_asconf_addr_mgmt_ack(stcb, aparam->ifa, flag); break; case SCTP_DEL_IP_ADDRESS: SCTPDBG(SCTP_DEBUG_ASCONF1, "process_param_ack: deleted IP address\n"); /* nothing really to do... 
lists already updated */ break; case SCTP_SET_PRIM_ADDR: SCTPDBG(SCTP_DEBUG_ASCONF1, "process_param_ack: set primary IP address\n"); /* nothing to do... peer may start using this addr */ break; default: /* should NEVER happen */ break; } /* remove the param and free it */ TAILQ_REMOVE(&stcb->asoc.asconf_queue, aparam, next); if (aparam->ifa) sctp_free_ifa(aparam->ifa); SCTP_FREE(aparam, SCTP_M_ASC_ADDR); } /* * cleanup from a bad asconf ack parameter */ static void sctp_asconf_ack_clear(struct sctp_tcb *stcb SCTP_UNUSED) { /* assume peer doesn't really know how to do asconfs */ /* XXX we could free the pending queue here */ } void sctp_handle_asconf_ack(struct mbuf *m, int offset, struct sctp_asconf_ack_chunk *cp, struct sctp_tcb *stcb, struct sctp_nets *net, int *abort_no_unlock) { struct sctp_association *asoc; uint32_t serial_num; uint16_t ack_length; struct sctp_asconf_paramhdr *aph; struct sctp_asconf_addr *aa, *aa_next; uint32_t last_error_id = 0; /* last error correlation id */ uint32_t id; struct sctp_asconf_addr *ap; /* asconf param buffer */ uint8_t aparam_buf[SCTP_PARAM_BUFFER_SIZE]; /* verify minimum length */ if (ntohs(cp->ch.chunk_length) < sizeof(struct sctp_asconf_ack_chunk)) { SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf_ack: chunk too small = %xh\n", ntohs(cp->ch.chunk_length)); return; } asoc = &stcb->asoc; serial_num = ntohl(cp->serial_number); /* * NOTE: we may want to handle this differently- currently, we will * abort when we get an ack for the expected serial number + 1 (eg. * we didn't send it), process an ack normally if it is the expected * serial number, and re-send the previous ack for *ALL* other * serial numbers */ /* * if the serial number is the next expected, but I didn't send it, * abort the asoc, since someone probably just hijacked us... */ if (serial_num == (asoc->asconf_seq_out + 1)) { struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf_ack: got unexpected next serial number! Aborting asoc!\n"); snprintf(msg, sizeof(msg), "Never sent serial number %8.8x", serial_num); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); *abort_no_unlock = 1; return; } if (serial_num != asoc->asconf_seq_out_acked + 1) { /* got a duplicate/unexpected ASCONF-ACK */ SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf_ack: got duplicate/unexpected serial number = %xh (expected = %xh)\n", serial_num, asoc->asconf_seq_out_acked + 1); return; } if (serial_num == asoc->asconf_seq_out - 1) { /* stop our timer */ sctp_timer_stop(SCTP_TIMER_TYPE_ASCONF, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_ASCONF + SCTP_LOC_5); } /* process the ASCONF-ACK contents */ ack_length = ntohs(cp->ch.chunk_length) - sizeof(struct sctp_asconf_ack_chunk); offset += sizeof(struct sctp_asconf_ack_chunk); /* process through all parameters */ while (ack_length >= sizeof(struct sctp_asconf_paramhdr)) { unsigned int param_length, param_type; /* get pointer to next asconf parameter */ aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(m, offset, sizeof(struct sctp_asconf_paramhdr), aparam_buf); if (aph == NULL) { /* can't get an asconf paramhdr */ sctp_asconf_ack_clear(stcb); return; } param_type = ntohs(aph->ph.param_type); param_length = ntohs(aph->ph.param_length); if (param_length > ack_length) { sctp_asconf_ack_clear(stcb); return; } if (param_length < sizeof(struct sctp_paramhdr)) { sctp_asconf_ack_clear(stcb); return; } /* get the complete parameter... 
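* (the sctp_m_getptr() above pulled in only the fixed-size asconf paramhdr; re-read param_length bytes so the full parameter, including any embedded error cause TLV, lands in aparam_buf)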
*/ if (param_length > sizeof(aparam_buf)) { SCTPDBG(SCTP_DEBUG_ASCONF1, "param length (%u) larger than buffer size!\n", param_length); sctp_asconf_ack_clear(stcb); return; } aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(m, offset, param_length, aparam_buf); if (aph == NULL) { sctp_asconf_ack_clear(stcb); return; } /* correlation_id is transparent to peer, no ntohl needed */ id = aph->correlation_id; switch (param_type) { case SCTP_ERROR_CAUSE_IND: last_error_id = id; /* find the corresponding asconf param in our queue */ ap = sctp_asconf_find_param(stcb, id); if (ap == NULL) { /* hmm... can't find this in our queue! */ break; } /* process the parameter, failed flag */ sctp_asconf_process_param_ack(stcb, ap, 0); /* process the error response */ sctp_asconf_process_error(stcb, aph); break; case SCTP_SUCCESS_REPORT: /* find the corresponding asconf param in our queue */ ap = sctp_asconf_find_param(stcb, id); if (ap == NULL) { /* hmm... can't find this in our queue! */ break; } /* process the parameter, success flag */ sctp_asconf_process_param_ack(stcb, ap, 1); break; default: break; } /* switch */ /* update remaining ASCONF-ACK message length to process */ ack_length -= SCTP_SIZE32(param_length); if (ack_length <= 0) { /* no more data in the mbuf chain */ break; } offset += SCTP_SIZE32(param_length); } /* while */ /* * if there are any "sent" params still on the queue, these are * implicitly "success", or "failed" (if we got an error back) ... * so process these appropriately * * we assume that the correlation_id's are monotonically increasing * beginning from 1 and that we don't have *that* many outstanding * at any given time */ if (last_error_id == 0) last_error_id--; /* set to "max" value */ TAILQ_FOREACH_SAFE(aa, &stcb->asoc.asconf_queue, next, aa_next) { if (aa->sent == 1) { /* * implicitly successful or failed if correlation_id * < last_error_id, then success else, failure */ if (aa->ap.aph.correlation_id < last_error_id) sctp_asconf_process_param_ack(stcb, aa, 1); else sctp_asconf_process_param_ack(stcb, aa, 0); } else { /* * since we always process in order (FIFO queue) if * we reach one that hasn't been sent, the rest * should not have been sent either. so, we're * done... 
*/ break; } } /* update the next sequence number to use */ asoc->asconf_seq_out_acked++; /* remove the old ASCONF on our outbound queue */ sctp_toss_old_asconf(stcb); if (!TAILQ_EMPTY(&stcb->asoc.asconf_queue)) { #ifdef SCTP_TIMER_BASED_ASCONF /* we have more params, so restart our timer */ sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, stcb->sctp_ep, stcb, net); #else /* we have more params, so send out more */ sctp_send_asconf(stcb, net, SCTP_ADDR_NOT_LOCKED); #endif } } #ifdef INET6 static uint32_t sctp_is_scopeid_in_nets(struct sctp_tcb *stcb, struct sockaddr *sa) { struct sockaddr_in6 *sin6, *net6; struct sctp_nets *net; if (sa->sa_family != AF_INET6) { /* wrong family */ return (0); } sin6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr) == 0) { /* not link local address */ return (0); } /* hunt through our destination nets list for this scope_id */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (((struct sockaddr *)(&net->ro._l_addr))->sa_family != AF_INET6) continue; net6 = (struct sockaddr_in6 *)&net->ro._l_addr; if (IN6_IS_ADDR_LINKLOCAL(&net6->sin6_addr) == 0) continue; if (sctp_is_same_scope(sin6, net6)) { /* found one */ return (1); } } /* didn't find one */ return (0); } #endif /* * address management functions */ static void sctp_addr_mgmt_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_ifa *ifa, uint16_t type, int addr_locked) { int status; if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0 || sctp_is_feature_off(inp, SCTP_PCB_FLAGS_DO_ASCONF)) { /* subset bound, no ASCONF allowed case, so ignore */ return; } /* * note: we know this is not the subset bound, no ASCONF case eg. * this is boundall or subset bound w/ASCONF allowed */ /* first, make sure that the address is IPv4 or IPv6 and not jailed */ switch (ifa->address.sa.sa_family) { #ifdef INET6 case AF_INET6: if (prison_check_ip6(inp->ip_inp.inp.inp_cred, &ifa->address.sin6.sin6_addr) != 0) { return; } break; #endif #ifdef INET case AF_INET: if (prison_check_ip4(inp->ip_inp.inp.inp_cred, &ifa->address.sin.sin_addr) != 0) { return; } break; #endif default: return; } #ifdef INET6 /* make sure we're "allowed" to add this type of addr */ if (ifa->address.sa.sa_family == AF_INET6) { /* invalid if we're not a v6 endpoint */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) return; /* is the v6 addr really valid ? */ if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { return; } } #endif /* put this address on the "pending/do not use yet" list */ sctp_add_local_addr_restricted(stcb, ifa); /* * check address scope if address is out of scope, don't queue * anything... note: this would leave the address on both inp and * asoc lists */ switch (ifa->address.sa.sa_family) { #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; sin6 = &ifa->address.sin6; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* we skip unspecifed addresses */ return; } if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { if (stcb->asoc.scope.local_scope == 0) { return; } /* is it the right link local scope? 
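* (that is, only queue this link-local address if one of the association's destination nets shares its scope id; see sctp_is_scopeid_in_nets() above)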
*/ if (sctp_is_scopeid_in_nets(stcb, &ifa->address.sa) == 0) { return; } } if (stcb->asoc.scope.site_scope == 0 && IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr)) { return; } break; } #endif #ifdef INET case AF_INET: { struct sockaddr_in *sin; /* invalid if we are a v6 only endpoint */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && SCTP_IPV6_V6ONLY(inp)) return; sin = &ifa->address.sin; if (sin->sin_addr.s_addr == 0) { /* we skip unspecifed addresses */ return; } if (stcb->asoc.scope.ipv4_local_scope == 0 && IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) { return; } break; } #endif default: /* else, not AF_INET or AF_INET6, so skip */ return; } /* queue an asconf for this address add/delete */ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_DO_ASCONF)) { /* does the peer do asconf? */ if (stcb->asoc.asconf_supported) { /* queue an asconf for this addr */ status = sctp_asconf_queue_add(stcb, ifa, type); /* * if queued ok, and in the open state, send out the * ASCONF. If in the non-open state, these will be * sent when the state goes open. */ if (status == 0 && ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED))) { #ifdef SCTP_TIMER_BASED_ASCONF sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, inp, stcb, stcb->asoc.primary_destination); #else sctp_send_asconf(stcb, NULL, addr_locked); #endif } } } } int sctp_asconf_iterator_ep(struct sctp_inpcb *inp, void *ptr, uint32_t val SCTP_UNUSED) { struct sctp_asconf_iterator *asc; struct sctp_ifa *ifa; struct sctp_laddr *l; int cnt_invalid = 0; asc = (struct sctp_asconf_iterator *)ptr; LIST_FOREACH(l, &asc->list_of_work, sctp_nxt_addr) { ifa = l->ifa; switch (ifa->address.sa.sa_family) { #ifdef INET6 case AF_INET6: /* invalid if we're not a v6 endpoint */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) { cnt_invalid++; if (asc->cnt == cnt_invalid) return (1); } break; #endif #ifdef INET case AF_INET: { /* invalid if we are a v6 only endpoint */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && SCTP_IPV6_V6ONLY(inp)) { cnt_invalid++; if (asc->cnt == cnt_invalid) return (1); } break; } #endif default: /* invalid address family */ cnt_invalid++; if (asc->cnt == cnt_invalid) return (1); } } return (0); } static int sctp_asconf_iterator_ep_end(struct sctp_inpcb *inp, void *ptr, uint32_t val SCTP_UNUSED) { struct sctp_ifa *ifa; struct sctp_asconf_iterator *asc; struct sctp_laddr *laddr, *nladdr, *l; /* Only for specific case not bound all */ asc = (struct sctp_asconf_iterator *)ptr; LIST_FOREACH(l, &asc->list_of_work, sctp_nxt_addr) { ifa = l->ifa; if (l->action == SCTP_ADD_IP_ADDRESS) { LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == ifa) { laddr->action = 0; break; } } } else if (l->action == SCTP_DEL_IP_ADDRESS) { LIST_FOREACH_SAFE(laddr, &inp->sctp_addr_list, sctp_nxt_addr, nladdr) { /* remove only after all guys are done */ if (laddr->ifa == ifa) { sctp_del_local_addr_ep(inp, ifa); } } } } return (0); } void sctp_asconf_iterator_stcb(struct sctp_inpcb *inp, struct sctp_tcb *stcb, void *ptr, uint32_t val SCTP_UNUSED) { struct sctp_asconf_iterator *asc; struct sctp_ifa *ifa; struct sctp_laddr *l; int cnt_invalid = 0; int type, status; int num_queued = 0; asc = (struct sctp_asconf_iterator *)ptr; LIST_FOREACH(l, &asc->list_of_work, sctp_nxt_addr) { ifa = l->ifa; type = l->action; /* address's vrf_id must be the vrf_id of the assoc */ if (ifa->vrf_id != stcb->asoc.vrf_id) { continue; } /* Same checks again for assoc */ switch (ifa->address.sa.sa_family) { #ifdef INET6 case AF_INET6: { /* 
invalid if we're not a v6 endpoint */ struct sockaddr_in6 *sin6; if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) { cnt_invalid++; if (asc->cnt == cnt_invalid) return; else continue; } sin6 = &ifa->address.sin6; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* we skip unspecifed addresses */ continue; } if (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sin6->sin6_addr) != 0) { continue; } if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { if (stcb->asoc.scope.local_scope == 0) { continue; } /* is it the right link local scope? */ if (sctp_is_scopeid_in_nets(stcb, &ifa->address.sa) == 0) { continue; } } break; } #endif #ifdef INET case AF_INET: { /* invalid if we are a v6 only endpoint */ struct sockaddr_in *sin; /* invalid if we are a v6 only endpoint */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && SCTP_IPV6_V6ONLY(inp)) continue; sin = &ifa->address.sin; if (sin->sin_addr.s_addr == 0) { /* we skip unspecifed addresses */ continue; } if (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sin->sin_addr) != 0) { continue; } if (stcb->asoc.scope.ipv4_local_scope == 0 && IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) { continue; } if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && SCTP_IPV6_V6ONLY(inp)) { cnt_invalid++; if (asc->cnt == cnt_invalid) return; else continue; } break; } #endif default: /* invalid address family */ cnt_invalid++; if (asc->cnt == cnt_invalid) return; else continue; break; } if (type == SCTP_ADD_IP_ADDRESS) { /* prevent this address from being used as a source */ sctp_add_local_addr_restricted(stcb, ifa); } else if (type == SCTP_DEL_IP_ADDRESS) { struct sctp_nets *net; TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { sctp_rtentry_t *rt; /* delete this address if cached */ if (net->ro._s_addr == ifa) { sctp_free_ifa(net->ro._s_addr); net->ro._s_addr = NULL; net->src_addr_selected = 0; rt = net->ro.ro_rt; if (rt) { RTFREE(rt); net->ro.ro_rt = NULL; } /* * Now we deleted our src address, * should we not also now reset the * cwnd/rto to start as if its a new * address? */ stcb->asoc.cc_functions.sctp_set_initial_cc_param(stcb, net); net->RTO = 0; } } } else if (type == SCTP_SET_PRIM_ADDR) { if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) { /* must validate the ifa is in the ep */ if (sctp_is_addr_in_ep(stcb->sctp_ep, ifa) == 0) { continue; } } else { /* Need to check scopes for this guy */ if (sctp_is_address_in_scope(ifa, &stcb->asoc.scope, 0) == 0) { continue; } } } /* queue an asconf for this address add/delete */ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_DO_ASCONF) && stcb->asoc.asconf_supported == 1) { /* queue an asconf for this addr */ status = sctp_asconf_queue_add(stcb, ifa, type); /* * if queued ok, and in the open state, update the * count of queued params. If in the non-open * state, these get sent when the assoc goes open. */ if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) { if (status >= 0) { num_queued++; } } } } /* * If we have queued params in the open state, send out an ASCONF. 
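* (for an association that is not yet established, the queued parameters are instead sent once it reaches the OPEN state)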
*/ if (num_queued > 0) { sctp_send_asconf(stcb, NULL, SCTP_ADDR_NOT_LOCKED); } } void sctp_asconf_iterator_end(void *ptr, uint32_t val SCTP_UNUSED) { struct sctp_asconf_iterator *asc; struct sctp_ifa *ifa; struct sctp_laddr *l, *nl; asc = (struct sctp_asconf_iterator *)ptr; LIST_FOREACH_SAFE(l, &asc->list_of_work, sctp_nxt_addr, nl) { ifa = l->ifa; if (l->action == SCTP_ADD_IP_ADDRESS) { /* Clear the defer use flag */ ifa->localifa_flags &= ~SCTP_ADDR_DEFER_USE; } sctp_free_ifa(ifa); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_laddr), l); SCTP_DECR_LADDR_COUNT(); } SCTP_FREE(asc, SCTP_M_ASC_IT); } /* * sa is the sockaddr to ask the peer to set primary to. * returns: 0 = completed, -1 = error */ int32_t sctp_set_primary_ip_address_sa(struct sctp_tcb *stcb, struct sockaddr *sa) { uint32_t vrf_id; struct sctp_ifa *ifa; /* find the ifa for the desired set primary */ vrf_id = stcb->asoc.vrf_id; ifa = sctp_find_ifa_by_addr(sa, vrf_id, SCTP_ADDR_NOT_LOCKED); if (ifa == NULL) { /* Invalid address */ return (-1); } /* queue an ASCONF:SET_PRIM_ADDR to be sent */ if (!sctp_asconf_queue_add(stcb, ifa, SCTP_SET_PRIM_ADDR)) { /* set primary queuing succeeded */ SCTPDBG(SCTP_DEBUG_ASCONF1, "set_primary_ip_address_sa: queued on tcb=%p, ", (void *)stcb); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) { #ifdef SCTP_TIMER_BASED_ASCONF sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, stcb->sctp_ep, stcb, stcb->asoc.primary_destination); #else sctp_send_asconf(stcb, NULL, SCTP_ADDR_NOT_LOCKED); #endif } } else { SCTPDBG(SCTP_DEBUG_ASCONF1, "set_primary_ip_address_sa: failed to add to queue on tcb=%p, ", (void *)stcb); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); return (-1); } return (0); } int sctp_is_addr_pending(struct sctp_tcb *stcb, struct sctp_ifa *sctp_ifa) { struct sctp_tmit_chunk *chk, *nchk; unsigned int offset, asconf_limit; struct sctp_asconf_chunk *acp; struct sctp_asconf_paramhdr *aph; uint8_t aparam_buf[SCTP_PARAM_BUFFER_SIZE]; struct sctp_paramhdr *ph; int add_cnt, del_cnt; uint16_t last_param_type; add_cnt = del_cnt = 0; last_param_type = 0; TAILQ_FOREACH_SAFE(chk, &stcb->asoc.asconf_send_queue, sctp_next, nchk) { if (chk->data == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "is_addr_pending: No mbuf data?\n"); continue; } offset = 0; acp = mtod(chk->data, struct sctp_asconf_chunk *); offset += sizeof(struct sctp_asconf_chunk); asconf_limit = ntohs(acp->ch.chunk_length); ph = (struct sctp_paramhdr *)sctp_m_getptr(chk->data, offset, sizeof(struct sctp_paramhdr), aparam_buf); if (ph == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "is_addr_pending: couldn't get lookup addr!\n"); continue; } offset += ntohs(ph->param_length); aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(chk->data, offset, sizeof(struct sctp_asconf_paramhdr), aparam_buf); if (aph == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "is_addr_pending: Empty ASCONF will be sent?\n"); continue; } while (aph != NULL) { unsigned int param_length, param_type; param_type = ntohs(aph->ph.param_type); param_length = ntohs(aph->ph.param_length); if (offset + param_length > asconf_limit) { /* parameter goes beyond end of chunk! 
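* (offset + param_length ran past asconf_limit, so the chunk is malformed or truncated; stop scanning it)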
*/ break; } if (param_length > sizeof(aparam_buf)) { SCTPDBG(SCTP_DEBUG_ASCONF1, "is_addr_pending: param length (%u) larger than buffer size!\n", param_length); break; } if (param_length <= sizeof(struct sctp_paramhdr)) { SCTPDBG(SCTP_DEBUG_ASCONF1, "is_addr_pending: param length(%u) too short\n", param_length); break; } aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(chk->data, offset, param_length, aparam_buf); if (aph == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "is_addr_pending: couldn't get entire param\n"); break; } ph = (struct sctp_paramhdr *)(aph + 1); if (sctp_addr_match(ph, &sctp_ifa->address.sa) != 0) { switch (param_type) { case SCTP_ADD_IP_ADDRESS: add_cnt++; break; case SCTP_DEL_IP_ADDRESS: del_cnt++; break; default: break; } last_param_type = param_type; } offset += SCTP_SIZE32(param_length); if (offset >= asconf_limit) { /* no more data in the mbuf chain */ break; } /* get pointer to next asconf param */ aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(chk->data, offset, sizeof(struct sctp_asconf_paramhdr), aparam_buf); } } /* * we want to find the sequences which consist of ADD -> DEL -> ADD * or DEL -> ADD */ if (add_cnt > del_cnt || (add_cnt == del_cnt && last_param_type == SCTP_ADD_IP_ADDRESS)) { return (1); } return (0); } static struct sockaddr * sctp_find_valid_localaddr(struct sctp_tcb *stcb, int addr_locked) { struct sctp_vrf *vrf = NULL; struct sctp_ifn *sctp_ifn; struct sctp_ifa *sctp_ifa; if (addr_locked == SCTP_ADDR_NOT_LOCKED) SCTP_IPI_ADDR_RLOCK(); vrf = sctp_find_vrf(stcb->asoc.vrf_id); if (vrf == NULL) { if (addr_locked == SCTP_ADDR_NOT_LOCKED) SCTP_IPI_ADDR_RUNLOCK(); return (NULL); } LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { if (stcb->asoc.scope.loopback_scope == 0 && SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) { /* Skip if loopback_scope not set */ continue; } LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { switch (sctp_ifa->address.sa.sa_family) { #ifdef INET case AF_INET: if (stcb->asoc.scope.ipv4_addr_legal) { struct sockaddr_in *sin; sin = &sctp_ifa->address.sin; if (sin->sin_addr.s_addr == 0) { /* skip unspecifed addresses */ continue; } if (prison_check_ip4(stcb->sctp_ep->ip_inp.inp.inp_cred, &sin->sin_addr) != 0) { continue; } if (stcb->asoc.scope.ipv4_local_scope == 0 && IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) continue; if (sctp_is_addr_restricted(stcb, sctp_ifa) && (!sctp_is_addr_pending(stcb, sctp_ifa))) continue; /* * found a valid local v4 address to * use */ if (addr_locked == SCTP_ADDR_NOT_LOCKED) SCTP_IPI_ADDR_RUNLOCK(); return (&sctp_ifa->address.sa); } break; #endif #ifdef INET6 case AF_INET6: if (stcb->asoc.scope.ipv6_addr_legal) { struct sockaddr_in6 *sin6; if (sctp_ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { continue; } sin6 = &sctp_ifa->address.sin6; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* * we skip unspecifed * addresses */ continue; } if (prison_check_ip6(stcb->sctp_ep->ip_inp.inp.inp_cred, &sin6->sin6_addr) != 0) { continue; } if (stcb->asoc.scope.local_scope == 0 && IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) continue; if (stcb->asoc.scope.site_scope == 0 && IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr)) continue; if (sctp_is_addr_restricted(stcb, sctp_ifa) && (!sctp_is_addr_pending(stcb, sctp_ifa))) continue; /* * found a valid local v6 address to * use */ if (addr_locked == SCTP_ADDR_NOT_LOCKED) SCTP_IPI_ADDR_RUNLOCK(); return (&sctp_ifa->address.sa); } break; #endif default: break; } } } /* no valid addresses found */ if (addr_locked == SCTP_ADDR_NOT_LOCKED) SCTP_IPI_ADDR_RUNLOCK(); return (NULL); } static struct 
sockaddr * sctp_find_valid_localaddr_ep(struct sctp_tcb *stcb) { struct sctp_laddr *laddr; LIST_FOREACH(laddr, &stcb->sctp_ep->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) { continue; } /* is the address restricted ? */ if (sctp_is_addr_restricted(stcb, laddr->ifa) && (!sctp_is_addr_pending(stcb, laddr->ifa))) continue; /* found a valid local address to use */ return (&laddr->ifa->address.sa); } /* no valid addresses found */ return (NULL); } /* * builds an ASCONF chunk from queued ASCONF params. * returns NULL on error (no mbuf, no ASCONF params queued, etc). */ struct mbuf * sctp_compose_asconf(struct sctp_tcb *stcb, int *retlen, int addr_locked) { struct mbuf *m_asconf, *m_asconf_chk; struct sctp_asconf_addr *aa; struct sctp_asconf_chunk *acp; struct sctp_asconf_paramhdr *aph; struct sctp_asconf_addr_param *aap; uint32_t p_length; uint32_t correlation_id = 1; /* 0 is reserved... */ caddr_t ptr, lookup_ptr; uint8_t lookup_used = 0; /* are there any asconf params to send? */ TAILQ_FOREACH(aa, &stcb->asoc.asconf_queue, next) { if (aa->sent == 0) break; } if (aa == NULL) return (NULL); /* * get a chunk header mbuf and a cluster for the asconf params since * it's simpler to fill in the asconf chunk header lookup address on * the fly */ m_asconf_chk = sctp_get_mbuf_for_msg(sizeof(struct sctp_asconf_chunk), 0, M_NOWAIT, 1, MT_DATA); if (m_asconf_chk == NULL) { /* no mbuf's */ SCTPDBG(SCTP_DEBUG_ASCONF1, "compose_asconf: couldn't get chunk mbuf!\n"); return (NULL); } m_asconf = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); if (m_asconf == NULL) { /* no mbuf's */ SCTPDBG(SCTP_DEBUG_ASCONF1, "compose_asconf: couldn't get mbuf!\n"); sctp_m_freem(m_asconf_chk); return (NULL); } SCTP_BUF_LEN(m_asconf_chk) = sizeof(struct sctp_asconf_chunk); SCTP_BUF_LEN(m_asconf) = 0; acp = mtod(m_asconf_chk, struct sctp_asconf_chunk *); memset(acp, 0, sizeof(struct sctp_asconf_chunk)); /* save pointers to lookup address and asconf params */ lookup_ptr = (caddr_t)(acp + 1); /* after the header */ ptr = mtod(m_asconf, caddr_t); /* beginning of cluster */ /* fill in chunk header info */ acp->ch.chunk_type = SCTP_ASCONF; acp->ch.chunk_flags = 0; acp->serial_number = htonl(stcb->asoc.asconf_seq_out); stcb->asoc.asconf_seq_out++; /* add parameters... up to smallest MTU allowed */ TAILQ_FOREACH(aa, &stcb->asoc.asconf_queue, next) { if (aa->sent) continue; /* get the parameter length */ p_length = SCTP_SIZE32(aa->ap.aph.ph.param_length); /* will it fit in current chunk? */ if ((SCTP_BUF_LEN(m_asconf) + p_length > stcb->asoc.smallest_mtu) || (SCTP_BUF_LEN(m_asconf) + p_length > MCLBYTES)) { /* won't fit, so we're done with this chunk */ break; } /* assign (and store) a correlation id */ aa->ap.aph.correlation_id = correlation_id++; /* * fill in address if we're doing a delete this is a simple * way for us to fill in the correlation address, which * should only be used by the peer if we're deleting our * source address and adding a new address (e.g. 
renumbering * case) */ if (lookup_used == 0 && (aa->special_del == 0) && aa->ap.aph.ph.param_type == SCTP_DEL_IP_ADDRESS) { struct sctp_ipv6addr_param *lookup; uint16_t p_size, addr_size; lookup = (struct sctp_ipv6addr_param *)lookup_ptr; lookup->ph.param_type = htons(aa->ap.addrp.ph.param_type); if (aa->ap.addrp.ph.param_type == SCTP_IPV6_ADDRESS) { /* copy IPv6 address */ p_size = sizeof(struct sctp_ipv6addr_param); addr_size = sizeof(struct in6_addr); } else { /* copy IPv4 address */ p_size = sizeof(struct sctp_ipv4addr_param); addr_size = sizeof(struct in_addr); } lookup->ph.param_length = htons(SCTP_SIZE32(p_size)); memcpy(lookup->addr, &aa->ap.addrp.addr, addr_size); SCTP_BUF_LEN(m_asconf_chk) += SCTP_SIZE32(p_size); lookup_used = 1; } /* copy into current space */ memcpy(ptr, &aa->ap, p_length); /* network elements and update lengths */ aph = (struct sctp_asconf_paramhdr *)ptr; aap = (struct sctp_asconf_addr_param *)ptr; /* correlation_id is transparent to peer, no htonl needed */ aph->ph.param_type = htons(aph->ph.param_type); aph->ph.param_length = htons(aph->ph.param_length); aap->addrp.ph.param_type = htons(aap->addrp.ph.param_type); aap->addrp.ph.param_length = htons(aap->addrp.ph.param_length); SCTP_BUF_LEN(m_asconf) += SCTP_SIZE32(p_length); ptr += SCTP_SIZE32(p_length); /* * these params are removed off the pending list upon * getting an ASCONF-ACK back from the peer, just set flag */ aa->sent = 1; } /* check to see if the lookup addr has been populated yet */ if (lookup_used == 0) { /* NOTE: if the address param is optional, can skip this... */ /* add any valid (existing) address... */ struct sctp_ipv6addr_param *lookup; uint16_t p_size, addr_size; struct sockaddr *found_addr; caddr_t addr_ptr; if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) found_addr = sctp_find_valid_localaddr(stcb, addr_locked); else found_addr = sctp_find_valid_localaddr_ep(stcb); lookup = (struct sctp_ipv6addr_param *)lookup_ptr; if (found_addr != NULL) { switch (found_addr->sa_family) { #ifdef INET6 case AF_INET6: /* copy IPv6 address */ lookup->ph.param_type = htons(SCTP_IPV6_ADDRESS); p_size = sizeof(struct sctp_ipv6addr_param); addr_size = sizeof(struct in6_addr); addr_ptr = (caddr_t)&((struct sockaddr_in6 *) found_addr)->sin6_addr; break; #endif #ifdef INET case AF_INET: /* copy IPv4 address */ lookup->ph.param_type = htons(SCTP_IPV4_ADDRESS); p_size = sizeof(struct sctp_ipv4addr_param); addr_size = sizeof(struct in_addr); addr_ptr = (caddr_t)&((struct sockaddr_in *) found_addr)->sin_addr; break; #endif default: p_size = 0; addr_size = 0; addr_ptr = NULL; break; } lookup->ph.param_length = htons(SCTP_SIZE32(p_size)); memcpy(lookup->addr, addr_ptr, addr_size); SCTP_BUF_LEN(m_asconf_chk) += SCTP_SIZE32(p_size); } else { /* uh oh... don't have any address?? */ SCTPDBG(SCTP_DEBUG_ASCONF1, "compose_asconf: no lookup addr!\n"); /* XXX for now, we send a IPv4 address of 0.0.0.0 */ lookup->ph.param_type = htons(SCTP_IPV4_ADDRESS); lookup->ph.param_length = htons(SCTP_SIZE32(sizeof(struct sctp_ipv4addr_param))); memset(lookup->addr, 0, sizeof(struct in_addr)); SCTP_BUF_LEN(m_asconf_chk) += SCTP_SIZE32(sizeof(struct sctp_ipv4addr_param)); } } /* chain it all together */ SCTP_BUF_NEXT(m_asconf_chk) = m_asconf; *retlen = SCTP_BUF_LEN(m_asconf_chk) + SCTP_BUF_LEN(m_asconf); acp->ch.chunk_length = htons(*retlen); return (m_asconf_chk); } /* * section to handle address changes before an association is up eg. 
changes * during INIT/INIT-ACK/COOKIE-ECHO handshake */ /* * processes the (local) addresses in the INIT-ACK chunk */ static void sctp_process_initack_addresses(struct sctp_tcb *stcb, struct mbuf *m, unsigned int offset, unsigned int length) { struct sctp_paramhdr tmp_param, *ph; uint16_t plen, ptype; struct sctp_ifa *sctp_ifa; union sctp_sockstore store; #ifdef INET6 struct sctp_ipv6addr_param addr6_store; #endif #ifdef INET struct sctp_ipv4addr_param addr4_store; #endif SCTPDBG(SCTP_DEBUG_ASCONF2, "processing init-ack addresses\n"); if (stcb == NULL) /* Un-needed check for SA */ return; /* convert to upper bound */ length += offset; if ((offset + sizeof(struct sctp_paramhdr)) > length) { return; } /* go through the addresses in the init-ack */ ph = (struct sctp_paramhdr *) sctp_m_getptr(m, offset, sizeof(struct sctp_paramhdr), (uint8_t *)&tmp_param); while (ph != NULL) { ptype = ntohs(ph->param_type); plen = ntohs(ph->param_length); switch (ptype) { #ifdef INET6 case SCTP_IPV6_ADDRESS: { struct sctp_ipv6addr_param *a6p; /* get the entire IPv6 address param */ a6p = (struct sctp_ipv6addr_param *) sctp_m_getptr(m, offset, sizeof(struct sctp_ipv6addr_param), (uint8_t *)&addr6_store); if (plen != sizeof(struct sctp_ipv6addr_param) || a6p == NULL) { return; } memset(&store, 0, sizeof(union sctp_sockstore)); store.sin6.sin6_family = AF_INET6; store.sin6.sin6_len = sizeof(struct sockaddr_in6); store.sin6.sin6_port = stcb->rport; memcpy(&store.sin6.sin6_addr, a6p->addr, sizeof(struct in6_addr)); break; } #endif #ifdef INET case SCTP_IPV4_ADDRESS: { struct sctp_ipv4addr_param *a4p; /* get the entire IPv4 address param */ a4p = (struct sctp_ipv4addr_param *)sctp_m_getptr(m, offset, sizeof(struct sctp_ipv4addr_param), (uint8_t *)&addr4_store); if (plen != sizeof(struct sctp_ipv4addr_param) || a4p == NULL) { return; } memset(&store, 0, sizeof(union sctp_sockstore)); store.sin.sin_family = AF_INET; store.sin.sin_len = sizeof(struct sockaddr_in); store.sin.sin_port = stcb->rport; store.sin.sin_addr.s_addr = a4p->addr; break; } #endif default: goto next_addr; } /* see if this address really (still) exists */ sctp_ifa = sctp_find_ifa_by_addr(&store.sa, stcb->asoc.vrf_id, SCTP_ADDR_NOT_LOCKED); if (sctp_ifa == NULL) { /* address doesn't exist anymore */ int status; /* are ASCONFs allowed ? */ if ((sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_DO_ASCONF)) && stcb->asoc.asconf_supported) { /* queue an ASCONF DEL_IP_ADDRESS */ status = sctp_asconf_queue_sa_delete(stcb, &store.sa); /* * if queued ok, and in correct state, send * out the ASCONF. */ if (status == 0 && SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) { #ifdef SCTP_TIMER_BASED_ASCONF sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, stcb->sctp_ep, stcb, stcb->asoc.primary_destination); #else sctp_send_asconf(stcb, NULL, SCTP_ADDR_NOT_LOCKED); #endif } } } next_addr: /* * Sanity check: Make sure the length isn't 0, otherwise * we'll be stuck in this loop for a long time... 
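* (a zero-length parameter would keep offset from advancing, so bail out instead of spinning)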
*/ if (SCTP_SIZE32(plen) == 0) { SCTP_PRINTF("process_initack_addrs: bad len (%d) type=%xh\n", plen, ptype); return; } /* get next parameter */ offset += SCTP_SIZE32(plen); if ((offset + sizeof(struct sctp_paramhdr)) > length) return; ph = (struct sctp_paramhdr *)sctp_m_getptr(m, offset, sizeof(struct sctp_paramhdr), (uint8_t *)&tmp_param); } /* while */ } /* FIX ME: need to verify return result for v6 address type if v6 disabled */ /* * checks to see if a specific address is in the initack address list returns * 1 if found, 0 if not */ static uint32_t sctp_addr_in_initack(struct mbuf *m, uint32_t offset, uint32_t length, struct sockaddr *sa) { struct sctp_paramhdr tmp_param, *ph; uint16_t plen, ptype; #ifdef INET struct sockaddr_in *sin; struct sctp_ipv4addr_param *a4p; struct sctp_ipv6addr_param addr4_store; #endif #ifdef INET6 struct sockaddr_in6 *sin6; struct sctp_ipv6addr_param *a6p; struct sctp_ipv6addr_param addr6_store; struct sockaddr_in6 sin6_tmp; #endif switch (sa->sa_family) { #ifdef INET case AF_INET: break; #endif #ifdef INET6 case AF_INET6: break; #endif default: return (0); } SCTPDBG(SCTP_DEBUG_ASCONF2, "find_initack_addr: starting search for "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF2, sa); /* convert to upper bound */ length += offset; if ((offset + sizeof(struct sctp_paramhdr)) > length) { SCTPDBG(SCTP_DEBUG_ASCONF1, "find_initack_addr: invalid offset?\n"); return (0); } /* go through the addresses in the init-ack */ ph = (struct sctp_paramhdr *)sctp_m_getptr(m, offset, sizeof(struct sctp_paramhdr), (uint8_t *)&tmp_param); while (ph != NULL) { ptype = ntohs(ph->param_type); plen = ntohs(ph->param_length); switch (ptype) { #ifdef INET6 case SCTP_IPV6_ADDRESS: if (sa->sa_family == AF_INET6) { /* get the entire IPv6 address param */ if (plen != sizeof(struct sctp_ipv6addr_param)) { break; } /* get the entire IPv6 address param */ a6p = (struct sctp_ipv6addr_param *) sctp_m_getptr(m, offset, sizeof(struct sctp_ipv6addr_param), (uint8_t *)&addr6_store); if (a6p == NULL) { return (0); } sin6 = (struct sockaddr_in6 *)sa; if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) { /* create a copy and clear scope */ memcpy(&sin6_tmp, sin6, sizeof(struct sockaddr_in6)); sin6 = &sin6_tmp; in6_clearscope(&sin6->sin6_addr); } if (memcmp(&sin6->sin6_addr, a6p->addr, sizeof(struct in6_addr)) == 0) { /* found it */ return (1); } } break; #endif /* INET6 */ #ifdef INET case SCTP_IPV4_ADDRESS: if (sa->sa_family == AF_INET) { if (plen != sizeof(struct sctp_ipv4addr_param)) { break; } /* get the entire IPv4 address param */ a4p = (struct sctp_ipv4addr_param *) sctp_m_getptr(m, offset, sizeof(struct sctp_ipv4addr_param), (uint8_t *)&addr4_store); if (a4p == NULL) { return (0); } sin = (struct sockaddr_in *)sa; if (sin->sin_addr.s_addr == a4p->addr) { /* found it */ return (1); } } break; #endif default: break; } /* get next parameter */ offset += SCTP_SIZE32(plen); if (offset + sizeof(struct sctp_paramhdr) > length) { return (0); } ph = (struct sctp_paramhdr *) sctp_m_getptr(m, offset, sizeof(struct sctp_paramhdr), (uint8_t *)&tmp_param); } /* while */ /* not found! */ return (0); } /* * makes sure that the current endpoint local addr list is consistent with * the new association (eg. 
subset bound, asconf allowed) adds addresses as * necessary */ static void sctp_check_address_list_ep(struct sctp_tcb *stcb, struct mbuf *m, int offset, int length, struct sockaddr *init_addr) { struct sctp_laddr *laddr; /* go through the endpoint list */ LIST_FOREACH(laddr, &stcb->sctp_ep->sctp_addr_list, sctp_nxt_addr) { /* be paranoid and validate the laddr */ if (laddr->ifa == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "check_addr_list_ep: laddr->ifa is NULL"); continue; } if (laddr->ifa == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "check_addr_list_ep: laddr->ifa->ifa_addr is NULL"); continue; } /* do i have it implicitly? */ if (sctp_cmpaddr(&laddr->ifa->address.sa, init_addr)) { continue; } /* check to see if in the init-ack */ if (!sctp_addr_in_initack(m, offset, length, &laddr->ifa->address.sa)) { /* try to add it */ sctp_addr_mgmt_assoc(stcb->sctp_ep, stcb, laddr->ifa, SCTP_ADD_IP_ADDRESS, SCTP_ADDR_NOT_LOCKED); } } } /* * makes sure that the current kernel address list is consistent with the new * association (with all addrs bound) adds addresses as necessary */ static void sctp_check_address_list_all(struct sctp_tcb *stcb, struct mbuf *m, int offset, int length, struct sockaddr *init_addr, uint16_t local_scope, uint16_t site_scope, uint16_t ipv4_scope, uint16_t loopback_scope) { struct sctp_vrf *vrf = NULL; struct sctp_ifn *sctp_ifn; struct sctp_ifa *sctp_ifa; uint32_t vrf_id; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif if (stcb) { vrf_id = stcb->asoc.vrf_id; } else { return; } SCTP_IPI_ADDR_RLOCK(); vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) { SCTP_IPI_ADDR_RUNLOCK(); return; } /* go through all our known interfaces */ LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { if (loopback_scope == 0 && SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) { /* skip loopback interface */ continue; } /* go through each interface address */ LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { /* do i have it implicitly? 
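* (i.e. is this the address the INIT-ACK itself was sent from? the peer already has that one, so skip the init-ack list check for it)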
*/ if (sctp_cmpaddr(&sctp_ifa->address.sa, init_addr)) { continue; } switch (sctp_ifa->address.sa.sa_family) { #ifdef INET case AF_INET: sin = &sctp_ifa->address.sin; if (prison_check_ip4(stcb->sctp_ep->ip_inp.inp.inp_cred, &sin->sin_addr) != 0) { continue; } if ((ipv4_scope == 0) && (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr))) { /* private address not in scope */ continue; } break; #endif #ifdef INET6 case AF_INET6: sin6 = &sctp_ifa->address.sin6; if (prison_check_ip6(stcb->sctp_ep->ip_inp.inp.inp_cred, &sin6->sin6_addr) != 0) { continue; } if ((local_scope == 0) && (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))) { continue; } if ((site_scope == 0) && (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr))) { continue; } break; #endif default: break; } /* check to see if in the init-ack */ if (!sctp_addr_in_initack(m, offset, length, &sctp_ifa->address.sa)) { /* try to add it */ sctp_addr_mgmt_assoc(stcb->sctp_ep, stcb, sctp_ifa, SCTP_ADD_IP_ADDRESS, SCTP_ADDR_LOCKED); } } /* end foreach ifa */ } /* end foreach ifn */ SCTP_IPI_ADDR_RUNLOCK(); } /* * validates an init-ack chunk (from a cookie-echo) with current addresses * adds addresses from the init-ack into our local address list, if needed * queues asconf adds/deletes addresses as needed and makes appropriate list * changes for source address selection m, offset: points to the start of the * address list in an init-ack chunk length: total length of the address * params only init_addr: address where my INIT-ACK was sent from */ void sctp_check_address_list(struct sctp_tcb *stcb, struct mbuf *m, int offset, int length, struct sockaddr *init_addr, uint16_t local_scope, uint16_t site_scope, uint16_t ipv4_scope, uint16_t loopback_scope) { /* process the local addresses in the initack */ sctp_process_initack_addresses(stcb, m, offset, length); if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { /* bound all case */ sctp_check_address_list_all(stcb, m, offset, length, init_addr, local_scope, site_scope, ipv4_scope, loopback_scope); } else { /* subset bound case */ if (sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_DO_ASCONF)) { /* asconf's allowed */ sctp_check_address_list_ep(stcb, m, offset, length, init_addr); } /* else, no asconfs allowed, so what we sent is what we get */ } } /* * sctp_bindx() support */ uint32_t sctp_addr_mgmt_ep_sa(struct sctp_inpcb *inp, struct sockaddr *sa, uint32_t type, uint32_t vrf_id, struct sctp_ifa *sctp_ifap) { struct sctp_ifa *ifa; struct sctp_laddr *laddr, *nladdr; if (sa->sa_len == 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_ASCONF, EINVAL); return (EINVAL); } if (sctp_ifap) { ifa = sctp_ifap; } else if (type == SCTP_ADD_IP_ADDRESS) { /* For an add the address MUST be on the system */ ifa = sctp_find_ifa_by_addr(sa, vrf_id, SCTP_ADDR_NOT_LOCKED); } else if (type == SCTP_DEL_IP_ADDRESS) { /* For a delete we need to find it in the inp */ ifa = sctp_find_ifa_in_ep(inp, sa, SCTP_ADDR_NOT_LOCKED); } else { ifa = NULL; } if (ifa != NULL) { if (type == SCTP_ADD_IP_ADDRESS) { sctp_add_local_addr_ep(inp, ifa, type); } else if (type == SCTP_DEL_IP_ADDRESS) { if (inp->laddr_count < 2) { /* can't delete the last local address */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_ASCONF, EINVAL); return (EINVAL); } LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (ifa == laddr->ifa) { /* Mark in the delete */ laddr->action = type; } } } if (LIST_EMPTY(&inp->sctp_asoc_list)) { /* * There is no need to start the iterator if the inp * has no associations. 
*/ if (type == SCTP_DEL_IP_ADDRESS) { LIST_FOREACH_SAFE(laddr, &inp->sctp_addr_list, sctp_nxt_addr, nladdr) { if (laddr->ifa == ifa) { sctp_del_local_addr_ep(inp, ifa); } } } } else { struct sctp_asconf_iterator *asc; struct sctp_laddr *wi; int ret; SCTP_MALLOC(asc, struct sctp_asconf_iterator *, sizeof(struct sctp_asconf_iterator), SCTP_M_ASC_IT); if (asc == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_ASCONF, ENOMEM); return (ENOMEM); } wi = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr); if (wi == NULL) { SCTP_FREE(asc, SCTP_M_ASC_IT); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_ASCONF, ENOMEM); return (ENOMEM); } LIST_INIT(&asc->list_of_work); asc->cnt = 1; SCTP_INCR_LADDR_COUNT(); wi->ifa = ifa; wi->action = type; atomic_add_int(&ifa->refcount, 1); LIST_INSERT_HEAD(&asc->list_of_work, wi, sctp_nxt_addr); ret = sctp_initiate_iterator(sctp_asconf_iterator_ep, sctp_asconf_iterator_stcb, sctp_asconf_iterator_ep_end, SCTP_PCB_ANY_FLAGS, SCTP_PCB_ANY_FEATURES, SCTP_ASOC_ANY_STATE, (void *)asc, 0, sctp_asconf_iterator_end, inp, 0); if (ret) { SCTP_PRINTF("Failed to initiate iterator for addr_mgmt_ep_sa\n"); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_ASCONF, EFAULT); sctp_asconf_iterator_end(asc, 0); return (EFAULT); } } return (0); } else { /* invalid address! */ SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_ASCONF, EADDRNOTAVAIL); return (EADDRNOTAVAIL); } } void sctp_asconf_send_nat_state_update(struct sctp_tcb *stcb, struct sctp_nets *net) { struct sctp_asconf_addr *aa; struct sctp_ifa *sctp_ifap; struct sctp_asconf_tag_param *vtag; #ifdef INET struct sockaddr_in *to; #endif #ifdef INET6 struct sockaddr_in6 *to6; #endif if (net == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "sctp_asconf_send_nat_state_update: Missing net\n"); return; } if (stcb == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "sctp_asconf_send_nat_state_update: Missing stcb\n"); return; } /* * Need to have in the asconf: - vtagparam(my_vtag/peer_vtag) - * add(0.0.0.0) - del(0.0.0.0) - Any global addresses add(addr) */ SCTP_MALLOC(aa, struct sctp_asconf_addr *, sizeof(*aa), SCTP_M_ASC_ADDR); if (aa == NULL) { /* didn't get memory */ SCTPDBG(SCTP_DEBUG_ASCONF1, "sctp_asconf_send_nat_state_update: failed to get memory!\n"); return; } aa->special_del = 0; /* fill in asconf address parameter fields */ /* top level elements are "networked" during send */ aa->ifa = NULL; aa->sent = 0; /* clear sent flag */ vtag = (struct sctp_asconf_tag_param *)&aa->ap.aph; vtag->aph.ph.param_type = SCTP_NAT_VTAGS; vtag->aph.ph.param_length = sizeof(struct sctp_asconf_tag_param); vtag->local_vtag = htonl(stcb->asoc.my_vtag); vtag->remote_vtag = htonl(stcb->asoc.peer_vtag); TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next); SCTP_MALLOC(aa, struct sctp_asconf_addr *, sizeof(*aa), SCTP_M_ASC_ADDR); if (aa == NULL) { /* didn't get memory */ SCTPDBG(SCTP_DEBUG_ASCONF1, "sctp_asconf_send_nat_state_update: failed to get memory!\n"); return; } memset(aa, 0, sizeof(struct sctp_asconf_addr)); /* fill in asconf address parameter fields */ /* ADD(0.0.0.0) */ switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: aa->ap.aph.ph.param_type = SCTP_ADD_IP_ADDRESS; aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_addrv4_param); aa->ap.addrp.ph.param_type = SCTP_IPV4_ADDRESS; aa->ap.addrp.ph.param_length = sizeof(struct sctp_ipv4addr_param); /* No need to add an address, we are using 0.0.0.0 */ TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next); break; #endif #ifdef INET6 case AF_INET6: 
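/* IPv6 form of the wildcard ADD parameter; the address bytes stay all-zero from the memset above */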
aa->ap.aph.ph.param_type = SCTP_ADD_IP_ADDRESS; aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_addr_param); aa->ap.addrp.ph.param_type = SCTP_IPV6_ADDRESS; aa->ap.addrp.ph.param_length = sizeof(struct sctp_ipv6addr_param); /* No need to add an address, we are using 0.0.0.0 */ TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next); break; #endif default: SCTPDBG(SCTP_DEBUG_ASCONF1, "sctp_asconf_send_nat_state_update: unknown address family\n"); SCTP_FREE(aa, SCTP_M_ASC_ADDR); return; } SCTP_MALLOC(aa, struct sctp_asconf_addr *, sizeof(*aa), SCTP_M_ASC_ADDR); if (aa == NULL) { /* didn't get memory */ SCTPDBG(SCTP_DEBUG_ASCONF1, "sctp_asconf_send_nat_state_update: failed to get memory!\n"); return; } memset(aa, 0, sizeof(struct sctp_asconf_addr)); /* fill in asconf address parameter fields */ /* DEL(0.0.0.0) */ switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: aa->ap.aph.ph.param_type = SCTP_DEL_IP_ADDRESS; aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_addrv4_param); aa->ap.addrp.ph.param_type = SCTP_IPV4_ADDRESS; aa->ap.addrp.ph.param_length = sizeof(struct sctp_ipv4addr_param); /* No need to add an address, we are using 0.0.0.0 */ TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next); break; #endif #ifdef INET6 case AF_INET6: aa->ap.aph.ph.param_type = SCTP_DEL_IP_ADDRESS; aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_addr_param); aa->ap.addrp.ph.param_type = SCTP_IPV6_ADDRESS; aa->ap.addrp.ph.param_length = sizeof(struct sctp_ipv6addr_param); /* No need to add an address, we are using 0.0.0.0 */ TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next); break; #endif default: SCTPDBG(SCTP_DEBUG_ASCONF1, "sctp_asconf_send_nat_state_update: unknown address family\n"); SCTP_FREE(aa, SCTP_M_ASC_ADDR); return; } /* Now we must hunt the addresses and add all global addresses */ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { struct sctp_vrf *vrf = NULL; struct sctp_ifn *sctp_ifnp; uint32_t vrf_id; vrf_id = stcb->sctp_ep->def_vrf_id; vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) { goto skip_rest; } SCTP_IPI_ADDR_RLOCK(); LIST_FOREACH(sctp_ifnp, &vrf->ifnlist, next_ifn) { LIST_FOREACH(sctp_ifap, &sctp_ifnp->ifalist, next_ifa) { switch (sctp_ifap->address.sa.sa_family) { #ifdef INET case AF_INET: to = &sctp_ifap->address.sin; if (prison_check_ip4(stcb->sctp_ep->ip_inp.inp.inp_cred, &to->sin_addr) != 0) { continue; } if (IN4_ISPRIVATE_ADDRESS(&to->sin_addr)) { continue; } if (IN4_ISLOOPBACK_ADDRESS(&to->sin_addr)) { continue; } break; #endif #ifdef INET6 case AF_INET6: to6 = &sctp_ifap->address.sin6; if (prison_check_ip6(stcb->sctp_ep->ip_inp.inp.inp_cred, &to6->sin6_addr) != 0) { continue; } if (IN6_IS_ADDR_LOOPBACK(&to6->sin6_addr)) { continue; } if (IN6_IS_ADDR_LINKLOCAL(&to6->sin6_addr)) { continue; } break; #endif default: continue; } sctp_asconf_queue_mgmt(stcb, sctp_ifap, SCTP_ADD_IP_ADDRESS); } } SCTP_IPI_ADDR_RUNLOCK(); } else { struct sctp_laddr *laddr; LIST_FOREACH(laddr, &stcb->sctp_ep->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) { continue; } if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) /* * Address being deleted by the system, don't * list. */ continue; if (laddr->action == SCTP_DEL_IP_ADDRESS) { /* * Address being deleted on this ep don't * list.
*/ continue; } sctp_ifap = laddr->ifa; switch (sctp_ifap->address.sa.sa_family) { #ifdef INET case AF_INET: to = &sctp_ifap->address.sin; if (IN4_ISPRIVATE_ADDRESS(&to->sin_addr)) { continue; } if (IN4_ISLOOPBACK_ADDRESS(&to->sin_addr)) { continue; } break; #endif #ifdef INET6 case AF_INET6: to6 = &sctp_ifap->address.sin6; if (IN6_IS_ADDR_LOOPBACK(&to6->sin6_addr)) { continue; } if (IN6_IS_ADDR_LINKLOCAL(&to6->sin6_addr)) { continue; } break; #endif default: continue; } sctp_asconf_queue_mgmt(stcb, sctp_ifap, SCTP_ADD_IP_ADDRESS); } } skip_rest: /* Now we must send the asconf into the queue */ sctp_send_asconf(stcb, net, SCTP_ADDR_NOT_LOCKED); } Index: projects/clang900-import/sys/sys/elf_common.h =================================================================== --- projects/clang900-import/sys/sys/elf_common.h (revision 352586) +++ projects/clang900-import/sys/sys/elf_common.h (revision 352587) @@ -1,1457 +1,1463 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2017, 2018 Dell EMC * Copyright (c) 2000, 2001, 2008, 2011, David E. O'Brien * Copyright (c) 1998 John D. Polstra. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_ELF_COMMON_H_ #define _SYS_ELF_COMMON_H_ 1 /* * ELF definitions that are independent of architecture or word size. */ /* * Note header. The ".note" section contains an array of notes. Each * begins with this header, aligned to a word boundary. Immediately * following the note header is n_namesz bytes of name, padded to the * next word boundary. Then comes n_descsz bytes of descriptor, again * padded to a word boundary. The values of n_namesz and n_descsz do * not include the padding. */ typedef struct { u_int32_t n_namesz; /* Length of name. */ u_int32_t n_descsz; /* Length of descriptor. */ u_int32_t n_type; /* Type of this note. */ } Elf_Note; typedef Elf_Note Elf_Nhdr; /* * Option kinds. 
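The Elf_Note header above fully determines the note layout: the name and descriptor follow the header, each padded out to the next word boundary, with the padding excluded from n_namesz and n_descsz. A minimal sketch (not part of the header, assuming the usual 4-byte note alignment, bounds checking omitted) of stepping from one note to the next:

	#include <stdint.h>

	typedef struct {		/* mirrors Elf_Note / Elf_Nhdr above */
		uint32_t n_namesz;
		uint32_t n_descsz;
		uint32_t n_type;
	} note_hdr;

	#define	NOTE_ROUNDUP(x)	(((x) + 3u) & ~3u)	/* pad to the next word boundary */

	/* Return a pointer to the note following "nh". */
	static const note_hdr *
	next_note(const note_hdr *nh)
	{
		const char *p = (const char *)(nh + 1);	/* name starts after the header */

		p += NOTE_ROUNDUP(nh->n_namesz);	/* padded name */
		p += NOTE_ROUNDUP(nh->n_descsz);	/* padded descriptor */
		return ((const note_hdr *)p);
	}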
*/ #define ODK_NULL 0 /* undefined */ #define ODK_REGINFO 1 /* register usage info */ #define ODK_EXCEPTIONS 2 /* exception processing info */ #define ODK_PAD 3 /* section padding */ #define ODK_HWPATCH 4 /* hardware patch applied */ #define ODK_FILL 5 /* fill value used by the linker */ #define ODK_TAGS 6 /* reserved space for tools */ #define ODK_HWAND 7 /* hardware AND patch applied */ #define ODK_HWOR 8 /* hardware OR patch applied */ #define ODK_GP_GROUP 9 /* GP group for text/data sections */ #define ODK_IDENT 10 /* ID information */ #define ODK_PAGESIZE 11 /* page size information */ /* * ODK_EXCEPTIONS info field masks. */ #define OEX_FPU_MIN 0x0000001f /* min FPU exception required */ #define OEX_FPU_MAX 0x00001f00 /* max FPU exception allowed */ #define OEX_PAGE0 0x00010000 /* page zero must be mapped */ #define OEX_SMM 0x00020000 /* run in sequential memory mode */ #define OEX_PRECISEFP 0x00040000 /* run in precise FP exception mode */ #define OEX_DISMISS 0x00080000 /* dismiss invalid address traps */ /* * ODK_PAD info field masks. */ #define OPAD_PREFIX 0x0001 #define OPAD_POSTFIX 0x0002 #define OPAD_SYMBOL 0x0004 /* * ODK_HWPATCH info field masks. */ #define OHW_R4KEOP 0x00000001 /* patch for R4000 branch at end-of-page bug */ #define OHW_R8KPFETCH 0x00000002 /* R8000 prefetch bug may occur */ #define OHW_R5KEOP 0x00000004 /* patch for R5000 branch at end-of-page bug */ #define OHW_R5KCVTL 0x00000008 /* R5000 cvt.[ds].l bug: clean == 1 */ #define OHW_R10KLDL 0x00000010UL /* need patch for R10000 misaligned load */ /* * ODK_HWAND/ODK_HWOR info field and hwp_flags[12] masks. */ #define OHWA0_R4KEOP_CHECKED 0x00000001 /* object checked for R4000 end-of-page bug */ #define OHWA0_R4KEOP_CLEAN 0x00000002 /* object verified clean for R4000 end-of-page bug */ #define OHWO0_FIXADE 0x00000001 /* object requires call to fixade */ /* * ODK_IDENT/ODK_GP_GROUP info field masks. */ #define OGP_GROUP 0x0000ffff /* GP group number */ #define OGP_SELF 0x00010000 /* GP group is self-contained */ /* * The header for GNU-style hash sections. */ typedef struct { u_int32_t gh_nbuckets; /* Number of hash buckets. */ u_int32_t gh_symndx; /* First visible symbol in .dynsym. */ u_int32_t gh_maskwords; /* #maskwords used in bloom filter. */ u_int32_t gh_shift2; /* Bloom filter shift count. */ } Elf_GNU_Hash_Header; /* Indexes into the e_ident array. Keep synced with http://www.sco.com/developers/gabi/latest/ch4.eheader.html */ #define EI_MAG0 0 /* Magic number, byte 0. */ #define EI_MAG1 1 /* Magic number, byte 1. */ #define EI_MAG2 2 /* Magic number, byte 2. */ #define EI_MAG3 3 /* Magic number, byte 3. */ #define EI_CLASS 4 /* Class of machine. */ #define EI_DATA 5 /* Data format. */ #define EI_VERSION 6 /* ELF format version. */ #define EI_OSABI 7 /* Operating system / ABI identification */ #define EI_ABIVERSION 8 /* ABI version */ #define OLD_EI_BRAND 8 /* Start of architecture identification. */ #define EI_PAD 9 /* Start of padding (per SVR4 ABI). */ #define EI_NIDENT 16 /* Size of e_ident array. */ /* Values for the magic number bytes. */ #define ELFMAG0 0x7f #define ELFMAG1 'E' #define ELFMAG2 'L' #define ELFMAG3 'F' #define ELFMAG "\177ELF" /* magic string */ #define SELFMAG 4 /* magic string size */ /* Values for e_ident[EI_VERSION] and e_version. */ #define EV_NONE 0 #define EV_CURRENT 1 /* Values for e_ident[EI_CLASS]. */ #define ELFCLASSNONE 0 /* Unknown class. */ #define ELFCLASS32 1 /* 32-bit architecture. */ #define ELFCLASS64 2 /* 64-bit architecture. */ /* Values for e_ident[EI_DATA]. 
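Elf_GNU_Hash_Header above only describes the section header itself; the symbol names referenced by .gnu.hash sections are hashed with the DJB-derived function conventionally used by linkers and runtime loaders. Shown here purely for context, as an assumption about the consumer rather than anything defined in this file:

	#include <stdint.h>

	/* Conventional GNU symbol hash: h = h * 33 + c, seeded with 5381. */
	static uint32_t
	gnu_hash(const char *name)
	{
		uint32_t h = 5381;

		while (*name != '\0')
			h = h * 33 + (unsigned char)*name++;
		return (h);
	}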
*/ #define ELFDATANONE 0 /* Unknown data format. */ #define ELFDATA2LSB 1 /* 2's complement little-endian. */ #define ELFDATA2MSB 2 /* 2's complement big-endian. */ /* Values for e_ident[EI_OSABI]. */ #define ELFOSABI_NONE 0 /* UNIX System V ABI */ #define ELFOSABI_HPUX 1 /* HP-UX operating system */ #define ELFOSABI_NETBSD 2 /* NetBSD */ #define ELFOSABI_LINUX 3 /* GNU/Linux */ #define ELFOSABI_HURD 4 /* GNU/Hurd */ #define ELFOSABI_86OPEN 5 /* 86Open common IA32 ABI */ #define ELFOSABI_SOLARIS 6 /* Solaris */ #define ELFOSABI_AIX 7 /* AIX */ #define ELFOSABI_IRIX 8 /* IRIX */ #define ELFOSABI_FREEBSD 9 /* FreeBSD */ #define ELFOSABI_TRU64 10 /* TRU64 UNIX */ #define ELFOSABI_MODESTO 11 /* Novell Modesto */ #define ELFOSABI_OPENBSD 12 /* OpenBSD */ #define ELFOSABI_OPENVMS 13 /* Open VMS */ #define ELFOSABI_NSK 14 /* HP Non-Stop Kernel */ #define ELFOSABI_AROS 15 /* Amiga Research OS */ #define ELFOSABI_FENIXOS 16 /* FenixOS */ #define ELFOSABI_CLOUDABI 17 /* Nuxi CloudABI */ #define ELFOSABI_ARM_AEABI 64 /* ARM EABI */ #define ELFOSABI_ARM 97 /* ARM */ #define ELFOSABI_STANDALONE 255 /* Standalone (embedded) application */ #define ELFOSABI_SYSV ELFOSABI_NONE /* symbol used in old spec */ #define ELFOSABI_MONTEREY ELFOSABI_AIX /* Monterey */ #define ELFOSABI_GNU ELFOSABI_LINUX /* e_ident */ #define IS_ELF(ehdr) ((ehdr).e_ident[EI_MAG0] == ELFMAG0 && \ (ehdr).e_ident[EI_MAG1] == ELFMAG1 && \ (ehdr).e_ident[EI_MAG2] == ELFMAG2 && \ (ehdr).e_ident[EI_MAG3] == ELFMAG3) /* Values for e_type. */ #define ET_NONE 0 /* Unknown type. */ #define ET_REL 1 /* Relocatable. */ #define ET_EXEC 2 /* Executable. */ #define ET_DYN 3 /* Shared object. */ #define ET_CORE 4 /* Core file. */ #define ET_LOOS 0xfe00 /* First operating system specific. */ #define ET_HIOS 0xfeff /* Last operating system-specific. */ #define ET_LOPROC 0xff00 /* First processor-specific. */ #define ET_HIPROC 0xffff /* Last processor-specific. */ /* Values for e_machine. */ #define EM_NONE 0 /* Unknown machine. */ #define EM_M32 1 /* AT&T WE32100. */ #define EM_SPARC 2 /* Sun SPARC. */ #define EM_386 3 /* Intel i386. */ #define EM_68K 4 /* Motorola 68000. */ #define EM_88K 5 /* Motorola 88000. */ #define EM_IAMCU 6 /* Intel MCU. */ #define EM_860 7 /* Intel i860. */ #define EM_MIPS 8 /* MIPS R3000 Big-Endian only. */ #define EM_S370 9 /* IBM System/370. */ #define EM_MIPS_RS3_LE 10 /* MIPS R3000 Little-Endian. */ #define EM_PARISC 15 /* HP PA-RISC. */ #define EM_VPP500 17 /* Fujitsu VPP500. */ #define EM_SPARC32PLUS 18 /* SPARC v8plus. */ #define EM_960 19 /* Intel 80960. */ #define EM_PPC 20 /* PowerPC 32-bit. */ #define EM_PPC64 21 /* PowerPC 64-bit. */ #define EM_S390 22 /* IBM System/390. */ #define EM_V800 36 /* NEC V800. */ #define EM_FR20 37 /* Fujitsu FR20. */ #define EM_RH32 38 /* TRW RH-32. */ #define EM_RCE 39 /* Motorola RCE. */ #define EM_ARM 40 /* ARM. */ #define EM_SH 42 /* Hitachi SH. */ #define EM_SPARCV9 43 /* SPARC v9 64-bit. */ #define EM_TRICORE 44 /* Siemens TriCore embedded processor. */ #define EM_ARC 45 /* Argonaut RISC Core. */ #define EM_H8_300 46 /* Hitachi H8/300. */ #define EM_H8_300H 47 /* Hitachi H8/300H. */ #define EM_H8S 48 /* Hitachi H8S. */ #define EM_H8_500 49 /* Hitachi H8/500. */ #define EM_IA_64 50 /* Intel IA-64 Processor. */ #define EM_MIPS_X 51 /* Stanford MIPS-X. */ #define EM_COLDFIRE 52 /* Motorola ColdFire. */ #define EM_68HC12 53 /* Motorola M68HC12. */ #define EM_MMA 54 /* Fujitsu MMA. */ #define EM_PCP 55 /* Siemens PCP. */ #define EM_NCPU 56 /* Sony nCPU. 
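A hedged illustration of how the e_ident indexes and the IS_ELF macro above are typically combined to sanity-check a file header; Elf64_Ehdr comes from <elf.h> (sys/elf64.h), not from this file, and the class/byte-order expectations are this example's assumption:

	#include <elf.h>
	#include <stdbool.h>

	static bool
	is_native_freebsd_object(const Elf64_Ehdr *eh)
	{
		if (!IS_ELF(*eh))
			return (false);		/* magic bytes 0x7f 'E' 'L' 'F' */
		if (eh->e_ident[EI_CLASS] != ELFCLASS64 ||
		    eh->e_ident[EI_DATA] != ELFDATA2LSB)
			return (false);		/* expect 64-bit little-endian here */
		return (eh->e_ident[EI_OSABI] == ELFOSABI_FREEBSD ||
		    eh->e_ident[EI_OSABI] == ELFOSABI_NONE);
	}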
*/ #define EM_NDR1 57 /* Denso NDR1 microprocessor. */ #define EM_STARCORE 58 /* Motorola Star*Core processor. */ #define EM_ME16 59 /* Toyota ME16 processor. */ #define EM_ST100 60 /* STMicroelectronics ST100 processor. */ #define EM_TINYJ 61 /* Advanced Logic Corp. TinyJ processor. */ #define EM_X86_64 62 /* Advanced Micro Devices x86-64 */ #define EM_AMD64 EM_X86_64 /* Advanced Micro Devices x86-64 (compat) */ #define EM_PDSP 63 /* Sony DSP Processor. */ #define EM_FX66 66 /* Siemens FX66 microcontroller. */ #define EM_ST9PLUS 67 /* STMicroelectronics ST9+ 8/16 microcontroller. */ #define EM_ST7 68 /* STmicroelectronics ST7 8-bit microcontroller. */ #define EM_68HC16 69 /* Motorola MC68HC16 microcontroller. */ #define EM_68HC11 70 /* Motorola MC68HC11 microcontroller. */ #define EM_68HC08 71 /* Motorola MC68HC08 microcontroller. */ #define EM_68HC05 72 /* Motorola MC68HC05 microcontroller. */ #define EM_SVX 73 /* Silicon Graphics SVx. */ #define EM_ST19 74 /* STMicroelectronics ST19 8-bit mc. */ #define EM_VAX 75 /* Digital VAX. */ #define EM_CRIS 76 /* Axis Communications 32-bit embedded processor. */ #define EM_JAVELIN 77 /* Infineon Technologies 32-bit embedded processor. */ #define EM_FIREPATH 78 /* Element 14 64-bit DSP Processor. */ #define EM_ZSP 79 /* LSI Logic 16-bit DSP Processor. */ #define EM_MMIX 80 /* Donald Knuth's educational 64-bit proc. */ #define EM_HUANY 81 /* Harvard University machine-independent object files. */ #define EM_PRISM 82 /* SiTera Prism. */ #define EM_AVR 83 /* Atmel AVR 8-bit microcontroller. */ #define EM_FR30 84 /* Fujitsu FR30. */ #define EM_D10V 85 /* Mitsubishi D10V. */ #define EM_D30V 86 /* Mitsubishi D30V. */ #define EM_V850 87 /* NEC v850. */ #define EM_M32R 88 /* Mitsubishi M32R. */ #define EM_MN10300 89 /* Matsushita MN10300. */ #define EM_MN10200 90 /* Matsushita MN10200. */ #define EM_PJ 91 /* picoJava. */ #define EM_OPENRISC 92 /* OpenRISC 32-bit embedded processor. */ #define EM_ARC_A5 93 /* ARC Cores Tangent-A5. */ #define EM_XTENSA 94 /* Tensilica Xtensa Architecture. */ #define EM_VIDEOCORE 95 /* Alphamosaic VideoCore processor. */ #define EM_TMM_GPP 96 /* Thompson Multimedia General Purpose Processor. */ #define EM_NS32K 97 /* National Semiconductor 32000 series. */ #define EM_TPC 98 /* Tenor Network TPC processor. */ #define EM_SNP1K 99 /* Trebia SNP 1000 processor. */ #define EM_ST200 100 /* STMicroelectronics ST200 microcontroller. */ #define EM_IP2K 101 /* Ubicom IP2xxx microcontroller family. */ #define EM_MAX 102 /* MAX Processor. */ #define EM_CR 103 /* National Semiconductor CompactRISC microprocessor. */ #define EM_F2MC16 104 /* Fujitsu F2MC16. */ #define EM_MSP430 105 /* Texas Instruments embedded microcontroller msp430. */ #define EM_BLACKFIN 106 /* Analog Devices Blackfin (DSP) processor. */ #define EM_SE_C33 107 /* S1C33 Family of Seiko Epson processors. */ #define EM_SEP 108 /* Sharp embedded microprocessor. */ #define EM_ARCA 109 /* Arca RISC Microprocessor. */ #define EM_UNICORE 110 /* Microprocessor series from PKU-Unity Ltd. and MPRC of Peking University */ #define EM_AARCH64 183 /* AArch64 (64-bit ARM) */ #define EM_RISCV 243 /* RISC-V */ /* Non-standard or deprecated. */ #define EM_486 6 /* Intel i486. */ #define EM_MIPS_RS4_BE 10 /* MIPS R4000 Big-Endian */ #define EM_ALPHA_STD 41 /* Digital Alpha (standard value). 
*/ #define EM_ALPHA 0x9026 /* Alpha (written in the absence of an ABI) */ /** * e_flags */ #define EF_ARM_RELEXEC 0x1 #define EF_ARM_HASENTRY 0x2 #define EF_ARM_SYMSARESORTED 0x4 #define EF_ARM_DYNSYMSUSESEGIDX 0x8 #define EF_ARM_MAPSYMSFIRST 0x10 #define EF_ARM_LE8 0x00400000 #define EF_ARM_BE8 0x00800000 #define EF_ARM_EABIMASK 0xFF000000 #define EF_ARM_EABI_UNKNOWN 0x00000000 #define EF_ARM_EABI_VER1 0x01000000 #define EF_ARM_EABI_VER2 0x02000000 #define EF_ARM_EABI_VER3 0x03000000 #define EF_ARM_EABI_VER4 0x04000000 #define EF_ARM_EABI_VER5 0x05000000 #define EF_ARM_INTERWORK 0x00000004 #define EF_ARM_APCS_26 0x00000008 #define EF_ARM_APCS_FLOAT 0x00000010 #define EF_ARM_PIC 0x00000020 #define EF_ARM_ALIGN8 0x00000040 #define EF_ARM_NEW_ABI 0x00000080 #define EF_ARM_OLD_ABI 0x00000100 #define EF_ARM_ABI_FLOAT_SOFT 0x00000200 #define EF_ARM_SOFT_FLOAT EF_ARM_ABI_FLOAT_SOFT /* Pre-V5 ABI name */ #define EF_ARM_ABI_FLOAT_HARD 0x00000400 #define EF_ARM_VFP_FLOAT EF_ARM_ABI_FLOAT_HARD /* Pre-V5 ABI name */ #define EF_ARM_MAVERICK_FLOAT 0x00000800 #define EF_MIPS_NOREORDER 0x00000001 #define EF_MIPS_PIC 0x00000002 /* Contains PIC code */ #define EF_MIPS_CPIC 0x00000004 /* STD PIC calling sequence */ #define EF_MIPS_UCODE 0x00000010 #define EF_MIPS_ABI2 0x00000020 /* N32 */ #define EF_MIPS_OPTIONS_FIRST 0x00000080 #define EF_MIPS_ABI 0x0000F000 #define EF_MIPS_ABI_O32 0x00001000 #define EF_MIPS_ABI_O64 0x00002000 #define EF_MIPS_ABI_EABI32 0x00003000 #define EF_MIPS_ABI_EABI64 0x00004000 #define EF_MIPS_ARCH_ASE 0x0F000000 /* Architectural extensions */ #define EF_MIPS_ARCH_ASE_MDMX 0x08000000 /* MDMX multimedia extension */ #define EF_MIPS_ARCH_ASE_M16 0x04000000 /* MIPS-16 ISA extensions */ #define EF_MIPS_ARCH 0xF0000000 /* Architecture field */ #define EF_MIPS_ARCH_1 0x00000000 /* -mips1 code */ #define EF_MIPS_ARCH_2 0x10000000 /* -mips2 code */ #define EF_MIPS_ARCH_3 0x20000000 /* -mips3 code */ #define EF_MIPS_ARCH_4 0x30000000 /* -mips4 code */ #define EF_MIPS_ARCH_5 0x40000000 /* -mips5 code */ #define EF_MIPS_ARCH_32 0x50000000 /* -mips32 code */ #define EF_MIPS_ARCH_64 0x60000000 /* -mips64 code */ #define EF_MIPS_ARCH_32R2 0x70000000 /* -mips32r2 code */ #define EF_MIPS_ARCH_64R2 0x80000000 /* -mips64r2 code */ #define EF_PPC_EMB 0x80000000 #define EF_PPC_RELOCATABLE 0x00010000 #define EF_PPC_RELOCATABLE_LIB 0x00008000 #define EF_RISCV_RVC 0x00000001 #define EF_RISCV_FLOAT_ABI_MASK 0x00000006 #define EF_RISCV_FLOAT_ABI_SOFT 0x00000000 #define EF_RISCV_FLOAT_ABI_SINGLE 0x000002 #define EF_RISCV_FLOAT_ABI_DOUBLE 0x000004 #define EF_RISCV_FLOAT_ABI_QUAD 0x00000006 #define EF_RISCV_RVE 0x00000008 #define EF_RISCV_TSO 0x00000010 #define EF_SPARC_EXT_MASK 0x00ffff00 #define EF_SPARC_32PLUS 0x00000100 #define EF_SPARC_SUN_US1 0x00000200 #define EF_SPARC_HAL_R1 0x00000200 #define EF_SPARC_SUN_US3 0x00000800 #define EF_SPARCV9_MM 0x00000003 #define EF_SPARCV9_TSO 0x00000000 #define EF_SPARCV9_PSO 0x00000001 #define EF_SPARCV9_RMO 0x00000002 /* Special section indexes. */ #define SHN_UNDEF 0 /* Undefined, missing, irrelevant. */ #define SHN_LORESERVE 0xff00 /* First of reserved range. */ #define SHN_LOPROC 0xff00 /* First processor-specific. */ #define SHN_HIPROC 0xff1f /* Last processor-specific. */ #define SHN_LOOS 0xff20 /* First operating system-specific. */ #define SHN_FBSD_CACHED SHN_LOOS /* Transient, for sys/kern/link_elf_obj linker only: Cached global in local symtab. */ #define SHN_HIOS 0xff3f /* Last operating system-specific. */ #define SHN_ABS 0xfff1 /* Absolute values. 
*/ #define SHN_COMMON 0xfff2 /* Common data. */ #define SHN_XINDEX 0xffff /* Escape -- index stored elsewhere. */ #define SHN_HIRESERVE 0xffff /* Last of reserved range. */ /* sh_type */ #define SHT_NULL 0 /* inactive */ #define SHT_PROGBITS 1 /* program defined information */ #define SHT_SYMTAB 2 /* symbol table section */ #define SHT_STRTAB 3 /* string table section */ #define SHT_RELA 4 /* relocation section with addends */ #define SHT_HASH 5 /* symbol hash table section */ #define SHT_DYNAMIC 6 /* dynamic section */ #define SHT_NOTE 7 /* note section */ #define SHT_NOBITS 8 /* no space section */ #define SHT_REL 9 /* relocation section - no addends */ #define SHT_SHLIB 10 /* reserved - purpose unknown */ #define SHT_DYNSYM 11 /* dynamic symbol table section */ #define SHT_INIT_ARRAY 14 /* Initialization function pointers. */ #define SHT_FINI_ARRAY 15 /* Termination function pointers. */ #define SHT_PREINIT_ARRAY 16 /* Pre-initialization function ptrs. */ #define SHT_GROUP 17 /* Section group. */ #define SHT_SYMTAB_SHNDX 18 /* Section indexes (see SHN_XINDEX). */ #define SHT_LOOS 0x60000000 /* First of OS specific semantics */ #define SHT_LOSUNW 0x6ffffff4 #define SHT_SUNW_dof 0x6ffffff4 #define SHT_SUNW_cap 0x6ffffff5 #define SHT_GNU_ATTRIBUTES 0x6ffffff5 #define SHT_SUNW_SIGNATURE 0x6ffffff6 #define SHT_GNU_HASH 0x6ffffff6 #define SHT_GNU_LIBLIST 0x6ffffff7 #define SHT_SUNW_ANNOTATE 0x6ffffff7 #define SHT_SUNW_DEBUGSTR 0x6ffffff8 #define SHT_SUNW_DEBUG 0x6ffffff9 #define SHT_SUNW_move 0x6ffffffa #define SHT_SUNW_COMDAT 0x6ffffffb #define SHT_SUNW_syminfo 0x6ffffffc #define SHT_SUNW_verdef 0x6ffffffd #define SHT_GNU_verdef 0x6ffffffd /* Symbol versions provided */ #define SHT_SUNW_verneed 0x6ffffffe #define SHT_GNU_verneed 0x6ffffffe /* Symbol versions required */ #define SHT_SUNW_versym 0x6fffffff #define SHT_GNU_versym 0x6fffffff /* Symbol version table */ #define SHT_HISUNW 0x6fffffff #define SHT_HIOS 0x6fffffff /* Last of OS specific semantics */ #define SHT_LOPROC 0x70000000 /* reserved range for processor */ #define SHT_X86_64_UNWIND 0x70000001 /* unwind information */ #define SHT_AMD64_UNWIND SHT_X86_64_UNWIND #define SHT_ARM_EXIDX 0x70000001 /* Exception index table. */ #define SHT_ARM_PREEMPTMAP 0x70000002 /* BPABI DLL dynamic linking pre-emption map. */ #define SHT_ARM_ATTRIBUTES 0x70000003 /* Object file compatibility attributes. */ #define SHT_ARM_DEBUGOVERLAY 0x70000004 /* See DBGOVL for details. */ #define SHT_ARM_OVERLAYSECTION 0x70000005 /* See DBGOVL for details. 
*/ #define SHT_MIPS_LIBLIST 0x70000000 #define SHT_MIPS_MSYM 0x70000001 #define SHT_MIPS_CONFLICT 0x70000002 #define SHT_MIPS_GPTAB 0x70000003 #define SHT_MIPS_UCODE 0x70000004 #define SHT_MIPS_DEBUG 0x70000005 #define SHT_MIPS_REGINFO 0x70000006 #define SHT_MIPS_PACKAGE 0x70000007 #define SHT_MIPS_PACKSYM 0x70000008 #define SHT_MIPS_RELD 0x70000009 #define SHT_MIPS_IFACE 0x7000000b #define SHT_MIPS_CONTENT 0x7000000c #define SHT_MIPS_OPTIONS 0x7000000d #define SHT_MIPS_DELTASYM 0x7000001b #define SHT_MIPS_DELTAINST 0x7000001c #define SHT_MIPS_DELTACLASS 0x7000001d #define SHT_MIPS_DWARF 0x7000001e /* MIPS gcc uses MIPS_DWARF */ #define SHT_MIPS_DELTADECL 0x7000001f #define SHT_MIPS_SYMBOL_LIB 0x70000020 #define SHT_MIPS_EVENTS 0x70000021 #define SHT_MIPS_TRANSLATE 0x70000022 #define SHT_MIPS_PIXIE 0x70000023 #define SHT_MIPS_XLATE 0x70000024 #define SHT_MIPS_XLATE_DEBUG 0x70000025 #define SHT_MIPS_WHIRL 0x70000026 #define SHT_MIPS_EH_REGION 0x70000027 #define SHT_MIPS_XLATE_OLD 0x70000028 #define SHT_MIPS_PDR_EXCEPTION 0x70000029 #define SHT_MIPS_ABIFLAGS 0x7000002a #define SHT_SPARC_GOTDATA 0x70000000 #define SHTORDERED #define SHT_HIPROC 0x7fffffff /* specific section header types */ #define SHT_LOUSER 0x80000000 /* reserved range for application */ #define SHT_HIUSER 0xffffffff /* specific indexes */ /* Flags for sh_flags. */ #define SHF_WRITE 0x1 /* Section contains writable data. */ #define SHF_ALLOC 0x2 /* Section occupies memory. */ #define SHF_EXECINSTR 0x4 /* Section contains instructions. */ #define SHF_MERGE 0x10 /* Section may be merged. */ #define SHF_STRINGS 0x20 /* Section contains strings. */ #define SHF_INFO_LINK 0x40 /* sh_info holds section index. */ #define SHF_LINK_ORDER 0x80 /* Special ordering requirements. */ #define SHF_OS_NONCONFORMING 0x100 /* OS-specific processing required. */ #define SHF_GROUP 0x200 /* Member of section group. */ #define SHF_TLS 0x400 /* Section contains TLS data. */ #define SHF_COMPRESSED 0x800 /* Section contains compressed data. */ #define SHF_MASKOS 0x0ff00000 /* OS-specific semantics. */ #define SHF_MASKPROC 0xf0000000 /* Processor-specific semantics. */ /* Flags for section groups. */ #define GRP_COMDAT 0x1 /* COMDAT semantics. */ /* * Flags / mask for .gnu.versym sections. */ #define VERSYM_VERSION 0x7fff #define VERSYM_HIDDEN 0x8000 /* Values for p_type. */ #define PT_NULL 0 /* Unused entry. */ #define PT_LOAD 1 /* Loadable segment. */ #define PT_DYNAMIC 2 /* Dynamic linking information segment. */ #define PT_INTERP 3 /* Pathname of interpreter. */ #define PT_NOTE 4 /* Auxiliary information. */ #define PT_SHLIB 5 /* Reserved (not used). */ #define PT_PHDR 6 /* Location of program header itself. */ #define PT_TLS 7 /* Thread local storage segment */ #define PT_LOOS 0x60000000 /* First OS-specific. */ #define PT_SUNW_UNWIND 0x6464e550 /* amd64 UNWIND program header */ #define PT_GNU_EH_FRAME 0x6474e550 #define PT_GNU_STACK 0x6474e551 #define PT_GNU_RELRO 0x6474e552 #define PT_DUMP_DELTA 0x6fb5d000 /* va->pa map for kernel dumps (currently arm). */ #define PT_LOSUNW 0x6ffffffa #define PT_SUNWBSS 0x6ffffffa /* Sun Specific segment */ #define PT_SUNWSTACK 0x6ffffffb /* describes the stack segment */ #define PT_SUNWDTRACE 0x6ffffffc /* private */ #define PT_SUNWCAP 0x6ffffffd /* hard/soft capabilities segment */ #define PT_HISUNW 0x6fffffff #define PT_HIOS 0x6fffffff /* Last OS-specific. */ #define PT_LOPROC 0x70000000 /* First processor-specific type. */ #define PT_ARM_ARCHEXT 0x70000000 /* ARM arch compat information. 
*/ #define PT_ARM_EXIDX 0x70000001 /* ARM exception unwind tables. */ #define PT_HIPROC 0x7fffffff /* Last processor-specific type. */ #define PT_OPENBSD_RANDOMIZE 0x65A3DBE6 /* OpenBSD random data segment */ #define PT_OPENBSD_WXNEEDED 0x65A3DBE7 /* OpenBSD EXEC/WRITE pages needed */ #define PT_OPENBSD_BOOTDATA 0x65A41BE6 /* OpenBSD section for boot args */ /* Values for p_flags. */ #define PF_X 0x1 /* Executable. */ #define PF_W 0x2 /* Writable. */ #define PF_R 0x4 /* Readable. */ #define PF_MASKOS 0x0ff00000 /* Operating system-specific. */ #define PF_MASKPROC 0xf0000000 /* Processor-specific. */ /* Extended program header index. */ #define PN_XNUM 0xffff /* Values for d_tag. */ #define DT_NULL 0 /* Terminating entry. */ #define DT_NEEDED 1 /* String table offset of a needed shared library. */ #define DT_PLTRELSZ 2 /* Total size in bytes of PLT relocations. */ #define DT_PLTGOT 3 /* Processor-dependent address. */ #define DT_HASH 4 /* Address of symbol hash table. */ #define DT_STRTAB 5 /* Address of string table. */ #define DT_SYMTAB 6 /* Address of symbol table. */ #define DT_RELA 7 /* Address of ElfNN_Rela relocations. */ #define DT_RELASZ 8 /* Total size of ElfNN_Rela relocations. */ #define DT_RELAENT 9 /* Size of each ElfNN_Rela relocation entry. */ #define DT_STRSZ 10 /* Size of string table. */ #define DT_SYMENT 11 /* Size of each symbol table entry. */ #define DT_INIT 12 /* Address of initialization function. */ #define DT_FINI 13 /* Address of finalization function. */ #define DT_SONAME 14 /* String table offset of shared object name. */ #define DT_RPATH 15 /* String table offset of library path. [sup] */ #define DT_SYMBOLIC 16 /* Indicates "symbolic" linking. [sup] */ #define DT_REL 17 /* Address of ElfNN_Rel relocations. */ #define DT_RELSZ 18 /* Total size of ElfNN_Rel relocations. */ #define DT_RELENT 19 /* Size of each ElfNN_Rel relocation. */ #define DT_PLTREL 20 /* Type of relocation used for PLT. */ #define DT_DEBUG 21 /* Reserved (not used). */ #define DT_TEXTREL 22 /* Indicates there may be relocations in non-writable segments. [sup] */ #define DT_JMPREL 23 /* Address of PLT relocations. */ #define DT_BIND_NOW 24 /* [sup] */ #define DT_INIT_ARRAY 25 /* Address of the array of pointers to initialization functions */ #define DT_FINI_ARRAY 26 /* Address of the array of pointers to termination functions */ #define DT_INIT_ARRAYSZ 27 /* Size in bytes of the array of initialization functions. */ #define DT_FINI_ARRAYSZ 28 /* Size in bytes of the array of termination functions. */ #define DT_RUNPATH 29 /* String table offset of a null-terminated library search path string. */ #define DT_FLAGS 30 /* Object specific flag values. */ #define DT_ENCODING 32 /* Values greater than or equal to DT_ENCODING and less than DT_LOOS follow the rules for the interpretation of the d_un union as follows: even == 'd_ptr', odd == 'd_val' or none */ #define DT_PREINIT_ARRAY 32 /* Address of the array of pointers to pre-initialization functions. */ #define DT_PREINIT_ARRAYSZ 33 /* Size in bytes of the array of pre-initialization functions. 
*/ #define DT_MAXPOSTAGS 34 /* number of positive tags */ #define DT_LOOS 0x6000000d /* First OS-specific */ #define DT_SUNW_AUXILIARY 0x6000000d /* symbol auxiliary name */ #define DT_SUNW_RTLDINF 0x6000000e /* ld.so.1 info (private) */ #define DT_SUNW_FILTER 0x6000000f /* symbol filter name */ #define DT_SUNW_CAP 0x60000010 /* hardware/software */ #define DT_SUNW_ASLR 0x60000023 /* ASLR control */ #define DT_HIOS 0x6ffff000 /* Last OS-specific */ /* * DT_* entries which fall between DT_VALRNGHI & DT_VALRNGLO use the * Dyn.d_un.d_val field of the Elf*_Dyn structure. */ #define DT_VALRNGLO 0x6ffffd00 #define DT_GNU_PRELINKED 0x6ffffdf5 /* prelinking timestamp */ #define DT_GNU_CONFLICTSZ 0x6ffffdf6 /* size of conflict section */ #define DT_GNU_LIBLISTSZ 0x6ffffdf7 /* size of library list */ #define DT_CHECKSUM 0x6ffffdf8 /* elf checksum */ #define DT_PLTPADSZ 0x6ffffdf9 /* pltpadding size */ #define DT_MOVEENT 0x6ffffdfa /* move table entry size */ #define DT_MOVESZ 0x6ffffdfb /* move table size */ #define DT_FEATURE 0x6ffffdfc /* feature holder */ #define DT_FEATURE_1 DT_FEATURE #define DT_POSFLAG_1 0x6ffffdfd /* flags for DT_* entries, effecting */ /* the following DT_* entry. */ /* See DF_P1_* definitions */ #define DT_SYMINSZ 0x6ffffdfe /* syminfo table size (in bytes) */ #define DT_SYMINENT 0x6ffffdff /* syminfo entry size (in bytes) */ #define DT_VALRNGHI 0x6ffffdff /* * DT_* entries which fall between DT_ADDRRNGHI & DT_ADDRRNGLO use the * Dyn.d_un.d_ptr field of the Elf*_Dyn structure. * * If any adjustment is made to the ELF object after it has been * built, these entries will need to be adjusted. */ #define DT_ADDRRNGLO 0x6ffffe00 #define DT_GNU_HASH 0x6ffffef5 /* GNU-style hash table */ #define DT_TLSDESC_PLT 0x6ffffef6 /* loc. of PLT for tlsdesc resolver */ #define DT_TLSDESC_GOT 0x6ffffef7 /* loc. of GOT for tlsdesc resolver */ #define DT_GNU_CONFLICT 0x6ffffef8 /* address of conflict section */ #define DT_GNU_LIBLIST 0x6ffffef9 /* address of library list */ #define DT_CONFIG 0x6ffffefa /* configuration information */ #define DT_DEPAUDIT 0x6ffffefb /* dependency auditing */ #define DT_AUDIT 0x6ffffefc /* object auditing */ #define DT_PLTPAD 0x6ffffefd /* pltpadding (sparcv9) */ #define DT_MOVETAB 0x6ffffefe /* move table */ #define DT_SYMINFO 0x6ffffeff /* syminfo table */ #define DT_ADDRRNGHI 0x6ffffeff #define DT_VERSYM 0x6ffffff0 /* Address of versym section. */ #define DT_RELACOUNT 0x6ffffff9 /* number of RELATIVE relocations */ #define DT_RELCOUNT 0x6ffffffa /* number of RELATIVE relocations */ #define DT_FLAGS_1 0x6ffffffb /* state flags - see DF_1_* defs */ #define DT_VERDEF 0x6ffffffc /* Address of verdef section. */ #define DT_VERDEFNUM 0x6ffffffd /* Number of elems in verdef section */ #define DT_VERNEED 0x6ffffffe /* Address of verneed section. */ #define DT_VERNEEDNUM 0x6fffffff /* Number of elems in verneed section */ #define DT_LOPROC 0x70000000 /* First processor-specific type. 
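The interpretation rules spelled out in the DT_ENCODING, DT_VALRNG* and DT_ADDRRNG* comments above can be folded into a single helper. A sketch under the assumption that tags below DT_ENCODING keep their fixed per-tag definitions and are handled elsewhere:

	#include <elf.h>		/* DT_* values as defined above */
	#include <stdbool.h>
	#include <stdint.h>

	static bool
	dyn_tag_uses_d_ptr(uint64_t tag)
	{
		if (tag >= DT_VALRNGLO && tag <= DT_VALRNGHI)
			return (false);		/* range reserved for d_val */
		if (tag >= DT_ADDRRNGLO && tag <= DT_ADDRRNGHI)
			return (true);		/* range reserved for d_ptr */
		if (tag >= DT_ENCODING && tag < DT_LOOS)
			return ((tag & 1) == 0);	/* even == d_ptr, odd == d_val */
		return (false);			/* fixed per-tag rules apply below DT_ENCODING */
	}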
*/ #define DT_ARM_SYMTABSZ 0x70000001 #define DT_ARM_PREEMPTMAP 0x70000002 #define DT_SPARC_REGISTER 0x70000001 #define DT_DEPRECATED_SPARC_REGISTER 0x7000001 #define DT_MIPS_RLD_VERSION 0x70000001 #define DT_MIPS_TIME_STAMP 0x70000002 #define DT_MIPS_ICHECKSUM 0x70000003 #define DT_MIPS_IVERSION 0x70000004 #define DT_MIPS_FLAGS 0x70000005 #define DT_MIPS_BASE_ADDRESS 0x70000006 #define DT_MIPS_CONFLICT 0x70000008 #define DT_MIPS_LIBLIST 0x70000009 #define DT_MIPS_LOCAL_GOTNO 0x7000000a #define DT_MIPS_CONFLICTNO 0x7000000b #define DT_MIPS_LIBLISTNO 0x70000010 #define DT_MIPS_SYMTABNO 0x70000011 #define DT_MIPS_UNREFEXTNO 0x70000012 #define DT_MIPS_GOTSYM 0x70000013 #define DT_MIPS_HIPAGENO 0x70000014 #define DT_MIPS_RLD_MAP 0x70000016 #define DT_MIPS_DELTA_CLASS 0x70000017 #define DT_MIPS_DELTA_CLASS_NO 0x70000018 #define DT_MIPS_DELTA_INSTANCE 0x70000019 #define DT_MIPS_DELTA_INSTANCE_NO 0x7000001A #define DT_MIPS_DELTA_RELOC 0x7000001B #define DT_MIPS_DELTA_RELOC_NO 0x7000001C #define DT_MIPS_DELTA_SYM 0x7000001D #define DT_MIPS_DELTA_SYM_NO 0x7000001E #define DT_MIPS_DELTA_CLASSSYM 0x70000020 #define DT_MIPS_DELTA_CLASSSYM_NO 0x70000021 #define DT_MIPS_CXX_FLAGS 0x70000022 #define DT_MIPS_PIXIE_INIT 0x70000023 #define DT_MIPS_SYMBOL_LIB 0x70000024 #define DT_MIPS_LOCALPAGE_GOTIDX 0x70000025 #define DT_MIPS_LOCAL_GOTIDX 0x70000026 #define DT_MIPS_HIDDEN_GOTIDX 0x70000027 #define DT_MIPS_PROTECTED_GOTIDX 0x70000028 #define DT_MIPS_OPTIONS 0x70000029 #define DT_MIPS_INTERFACE 0x7000002A #define DT_MIPS_DYNSTR_ALIGN 0x7000002B #define DT_MIPS_INTERFACE_SIZE 0x7000002C #define DT_MIPS_RLD_TEXT_RESOLVE_ADDR 0x7000002D #define DT_MIPS_PERF_SUFFIX 0x7000002E #define DT_MIPS_COMPACT_SIZE 0x7000002F #define DT_MIPS_GP_VALUE 0x70000030 #define DT_MIPS_AUX_DYNAMIC 0x70000031 #define DT_MIPS_PLTGOT 0x70000032 #define DT_MIPS_RLD_OBJ_UPDATE 0x70000033 #define DT_MIPS_RWPLT 0x70000034 #define DT_MIPS_RLD_MAP_REL 0x70000035 #define DT_PPC_GOT 0x70000000 #define DT_PPC_TLSOPT 0x70000001 #define DT_PPC64_GLINK 0x70000000 #define DT_PPC64_OPD 0x70000001 #define DT_PPC64_OPDSZ 0x70000002 #define DT_PPC64_TLSOPT 0x70000003 #define DT_AUXILIARY 0x7ffffffd /* shared library auxiliary name */ #define DT_USED 0x7ffffffe /* ignored - same as needed */ #define DT_FILTER 0x7fffffff /* shared library filter name */ #define DT_HIPROC 0x7fffffff /* Last processor-specific type. */ /* Values for DT_FLAGS */ #define DF_ORIGIN 0x0001 /* Indicates that the object being loaded may make reference to the $ORIGIN substitution string */ #define DF_SYMBOLIC 0x0002 /* Indicates "symbolic" linking. */ #define DF_TEXTREL 0x0004 /* Indicates there may be relocations in non-writable segments. */ #define DF_BIND_NOW 0x0008 /* Indicates that the dynamic linker should process all relocations for the object containing this entry before transferring control to the program. */ #define DF_STATIC_TLS 0x0010 /* Indicates that the shared object or executable contains code using a static thread-local storage scheme. 
*/ /* Values for DT_FLAGS_1 */ #define DF_1_BIND_NOW 0x00000001 /* Same as DF_BIND_NOW */ #define DF_1_GLOBAL 0x00000002 /* Set the RTLD_GLOBAL for object */ #define DF_1_NODELETE 0x00000008 /* Set the RTLD_NODELETE for object */ #define DF_1_LOADFLTR 0x00000010 /* Immediate loading of filtees */ #define DF_1_NOOPEN 0x00000040 /* Do not allow loading on dlopen() */ #define DF_1_ORIGIN 0x00000080 /* Process $ORIGIN */ #define DF_1_INTERPOSE 0x00000400 /* Interpose all objects but main */ #define DF_1_NODEFLIB 0x00000800 /* Do not search default paths */ /* Values for l_flags. */ #define LL_NONE 0x0 /* no flags */ #define LL_EXACT_MATCH 0x1 /* require an exact match */ #define LL_IGNORE_INT_VER 0x2 /* ignore version incompatibilities */ #define LL_REQUIRE_MINOR 0x4 #define LL_EXPORTS 0x8 #define LL_DELAY_LOAD 0x10 #define LL_DELTA 0x20 +/* Note section names */ +#define ELF_NOTE_FREEBSD "FreeBSD" +#define ELF_NOTE_NETBSD "NetBSD" +#define ELF_NOTE_SOLARIS "SUNW Solaris" +#define ELF_NOTE_GNU "GNU" + /* Values for n_type used in executables. */ #define NT_FREEBSD_ABI_TAG 1 #define NT_FREEBSD_NOINIT_TAG 2 #define NT_FREEBSD_ARCH_TAG 3 #define NT_FREEBSD_FEATURE_CTL 4 /* NT_FREEBSD_FEATURE_CTL desc[0] bits */ #define NT_FREEBSD_FCTL_ASLR_DISABLE 0x00000001 #define NT_FREEBSD_FCTL_PROTMAX_DISABLE 0x00000002 /* Values for n_type. Used in core files. */ #define NT_PRSTATUS 1 /* Process status. */ #define NT_FPREGSET 2 /* Floating point registers. */ #define NT_PRPSINFO 3 /* Process state info. */ #define NT_THRMISC 7 /* Thread miscellaneous info. */ #define NT_PROCSTAT_PROC 8 /* Procstat proc data. */ #define NT_PROCSTAT_FILES 9 /* Procstat files data. */ #define NT_PROCSTAT_VMMAP 10 /* Procstat vmmap data. */ #define NT_PROCSTAT_GROUPS 11 /* Procstat groups data. */ #define NT_PROCSTAT_UMASK 12 /* Procstat umask data. */ #define NT_PROCSTAT_RLIMIT 13 /* Procstat rlimit data. */ #define NT_PROCSTAT_OSREL 14 /* Procstat osreldate data. */ #define NT_PROCSTAT_PSSTRINGS 15 /* Procstat ps_strings data. */ #define NT_PROCSTAT_AUXV 16 /* Procstat auxv data. */ #define NT_PTLWPINFO 17 /* Thread ptrace miscellaneous info. */ #define NT_PPC_VMX 0x100 /* PowerPC Altivec/VMX registers */ #define NT_PPC_VSX 0x102 /* PowerPC VSX registers */ #define NT_X86_XSTATE 0x202 /* x86 XSAVE extended state. */ #define NT_ARM_VFP 0x400 /* ARM VFP registers */ /* GNU note types. */ #define NT_GNU_ABI_TAG 1 #define NT_GNU_HWCAP 2 #define NT_GNU_BUILD_ID 3 #define NT_GNU_GOLD_VERSION 4 #define NT_GNU_PROPERTY_TYPE_0 5 #define GNU_PROPERTY_LOPROC 0xc0000000 #define GNU_PROPERTY_HIPROC 0xdfffffff #define GNU_PROPERTY_X86_FEATURE_1_AND 0xc0000002 #define GNU_PROPERTY_X86_FEATURE_1_IBT 0x00000001 #define GNU_PROPERTY_X86_FEATURE_1_SHSTK 0x00000002 /* Symbol Binding - ELFNN_ST_BIND - st_info */ #define STB_LOCAL 0 /* Local symbol */ #define STB_GLOBAL 1 /* Global symbol */ #define STB_WEAK 2 /* like global - lower precedence */ #define STB_LOOS 10 /* Start of operating system reserved range. */ #define STB_GNU_UNIQUE 10 /* Unique symbol (GNU) */ #define STB_HIOS 12 /* End of operating system reserved range. */ #define STB_LOPROC 13 /* reserved range for processor */ #define STB_HIPROC 15 /* specific semantics. */ /* Symbol type - ELFNN_ST_TYPE - st_info */ #define STT_NOTYPE 0 /* Unspecified type. */ #define STT_OBJECT 1 /* Data object. */ #define STT_FUNC 2 /* Function. */ #define STT_SECTION 3 /* Section. */ #define STT_FILE 4 /* Source file. */ #define STT_COMMON 5 /* Uninitialized common block. 
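The newly added ELF_NOTE_FREEBSD name pairs with NT_FREEBSD_ABI_TAG above when a branded object emits its ABI-tag note. A hedged sketch in the style of the csu startup code; the ".note.tag" section name and the use of __FreeBSD_version are conventions assumed for this example, not definitions from this header:

	#include <sys/param.h>		/* __FreeBSD_version */
	#include <elf.h>		/* ELF_NOTE_FREEBSD, NT_FREEBSD_ABI_TAG */
	#include <stdint.h>

	static const struct {
		uint32_t namesz;	/* sizeof("FreeBSD"), NUL included */
		uint32_t descsz;	/* one 32-bit word of descriptor */
		uint32_t type;
		char	 name[sizeof(ELF_NOTE_FREEBSD)];	/* 8 bytes, already word-aligned */
		uint32_t desc;		/* __FreeBSD_version of the build */
	} abitag __attribute__((section(".note.tag"), aligned(4), used)) = {
		sizeof(ELF_NOTE_FREEBSD),
		sizeof(uint32_t),
		NT_FREEBSD_ABI_TAG,
		ELF_NOTE_FREEBSD,
		__FreeBSD_version
	};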
*/ #define STT_TLS 6 /* TLS object. */ #define STT_NUM 7 #define STT_LOOS 10 /* Reserved range for operating system */ #define STT_GNU_IFUNC 10 #define STT_HIOS 12 /* specific semantics. */ #define STT_LOPROC 13 /* Start of processor reserved range. */ #define STT_SPARC_REGISTER 13 /* SPARC register information. */ #define STT_HIPROC 15 /* End of processor reserved range. */ /* Symbol visibility - ELFNN_ST_VISIBILITY - st_other */ #define STV_DEFAULT 0x0 /* Default visibility (see binding). */ #define STV_INTERNAL 0x1 /* Special meaning in relocatable objects. */ #define STV_HIDDEN 0x2 /* Not visible. */ #define STV_PROTECTED 0x3 /* Visible but not preemptible. */ #define STV_EXPORTED 0x4 #define STV_SINGLETON 0x5 #define STV_ELIMINATE 0x6 /* Special symbol table indexes. */ #define STN_UNDEF 0 /* Undefined symbol index. */ /* Symbol versioning flags. */ #define VER_DEF_CURRENT 1 #define VER_DEF_IDX(x) VER_NDX(x) #define VER_FLG_BASE 0x01 #define VER_FLG_WEAK 0x02 #define VER_NEED_CURRENT 1 #define VER_NEED_WEAK (1u << 15) #define VER_NEED_HIDDEN VER_NDX_HIDDEN #define VER_NEED_IDX(x) VER_NDX(x) #define VER_NDX_LOCAL 0 #define VER_NDX_GLOBAL 1 #define VER_NDX_GIVEN 2 #define VER_NDX_HIDDEN (1u << 15) #define VER_NDX(x) ((x) & ~(1u << 15)) #define CA_SUNW_NULL 0 #define CA_SUNW_HW_1 1 /* first hardware capabilities entry */ #define CA_SUNW_SF_1 2 /* first software capabilities entry */ /* * Syminfo flag values */ #define SYMINFO_FLG_DIRECT 0x0001 /* symbol ref has direct association */ /* to object containing defn. */ #define SYMINFO_FLG_PASSTHRU 0x0002 /* ignored - see SYMINFO_FLG_FILTER */ #define SYMINFO_FLG_COPY 0x0004 /* symbol is a copy-reloc */ #define SYMINFO_FLG_LAZYLOAD 0x0008 /* object containing defn should be */ /* lazily-loaded */ #define SYMINFO_FLG_DIRECTBIND 0x0010 /* ref should be bound directly to */ /* object containing defn. */ #define SYMINFO_FLG_NOEXTDIRECT 0x0020 /* don't let an external reference */ /* directly bind to this symbol */ #define SYMINFO_FLG_FILTER 0x0002 /* symbol ref is associated to a */ #define SYMINFO_FLG_AUXILIARY 0x0040 /* standard or auxiliary filter */ /* * Syminfo.si_boundto values. */ #define SYMINFO_BT_SELF 0xffff /* symbol bound to self */ #define SYMINFO_BT_PARENT 0xfffe /* symbol bound to parent */ #define SYMINFO_BT_NONE 0xfffd /* no special symbol binding */ #define SYMINFO_BT_EXTERN 0xfffc /* symbol defined as external */ #define SYMINFO_BT_LOWRESERVE 0xff00 /* beginning of reserved entries */ /* * Syminfo version values. */ #define SYMINFO_NONE 0 /* Syminfo version */ #define SYMINFO_CURRENT 1 #define SYMINFO_NUM 2 /* Values for ch_type (compressed section headers). */ #define ELFCOMPRESS_ZLIB 1 /* ZLIB/DEFLATE */ #define ELFCOMPRESS_LOOS 0x60000000 /* OS-specific */ #define ELFCOMPRESS_HIOS 0x6fffffff #define ELFCOMPRESS_LOPROC 0x70000000 /* Processor-specific */ #define ELFCOMPRESS_HIPROC 0x7fffffff /* Values for a_type. */ #define AT_NULL 0 /* Terminates the vector. */ #define AT_IGNORE 1 /* Ignored entry. */ #define AT_EXECFD 2 /* File descriptor of program to load. */ #define AT_PHDR 3 /* Program header of program already loaded. */ #define AT_PHENT 4 /* Size of each program header entry. */ #define AT_PHNUM 5 /* Number of program header entries. */ #define AT_PAGESZ 6 /* Page size in bytes. */ #define AT_BASE 7 /* Interpreter's base address. */ #define AT_FLAGS 8 /* Flags. */ #define AT_ENTRY 9 /* Where interpreter should transfer control. */ #define AT_NOTELF 10 /* Program is not ELF ?? */ #define AT_UID 11 /* Real uid. 
*/ #define AT_EUID 12 /* Effective uid. */ #ifndef __powerpc__ #define AT_GID 13 /* Real gid. */ #define AT_EGID 14 /* Effective gid. */ #define AT_EXECPATH 15 /* Path to the executable. */ #define AT_CANARY 16 /* Canary for SSP. */ #define AT_CANARYLEN 17 /* Length of the canary. */ #define AT_OSRELDATE 18 /* OSRELDATE. */ #define AT_NCPUS 19 /* Number of CPUs. */ #define AT_PAGESIZES 20 /* Pagesizes. */ #define AT_PAGESIZESLEN 21 /* Number of pagesizes. */ #else /* defined(__powerpc__) */ #define AT_EXECPATH 13 #define AT_CANARY 14 #define AT_CANARYLEN 15 #define AT_OSRELDATE 16 #define AT_NCPUS 17 #define AT_PAGESIZES 18 #define AT_PAGESIZESLEN 19 #define AT_STACKPROT 21 #endif /* defined(__powerpc__) */ #define AT_TIMEKEEP 22 /* Pointer to timehands. */ #ifndef __powerpc__ #define AT_STACKPROT 23 /* Initial stack protection. */ #endif #define AT_EHDRFLAGS 24 /* e_flags field from elf hdr */ #define AT_HWCAP 25 /* CPU feature flags. */ #define AT_HWCAP2 26 /* CPU feature flags 2. */ #define AT_COUNT 27 /* Count of defined aux entry types. */ /* * Relocation types. * * All machine architectures are defined here to allow tools on one to * handle others. */ #define R_386_NONE 0 /* No relocation. */ #define R_386_32 1 /* Add symbol value. */ #define R_386_PC32 2 /* Add PC-relative symbol value. */ #define R_386_GOT32 3 /* Add PC-relative GOT offset. */ #define R_386_PLT32 4 /* Add PC-relative PLT offset. */ #define R_386_COPY 5 /* Copy data from shared object. */ #define R_386_GLOB_DAT 6 /* Set GOT entry to data address. */ #define R_386_JMP_SLOT 7 /* Set GOT entry to code address. */ #define R_386_RELATIVE 8 /* Add load address of shared object. */ #define R_386_GOTOFF 9 /* Add GOT-relative symbol address. */ #define R_386_GOTPC 10 /* Add PC-relative GOT table address. 
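The AT_* entries above index the ELF auxiliary vector handed to a new process. On FreeBSD they are normally queried through elf_aux_info(3) rather than by walking the vector directly; a small usage sketch (assuming both entries are present, as they are on current kernels):

	#include <sys/auxv.h>
	#include <elf.h>
	#include <stdio.h>

	int
	main(void)
	{
		int pagesz, ncpus;

		if (elf_aux_info(AT_PAGESZ, &pagesz, sizeof(pagesz)) == 0)
			printf("AT_PAGESZ: %d\n", pagesz);
		if (elf_aux_info(AT_NCPUS, &ncpus, sizeof(ncpus)) == 0)
			printf("AT_NCPUS:  %d\n", ncpus);
		return (0);
	}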
*/ #define R_386_TLS_TPOFF 14 /* Negative offset in static TLS block */ #define R_386_TLS_IE 15 /* Absolute address of GOT for -ve static TLS */ #define R_386_TLS_GOTIE 16 /* GOT entry for negative static TLS block */ #define R_386_TLS_LE 17 /* Negative offset relative to static TLS */ #define R_386_TLS_GD 18 /* 32 bit offset to GOT (index,off) pair */ #define R_386_TLS_LDM 19 /* 32 bit offset to GOT (index,zero) pair */ #define R_386_TLS_GD_32 24 /* 32 bit offset to GOT (index,off) pair */ #define R_386_TLS_GD_PUSH 25 /* pushl instruction for Sun ABI GD sequence */ #define R_386_TLS_GD_CALL 26 /* call instruction for Sun ABI GD sequence */ #define R_386_TLS_GD_POP 27 /* popl instruction for Sun ABI GD sequence */ #define R_386_TLS_LDM_32 28 /* 32 bit offset to GOT (index,zero) pair */ #define R_386_TLS_LDM_PUSH 29 /* pushl instruction for Sun ABI LD sequence */ #define R_386_TLS_LDM_CALL 30 /* call instruction for Sun ABI LD sequence */ #define R_386_TLS_LDM_POP 31 /* popl instruction for Sun ABI LD sequence */ #define R_386_TLS_LDO_32 32 /* 32 bit offset from start of TLS block */ #define R_386_TLS_IE_32 33 /* 32 bit offset to GOT static TLS offset entry */ #define R_386_TLS_LE_32 34 /* 32 bit offset within static TLS block */ #define R_386_TLS_DTPMOD32 35 /* GOT entry containing TLS index */ #define R_386_TLS_DTPOFF32 36 /* GOT entry containing TLS offset */ #define R_386_TLS_TPOFF32 37 /* GOT entry of -ve static TLS offset */ #define R_386_IRELATIVE 42 /* PLT entry resolved indirectly at runtime */ #define R_AARCH64_NONE 0 /* No relocation */ #define R_AARCH64_ABS64 257 /* Absolute offset */ #define R_AARCH64_ABS32 258 /* Absolute, 32-bit overflow check */ #define R_AARCH64_ABS16 259 /* Absolute, 16-bit overflow check */ #define R_AARCH64_PREL64 260 /* PC relative */ #define R_AARCH64_PREL32 261 /* PC relative, 32-bit overflow check */ #define R_AARCH64_PREL16 262 /* PC relative, 16-bit overflow check */ #define R_AARCH64_COPY 1024 /* Copy data from shared object */ #define R_AARCH64_GLOB_DAT 1025 /* Set GOT entry to data address */ #define R_AARCH64_JUMP_SLOT 1026 /* Set GOT entry to code address */ #define R_AARCH64_RELATIVE 1027 /* Add load address of shared object */ #define R_AARCH64_TLS_DTPREL64 1028 #define R_AARCH64_TLS_DTPMOD64 1029 #define R_AARCH64_TLS_TPREL64 1030 #define R_AARCH64_TLSDESC 1031 /* Identify the TLS descriptor */ #define R_AARCH64_IRELATIVE 1032 #define R_ARM_NONE 0 /* No relocation. */ #define R_ARM_PC24 1 #define R_ARM_ABS32 2 #define R_ARM_REL32 3 #define R_ARM_PC13 4 #define R_ARM_ABS16 5 #define R_ARM_ABS12 6 #define R_ARM_THM_ABS5 7 #define R_ARM_ABS8 8 #define R_ARM_SBREL32 9 #define R_ARM_THM_PC22 10 #define R_ARM_THM_PC8 11 #define R_ARM_AMP_VCALL9 12 #define R_ARM_SWI24 13 #define R_ARM_THM_SWI8 14 #define R_ARM_XPC25 15 #define R_ARM_THM_XPC22 16 /* TLS relocations */ #define R_ARM_TLS_DTPMOD32 17 /* ID of module containing symbol */ #define R_ARM_TLS_DTPOFF32 18 /* Offset in TLS block */ #define R_ARM_TLS_TPOFF32 19 /* Offset in static TLS block */ #define R_ARM_COPY 20 /* Copy data from shared object. */ #define R_ARM_GLOB_DAT 21 /* Set GOT entry to data address. */ #define R_ARM_JUMP_SLOT 22 /* Set GOT entry to code address. */ #define R_ARM_RELATIVE 23 /* Add load address of shared object. */ #define R_ARM_GOTOFF 24 /* Add GOT-relative symbol address. */ #define R_ARM_GOTPC 25 /* Add PC-relative GOT table address. */ #define R_ARM_GOT32 26 /* Add PC-relative GOT offset. */ #define R_ARM_PLT32 27 /* Add PC-relative PLT offset. 
*/ #define R_ARM_GNU_VTENTRY 100 #define R_ARM_GNU_VTINHERIT 101 #define R_ARM_RSBREL32 250 #define R_ARM_THM_RPC22 251 #define R_ARM_RREL32 252 #define R_ARM_RABS32 253 #define R_ARM_RPC24 254 #define R_ARM_RBASE 255 /* Name Value Field Calculation */ #define R_IA_64_NONE 0 /* None */ #define R_IA_64_IMM14 0x21 /* immediate14 S + A */ #define R_IA_64_IMM22 0x22 /* immediate22 S + A */ #define R_IA_64_IMM64 0x23 /* immediate64 S + A */ #define R_IA_64_DIR32MSB 0x24 /* word32 MSB S + A */ #define R_IA_64_DIR32LSB 0x25 /* word32 LSB S + A */ #define R_IA_64_DIR64MSB 0x26 /* word64 MSB S + A */ #define R_IA_64_DIR64LSB 0x27 /* word64 LSB S + A */ #define R_IA_64_GPREL22 0x2a /* immediate22 @gprel(S + A) */ #define R_IA_64_GPREL64I 0x2b /* immediate64 @gprel(S + A) */ #define R_IA_64_GPREL32MSB 0x2c /* word32 MSB @gprel(S + A) */ #define R_IA_64_GPREL32LSB 0x2d /* word32 LSB @gprel(S + A) */ #define R_IA_64_GPREL64MSB 0x2e /* word64 MSB @gprel(S + A) */ #define R_IA_64_GPREL64LSB 0x2f /* word64 LSB @gprel(S + A) */ #define R_IA_64_LTOFF22 0x32 /* immediate22 @ltoff(S + A) */ #define R_IA_64_LTOFF64I 0x33 /* immediate64 @ltoff(S + A) */ #define R_IA_64_PLTOFF22 0x3a /* immediate22 @pltoff(S + A) */ #define R_IA_64_PLTOFF64I 0x3b /* immediate64 @pltoff(S + A) */ #define R_IA_64_PLTOFF64MSB 0x3e /* word64 MSB @pltoff(S + A) */ #define R_IA_64_PLTOFF64LSB 0x3f /* word64 LSB @pltoff(S + A) */ #define R_IA_64_FPTR64I 0x43 /* immediate64 @fptr(S + A) */ #define R_IA_64_FPTR32MSB 0x44 /* word32 MSB @fptr(S + A) */ #define R_IA_64_FPTR32LSB 0x45 /* word32 LSB @fptr(S + A) */ #define R_IA_64_FPTR64MSB 0x46 /* word64 MSB @fptr(S + A) */ #define R_IA_64_FPTR64LSB 0x47 /* word64 LSB @fptr(S + A) */ #define R_IA_64_PCREL60B 0x48 /* immediate60 form1 S + A - P */ #define R_IA_64_PCREL21B 0x49 /* immediate21 form1 S + A - P */ #define R_IA_64_PCREL21M 0x4a /* immediate21 form2 S + A - P */ #define R_IA_64_PCREL21F 0x4b /* immediate21 form3 S + A - P */ #define R_IA_64_PCREL32MSB 0x4c /* word32 MSB S + A - P */ #define R_IA_64_PCREL32LSB 0x4d /* word32 LSB S + A - P */ #define R_IA_64_PCREL64MSB 0x4e /* word64 MSB S + A - P */ #define R_IA_64_PCREL64LSB 0x4f /* word64 LSB S + A - P */ #define R_IA_64_LTOFF_FPTR22 0x52 /* immediate22 @ltoff(@fptr(S + A)) */ #define R_IA_64_LTOFF_FPTR64I 0x53 /* immediate64 @ltoff(@fptr(S + A)) */ #define R_IA_64_LTOFF_FPTR32MSB 0x54 /* word32 MSB @ltoff(@fptr(S + A)) */ #define R_IA_64_LTOFF_FPTR32LSB 0x55 /* word32 LSB @ltoff(@fptr(S + A)) */ #define R_IA_64_LTOFF_FPTR64MSB 0x56 /* word64 MSB @ltoff(@fptr(S + A)) */ #define R_IA_64_LTOFF_FPTR64LSB 0x57 /* word64 LSB @ltoff(@fptr(S + A)) */ #define R_IA_64_SEGREL32MSB 0x5c /* word32 MSB @segrel(S + A) */ #define R_IA_64_SEGREL32LSB 0x5d /* word32 LSB @segrel(S + A) */ #define R_IA_64_SEGREL64MSB 0x5e /* word64 MSB @segrel(S + A) */ #define R_IA_64_SEGREL64LSB 0x5f /* word64 LSB @segrel(S + A) */ #define R_IA_64_SECREL32MSB 0x64 /* word32 MSB @secrel(S + A) */ #define R_IA_64_SECREL32LSB 0x65 /* word32 LSB @secrel(S + A) */ #define R_IA_64_SECREL64MSB 0x66 /* word64 MSB @secrel(S + A) */ #define R_IA_64_SECREL64LSB 0x67 /* word64 LSB @secrel(S + A) */ #define R_IA_64_REL32MSB 0x6c /* word32 MSB BD + A */ #define R_IA_64_REL32LSB 0x6d /* word32 LSB BD + A */ #define R_IA_64_REL64MSB 0x6e /* word64 MSB BD + A */ #define R_IA_64_REL64LSB 0x6f /* word64 LSB BD + A */ #define R_IA_64_LTV32MSB 0x74 /* word32 MSB S + A */ #define R_IA_64_LTV32LSB 0x75 /* word32 LSB S + A */ #define R_IA_64_LTV64MSB 0x76 /* word64 MSB S + A */ #define 
R_IA_64_LTV64LSB 0x77 /* word64 LSB S + A */ #define R_IA_64_PCREL21BI 0x79 /* immediate21 form1 S + A - P */ #define R_IA_64_PCREL22 0x7a /* immediate22 S + A - P */ #define R_IA_64_PCREL64I 0x7b /* immediate64 S + A - P */ #define R_IA_64_IPLTMSB 0x80 /* function descriptor MSB special */ #define R_IA_64_IPLTLSB 0x81 /* function descriptor LSB speciaal */ #define R_IA_64_SUB 0x85 /* immediate64 A - S */ #define R_IA_64_LTOFF22X 0x86 /* immediate22 special */ #define R_IA_64_LDXMOV 0x87 /* immediate22 special */ #define R_IA_64_TPREL14 0x91 /* imm14 @tprel(S + A) */ #define R_IA_64_TPREL22 0x92 /* imm22 @tprel(S + A) */ #define R_IA_64_TPREL64I 0x93 /* imm64 @tprel(S + A) */ #define R_IA_64_TPREL64MSB 0x96 /* word64 MSB @tprel(S + A) */ #define R_IA_64_TPREL64LSB 0x97 /* word64 LSB @tprel(S + A) */ #define R_IA_64_LTOFF_TPREL22 0x9a /* imm22 @ltoff(@tprel(S+A)) */ #define R_IA_64_DTPMOD64MSB 0xa6 /* word64 MSB @dtpmod(S + A) */ #define R_IA_64_DTPMOD64LSB 0xa7 /* word64 LSB @dtpmod(S + A) */ #define R_IA_64_LTOFF_DTPMOD22 0xaa /* imm22 @ltoff(@dtpmod(S+A)) */ #define R_IA_64_DTPREL14 0xb1 /* imm14 @dtprel(S + A) */ #define R_IA_64_DTPREL22 0xb2 /* imm22 @dtprel(S + A) */ #define R_IA_64_DTPREL64I 0xb3 /* imm64 @dtprel(S + A) */ #define R_IA_64_DTPREL32MSB 0xb4 /* word32 MSB @dtprel(S + A) */ #define R_IA_64_DTPREL32LSB 0xb5 /* word32 LSB @dtprel(S + A) */ #define R_IA_64_DTPREL64MSB 0xb6 /* word64 MSB @dtprel(S + A) */ #define R_IA_64_DTPREL64LSB 0xb7 /* word64 LSB @dtprel(S + A) */ #define R_IA_64_LTOFF_DTPREL22 0xba /* imm22 @ltoff(@dtprel(S+A)) */ #define R_MIPS_NONE 0 /* No reloc */ #define R_MIPS_16 1 /* Direct 16 bit */ #define R_MIPS_32 2 /* Direct 32 bit */ #define R_MIPS_REL32 3 /* PC relative 32 bit */ #define R_MIPS_26 4 /* Direct 26 bit shifted */ #define R_MIPS_HI16 5 /* High 16 bit */ #define R_MIPS_LO16 6 /* Low 16 bit */ #define R_MIPS_GPREL16 7 /* GP relative 16 bit */ #define R_MIPS_LITERAL 8 /* 16 bit literal entry */ #define R_MIPS_GOT16 9 /* 16 bit GOT entry */ #define R_MIPS_PC16 10 /* PC relative 16 bit */ #define R_MIPS_CALL16 11 /* 16 bit GOT entry for function */ #define R_MIPS_GPREL32 12 /* GP relative 32 bit */ #define R_MIPS_64 18 /* Direct 64 bit */ #define R_MIPS_GOT_DISP 19 #define R_MIPS_GOT_PAGE 20 #define R_MIPS_GOT_OFST 21 #define R_MIPS_GOT_HI16 22 /* GOT HI 16 bit */ #define R_MIPS_GOT_LO16 23 /* GOT LO 16 bit */ #define R_MIPS_SUB 24 #define R_MIPS_CALLHI16 30 /* upper 16 bit GOT entry for function */ #define R_MIPS_CALLLO16 31 /* lower 16 bit GOT entry for function */ #define R_MIPS_JALR 37 #define R_MIPS_TLS_GD 42 #define R_MIPS_COPY 126 #define R_MIPS_JUMP_SLOT 127 #define R_PPC_NONE 0 /* No relocation. 
*/ #define R_PPC_ADDR32 1 #define R_PPC_ADDR24 2 #define R_PPC_ADDR16 3 #define R_PPC_ADDR16_LO 4 #define R_PPC_ADDR16_HI 5 #define R_PPC_ADDR16_HA 6 #define R_PPC_ADDR14 7 #define R_PPC_ADDR14_BRTAKEN 8 #define R_PPC_ADDR14_BRNTAKEN 9 #define R_PPC_REL24 10 #define R_PPC_REL14 11 #define R_PPC_REL14_BRTAKEN 12 #define R_PPC_REL14_BRNTAKEN 13 #define R_PPC_GOT16 14 #define R_PPC_GOT16_LO 15 #define R_PPC_GOT16_HI 16 #define R_PPC_GOT16_HA 17 #define R_PPC_PLTREL24 18 #define R_PPC_COPY 19 #define R_PPC_GLOB_DAT 20 #define R_PPC_JMP_SLOT 21 #define R_PPC_RELATIVE 22 #define R_PPC_LOCAL24PC 23 #define R_PPC_UADDR32 24 #define R_PPC_UADDR16 25 #define R_PPC_REL32 26 #define R_PPC_PLT32 27 #define R_PPC_PLTREL32 28 #define R_PPC_PLT16_LO 29 #define R_PPC_PLT16_HI 30 #define R_PPC_PLT16_HA 31 #define R_PPC_SDAREL16 32 #define R_PPC_SECTOFF 33 #define R_PPC_SECTOFF_LO 34 #define R_PPC_SECTOFF_HI 35 #define R_PPC_SECTOFF_HA 36 #define R_PPC_IRELATIVE 248 /* * 64-bit relocations */ #define R_PPC64_ADDR64 38 #define R_PPC64_ADDR16_HIGHER 39 #define R_PPC64_ADDR16_HIGHERA 40 #define R_PPC64_ADDR16_HIGHEST 41 #define R_PPC64_ADDR16_HIGHESTA 42 #define R_PPC64_UADDR64 43 #define R_PPC64_REL64 44 #define R_PPC64_PLT64 45 #define R_PPC64_PLTREL64 46 #define R_PPC64_TOC16 47 #define R_PPC64_TOC16_LO 48 #define R_PPC64_TOC16_HI 49 #define R_PPC64_TOC16_HA 50 #define R_PPC64_TOC 51 #define R_PPC64_DTPMOD64 68 #define R_PPC64_TPREL64 73 #define R_PPC64_DTPREL64 78 /* * TLS relocations */ #define R_PPC_TLS 67 #define R_PPC_DTPMOD32 68 #define R_PPC_TPREL16 69 #define R_PPC_TPREL16_LO 70 #define R_PPC_TPREL16_HI 71 #define R_PPC_TPREL16_HA 72 #define R_PPC_TPREL32 73 #define R_PPC_DTPREL16 74 #define R_PPC_DTPREL16_LO 75 #define R_PPC_DTPREL16_HI 76 #define R_PPC_DTPREL16_HA 77 #define R_PPC_DTPREL32 78 #define R_PPC_GOT_TLSGD16 79 #define R_PPC_GOT_TLSGD16_LO 80 #define R_PPC_GOT_TLSGD16_HI 81 #define R_PPC_GOT_TLSGD16_HA 82 #define R_PPC_GOT_TLSLD16 83 #define R_PPC_GOT_TLSLD16_LO 84 #define R_PPC_GOT_TLSLD16_HI 85 #define R_PPC_GOT_TLSLD16_HA 86 #define R_PPC_GOT_TPREL16 87 #define R_PPC_GOT_TPREL16_LO 88 #define R_PPC_GOT_TPREL16_HI 89 #define R_PPC_GOT_TPREL16_HA 90 /* * The remaining relocs are from the Embedded ELF ABI, and are not in the * SVR4 ELF ABI. */ #define R_PPC_EMB_NADDR32 101 #define R_PPC_EMB_NADDR16 102 #define R_PPC_EMB_NADDR16_LO 103 #define R_PPC_EMB_NADDR16_HI 104 #define R_PPC_EMB_NADDR16_HA 105 #define R_PPC_EMB_SDAI16 106 #define R_PPC_EMB_SDA2I16 107 #define R_PPC_EMB_SDA2REL 108 #define R_PPC_EMB_SDA21 109 #define R_PPC_EMB_MRKREF 110 #define R_PPC_EMB_RELSEC16 111 #define R_PPC_EMB_RELST_LO 112 #define R_PPC_EMB_RELST_HI 113 #define R_PPC_EMB_RELST_HA 114 #define R_PPC_EMB_BIT_FLD 115 #define R_PPC_EMB_RELSDA 116 /* * RISC-V relocation types. */ /* Relocation types used by the dynamic linker. */ #define R_RISCV_NONE 0 #define R_RISCV_32 1 #define R_RISCV_64 2 #define R_RISCV_RELATIVE 3 #define R_RISCV_COPY 4 #define R_RISCV_JUMP_SLOT 5 #define R_RISCV_TLS_DTPMOD32 6 #define R_RISCV_TLS_DTPMOD64 7 #define R_RISCV_TLS_DTPREL32 8 #define R_RISCV_TLS_DTPREL64 9 #define R_RISCV_TLS_TPREL32 10 #define R_RISCV_TLS_TPREL64 11 /* Relocation types not used by the dynamic linker. 
*/ #define R_RISCV_BRANCH 16 #define R_RISCV_JAL 17 #define R_RISCV_CALL 18 #define R_RISCV_CALL_PLT 19 #define R_RISCV_GOT_HI20 20 #define R_RISCV_TLS_GOT_HI20 21 #define R_RISCV_TLS_GD_HI20 22 #define R_RISCV_PCREL_HI20 23 #define R_RISCV_PCREL_LO12_I 24 #define R_RISCV_PCREL_LO12_S 25 #define R_RISCV_HI20 26 #define R_RISCV_LO12_I 27 #define R_RISCV_LO12_S 28 #define R_RISCV_TPREL_HI20 29 #define R_RISCV_TPREL_LO12_I 30 #define R_RISCV_TPREL_LO12_S 31 #define R_RISCV_TPREL_ADD 32 #define R_RISCV_ADD8 33 #define R_RISCV_ADD16 34 #define R_RISCV_ADD32 35 #define R_RISCV_ADD64 36 #define R_RISCV_SUB8 37 #define R_RISCV_SUB16 38 #define R_RISCV_SUB32 39 #define R_RISCV_SUB64 40 #define R_RISCV_GNU_VTINHERIT 41 #define R_RISCV_GNU_VTENTRY 42 #define R_RISCV_ALIGN 43 #define R_RISCV_RVC_BRANCH 44 #define R_RISCV_RVC_JUMP 45 #define R_RISCV_RVC_LUI 46 #define R_RISCV_GPREL_I 47 #define R_RISCV_GPREL_S 48 #define R_RISCV_TPREL_I 49 #define R_RISCV_TPREL_S 50 #define R_RISCV_RELAX 51 #define R_RISCV_SUB6 52 #define R_RISCV_SET6 53 #define R_RISCV_SET8 54 #define R_RISCV_SET16 55 #define R_RISCV_SET32 56 #define R_SPARC_NONE 0 #define R_SPARC_8 1 #define R_SPARC_16 2 #define R_SPARC_32 3 #define R_SPARC_DISP8 4 #define R_SPARC_DISP16 5 #define R_SPARC_DISP32 6 #define R_SPARC_WDISP30 7 #define R_SPARC_WDISP22 8 #define R_SPARC_HI22 9 #define R_SPARC_22 10 #define R_SPARC_13 11 #define R_SPARC_LO10 12 #define R_SPARC_GOT10 13 #define R_SPARC_GOT13 14 #define R_SPARC_GOT22 15 #define R_SPARC_PC10 16 #define R_SPARC_PC22 17 #define R_SPARC_WPLT30 18 #define R_SPARC_COPY 19 #define R_SPARC_GLOB_DAT 20 #define R_SPARC_JMP_SLOT 21 #define R_SPARC_RELATIVE 22 #define R_SPARC_UA32 23 #define R_SPARC_PLT32 24 #define R_SPARC_HIPLT22 25 #define R_SPARC_LOPLT10 26 #define R_SPARC_PCPLT32 27 #define R_SPARC_PCPLT22 28 #define R_SPARC_PCPLT10 29 #define R_SPARC_10 30 #define R_SPARC_11 31 #define R_SPARC_64 32 #define R_SPARC_OLO10 33 #define R_SPARC_HH22 34 #define R_SPARC_HM10 35 #define R_SPARC_LM22 36 #define R_SPARC_PC_HH22 37 #define R_SPARC_PC_HM10 38 #define R_SPARC_PC_LM22 39 #define R_SPARC_WDISP16 40 #define R_SPARC_WDISP19 41 #define R_SPARC_GLOB_JMP 42 #define R_SPARC_7 43 #define R_SPARC_5 44 #define R_SPARC_6 45 #define R_SPARC_DISP64 46 #define R_SPARC_PLT64 47 #define R_SPARC_HIX22 48 #define R_SPARC_LOX10 49 #define R_SPARC_H44 50 #define R_SPARC_M44 51 #define R_SPARC_L44 52 #define R_SPARC_REGISTER 53 #define R_SPARC_UA64 54 #define R_SPARC_UA16 55 #define R_SPARC_TLS_GD_HI22 56 #define R_SPARC_TLS_GD_LO10 57 #define R_SPARC_TLS_GD_ADD 58 #define R_SPARC_TLS_GD_CALL 59 #define R_SPARC_TLS_LDM_HI22 60 #define R_SPARC_TLS_LDM_LO10 61 #define R_SPARC_TLS_LDM_ADD 62 #define R_SPARC_TLS_LDM_CALL 63 #define R_SPARC_TLS_LDO_HIX22 64 #define R_SPARC_TLS_LDO_LOX10 65 #define R_SPARC_TLS_LDO_ADD 66 #define R_SPARC_TLS_IE_HI22 67 #define R_SPARC_TLS_IE_LO10 68 #define R_SPARC_TLS_IE_LD 69 #define R_SPARC_TLS_IE_LDX 70 #define R_SPARC_TLS_IE_ADD 71 #define R_SPARC_TLS_LE_HIX22 72 #define R_SPARC_TLS_LE_LOX10 73 #define R_SPARC_TLS_DTPMOD32 74 #define R_SPARC_TLS_DTPMOD64 75 #define R_SPARC_TLS_DTPOFF32 76 #define R_SPARC_TLS_DTPOFF64 77 #define R_SPARC_TLS_TPOFF32 78 #define R_SPARC_TLS_TPOFF64 79 #define R_X86_64_NONE 0 /* No relocation. */ #define R_X86_64_64 1 /* Add 64 bit symbol value. */ #define R_X86_64_PC32 2 /* PC-relative 32 bit signed sym value. */ #define R_X86_64_GOT32 3 /* PC-relative 32 bit GOT offset. */ #define R_X86_64_PLT32 4 /* PC-relative 32 bit PLT offset. 
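The relocation tables use the conventional notation S (symbol value), A (addend) and P (place being relocated), as spelled out in the IA-64 calculation column above. A hedged sketch of applying the two simplest x86-64 relocations defined just above; the remaining types are omitted, and memcpy is used to sidestep alignment concerns:

	#include <elf.h>
	#include <stdint.h>
	#include <string.h>

	static void
	apply_x86_64_reloc(uint32_t type, uint64_t s, int64_t a, uint64_t p, void *where)
	{
		uint64_t v64;
		int32_t v32;

		switch (type) {
		case R_X86_64_64:		/* word64: S + A */
			v64 = s + a;
			memcpy(where, &v64, sizeof(v64));
			break;
		case R_X86_64_PC32:		/* word32: S + A - P */
			v32 = (int32_t)(s + a - p);
			memcpy(where, &v32, sizeof(v32));
			break;
		default:
			break;			/* other relocation types not shown */
		}
	}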
*/ #define R_X86_64_COPY 5 /* Copy data from shared object. */ #define R_X86_64_GLOB_DAT 6 /* Set GOT entry to data address. */ #define R_X86_64_JMP_SLOT 7 /* Set GOT entry to code address. */ #define R_X86_64_RELATIVE 8 /* Add load address of shared object. */ #define R_X86_64_GOTPCREL 9 /* Add 32 bit signed pcrel offset to GOT. */ #define R_X86_64_32 10 /* Add 32 bit zero extended symbol value */ #define R_X86_64_32S 11 /* Add 32 bit sign extended symbol value */ #define R_X86_64_16 12 /* Add 16 bit zero extended symbol value */ #define R_X86_64_PC16 13 /* Add 16 bit signed extended pc relative symbol value */ #define R_X86_64_8 14 /* Add 8 bit zero extended symbol value */ #define R_X86_64_PC8 15 /* Add 8 bit signed extended pc relative symbol value */ #define R_X86_64_DTPMOD64 16 /* ID of module containing symbol */ #define R_X86_64_DTPOFF64 17 /* Offset in TLS block */ #define R_X86_64_TPOFF64 18 /* Offset in static TLS block */ #define R_X86_64_TLSGD 19 /* PC relative offset to GD GOT entry */ #define R_X86_64_TLSLD 20 /* PC relative offset to LD GOT entry */ #define R_X86_64_DTPOFF32 21 /* Offset in TLS block */ #define R_X86_64_GOTTPOFF 22 /* PC relative offset to IE GOT entry */ #define R_X86_64_TPOFF32 23 /* Offset in static TLS block */ #define R_X86_64_PC64 24 /* PC-relative 64 bit signed sym value. */ #define R_X86_64_GOTOFF64 25 #define R_X86_64_GOTPC32 26 #define R_X86_64_GOT64 27 #define R_X86_64_GOTPCREL64 28 #define R_X86_64_GOTPC64 29 #define R_X86_64_GOTPLT64 30 #define R_X86_64_PLTOFF64 31 #define R_X86_64_SIZE32 32 #define R_X86_64_SIZE64 33 #define R_X86_64_GOTPC32_TLSDESC 34 #define R_X86_64_TLSDESC_CALL 35 #define R_X86_64_TLSDESC 36 #define R_X86_64_IRELATIVE 37 #endif /* !_SYS_ELF_COMMON_H_ */ Index: projects/clang900-import/sys/sys/mount.h =================================================================== --- projects/clang900-import/sys/sys/mount.h (revision 352586) +++ projects/clang900-import/sys/sys/mount.h (revision 352587) @@ -1,1049 +1,1049 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)mount.h 8.21 (Berkeley) 5/20/95 * $FreeBSD$ */ #ifndef _SYS_MOUNT_H_ #define _SYS_MOUNT_H_ #include #include #ifdef _KERNEL #include #include #include #include #include #endif /* * NOTE: When changing statfs structure, mount structure, MNT_* flags or * MNTK_* flags also update DDB show mount command in vfs_subr.c. */ typedef struct fsid { int32_t val[2]; } fsid_t; /* filesystem id type */ /* * File identifier. * These are unique per filesystem on a single machine. */ #define MAXFIDSZ 16 struct fid { u_short fid_len; /* length of data in bytes */ u_short fid_data0; /* force longword alignment */ char fid_data[MAXFIDSZ]; /* data (variable length) */ }; /* * filesystem statistics */ #define MFSNAMELEN 16 /* length of type name including null */ #define MNAMELEN 1024 /* size of on/from name bufs */ #define STATFS_VERSION 0x20140518 /* current version number */ struct statfs { uint32_t f_version; /* structure version number */ uint32_t f_type; /* type of filesystem */ uint64_t f_flags; /* copy of mount exported flags */ uint64_t f_bsize; /* filesystem fragment size */ uint64_t f_iosize; /* optimal transfer block size */ uint64_t f_blocks; /* total data blocks in filesystem */ uint64_t f_bfree; /* free blocks in filesystem */ int64_t f_bavail; /* free blocks avail to non-superuser */ uint64_t f_files; /* total file nodes in filesystem */ int64_t f_ffree; /* free nodes avail to non-superuser */ uint64_t f_syncwrites; /* count of sync writes since mount */ uint64_t f_asyncwrites; /* count of async writes since mount */ uint64_t f_syncreads; /* count of sync reads since mount */ uint64_t f_asyncreads; /* count of async reads since mount */ uint64_t f_spare[10]; /* unused spare */ uint32_t f_namemax; /* maximum filename length */ uid_t f_owner; /* user that mounted the filesystem */ fsid_t f_fsid; /* filesystem id */ char f_charspare[80]; /* spare string space */ char f_fstypename[MFSNAMELEN]; /* filesystem type name */ char f_mntfromname[MNAMELEN]; /* mounted filesystem */ char f_mntonname[MNAMELEN]; /* directory on which mounted */ }; #if defined(_WANT_FREEBSD11_STATFS) || defined(_KERNEL) #define FREEBSD11_STATFS_VERSION 0x20030518 /* current version number */ struct freebsd11_statfs { uint32_t f_version; /* structure version number */ uint32_t f_type; /* type of filesystem */ uint64_t f_flags; /* copy of mount exported flags */ uint64_t f_bsize; /* filesystem fragment size */ uint64_t f_iosize; /* optimal transfer block size */ uint64_t f_blocks; /* total data blocks in filesystem */ uint64_t f_bfree; /* free blocks in filesystem */ int64_t f_bavail; /* free blocks avail to non-superuser */ uint64_t f_files; /* total file nodes in filesystem */ int64_t f_ffree; /* free nodes avail to non-superuser */ uint64_t f_syncwrites; /* count of sync writes since mount */ uint64_t f_asyncwrites; /* count of async writes since mount */ uint64_t f_syncreads; /* count of sync reads since mount */ uint64_t f_asyncreads; /* count of async reads since mount */ uint64_t f_spare[10]; /* unused 
spare */ uint32_t f_namemax; /* maximum filename length */ uid_t f_owner; /* user that mounted the filesystem */ fsid_t f_fsid; /* filesystem id */ char f_charspare[80]; /* spare string space */ char f_fstypename[16]; /* filesystem type name */ char f_mntfromname[88]; /* mounted filesystem */ char f_mntonname[88]; /* directory on which mounted */ }; #endif /* _WANT_FREEBSD11_STATFS || _KERNEL */ #ifdef _KERNEL #define OMFSNAMELEN 16 /* length of fs type name, including null */ #define OMNAMELEN (88 - 2 * sizeof(long)) /* size of on/from name bufs */ /* XXX getfsstat.2 is out of date with write and read counter changes here. */ /* XXX statfs.2 is out of date with read counter changes here. */ struct ostatfs { long f_spare2; /* placeholder */ long f_bsize; /* fundamental filesystem block size */ long f_iosize; /* optimal transfer block size */ long f_blocks; /* total data blocks in filesystem */ long f_bfree; /* free blocks in fs */ long f_bavail; /* free blocks avail to non-superuser */ long f_files; /* total file nodes in filesystem */ long f_ffree; /* free file nodes in fs */ fsid_t f_fsid; /* filesystem id */ uid_t f_owner; /* user that mounted the filesystem */ int f_type; /* type of filesystem */ int f_flags; /* copy of mount exported flags */ long f_syncwrites; /* count of sync writes since mount */ long f_asyncwrites; /* count of async writes since mount */ char f_fstypename[OMFSNAMELEN]; /* fs type name */ char f_mntonname[OMNAMELEN]; /* directory on which mounted */ long f_syncreads; /* count of sync reads since mount */ long f_asyncreads; /* count of async reads since mount */ short f_spares1; /* unused spare */ char f_mntfromname[OMNAMELEN];/* mounted filesystem */ short f_spares2; /* unused spare */ /* * XXX on machines where longs are aligned to 8-byte boundaries, there * is an unnamed int32_t here. This spare was after the apparent end * of the struct until we bit off the read counters from f_mntonname. */ long f_spare[2]; /* unused spare */ }; TAILQ_HEAD(vnodelst, vnode); /* Mount options list */ TAILQ_HEAD(vfsoptlist, vfsopt); struct vfsopt { TAILQ_ENTRY(vfsopt) link; char *name; void *value; int len; int pos; int seen; }; /* * Structure per mounted filesystem. Each mounted filesystem has an * array of operations and an instance record. The filesystems are * put on a doubly linked list. * * Lock reference: * l - mnt_listmtx * m - mountlist_mtx * i - interlock * v - vnode freelist mutex * * Unmarked fields are considered stable as long as a ref is held. 
* */ struct mount { struct mtx mnt_mtx; /* mount structure interlock */ int mnt_gen; /* struct mount generation */ #define mnt_startzero mnt_list TAILQ_ENTRY(mount) mnt_list; /* (m) mount list */ struct vfsops *mnt_op; /* operations on fs */ struct vfsconf *mnt_vfc; /* configuration info */ struct vnode *mnt_vnodecovered; /* vnode we mounted on */ struct vnode *mnt_syncer; /* syncer vnode */ int mnt_ref; /* (i) Reference count */ struct vnodelst mnt_nvnodelist; /* (i) list of vnodes */ int mnt_nvnodelistsize; /* (i) # of vnodes */ int mnt_writeopcount; /* (i) write syscalls pending */ int mnt_kern_flag; /* (i) kernel only flags */ uint64_t mnt_flag; /* (i) flags shared with user */ struct vfsoptlist *mnt_opt; /* current mount options */ struct vfsoptlist *mnt_optnew; /* new options passed to fs */ int mnt_maxsymlinklen; /* max size of short symlink */ struct statfs mnt_stat; /* cache of filesystem stats */ struct ucred *mnt_cred; /* credentials of mounter */ void * mnt_data; /* private data */ time_t mnt_time; /* last time written*/ int mnt_iosize_max; /* max size for clusters, etc */ struct netexport *mnt_export; /* export list */ struct label *mnt_label; /* MAC label for the fs */ u_int mnt_hashseed; /* Random seed for vfs_hash */ int mnt_lockref; /* (i) Lock reference count */ int mnt_secondary_writes; /* (i) # of secondary writes */ int mnt_secondary_accwrites;/* (i) secondary wr. starts */ struct thread *mnt_susp_owner; /* (i) thread owning suspension */ #define mnt_endzero mnt_gjprovider char *mnt_gjprovider; /* gjournal provider name */ struct mtx mnt_listmtx; struct vnodelst mnt_activevnodelist; /* (l) list of active vnodes */ int mnt_activevnodelistsize;/* (l) # of active vnodes */ struct vnodelst mnt_tmpfreevnodelist; /* (l) list of free vnodes */ int mnt_tmpfreevnodelistsize;/* (l) # of free vnodes */ struct lock mnt_explock; /* vfs_export walkers lock */ TAILQ_ENTRY(mount) mnt_upper_link; /* (m) we in the all uppers */ TAILQ_HEAD(, mount) mnt_uppers; /* (m) upper mounts over us*/ - int mnt_vfs_ops; /* (i) pending vfs ops */ + int __aligned(CACHE_LINE_SIZE) mnt_vfs_ops;/* (i) pending vfs ops */ int *mnt_thread_in_ops_pcpu; int *mnt_ref_pcpu; int *mnt_lockref_pcpu; int *mnt_writeopcount_pcpu; }; /* * Definitions for MNT_VNODE_FOREACH_ALL. */ struct vnode *__mnt_vnode_next_all(struct vnode **mvp, struct mount *mp); struct vnode *__mnt_vnode_first_all(struct vnode **mvp, struct mount *mp); void __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp); #define MNT_VNODE_FOREACH_ALL(vp, mp, mvp) \ for (vp = __mnt_vnode_first_all(&(mvp), (mp)); \ (vp) != NULL; vp = __mnt_vnode_next_all(&(mvp), (mp))) #define MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp) \ do { \ MNT_ILOCK(mp); \ __mnt_vnode_markerfree_all(&(mvp), (mp)); \ /* MNT_IUNLOCK(mp); -- done in above function */ \ mtx_assert(MNT_MTX(mp), MA_NOTOWNED); \ } while (0) /* * Definitions for MNT_VNODE_FOREACH_ACTIVE. 
*/ struct vnode *__mnt_vnode_next_active(struct vnode **mvp, struct mount *mp); struct vnode *__mnt_vnode_first_active(struct vnode **mvp, struct mount *mp); void __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *); #define MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) \ for (vp = __mnt_vnode_first_active(&(mvp), (mp)); \ (vp) != NULL; vp = __mnt_vnode_next_active(&(mvp), (mp))) #define MNT_VNODE_FOREACH_ACTIVE_ABORT(mp, mvp) \ __mnt_vnode_markerfree_active(&(mvp), (mp)) #define MNT_ILOCK(mp) mtx_lock(&(mp)->mnt_mtx) #define MNT_ITRYLOCK(mp) mtx_trylock(&(mp)->mnt_mtx) #define MNT_IUNLOCK(mp) mtx_unlock(&(mp)->mnt_mtx) #define MNT_MTX(mp) (&(mp)->mnt_mtx) #define MNT_REF(mp) do { \ mtx_assert(MNT_MTX(mp), MA_OWNED); \ mp->mnt_ref++; \ } while (0) #define MNT_REL(mp) do { \ mtx_assert(MNT_MTX(mp), MA_OWNED); \ (mp)->mnt_ref--; \ if ((mp)->mnt_vfs_ops && (mp)->mnt_ref < 0) \ vfs_dump_mount_counters(mp); \ if ((mp)->mnt_ref == 0 && (mp)->mnt_vfs_ops) \ wakeup((mp)); \ } while (0) #endif /* _KERNEL */ /* * User specifiable flags, stored in mnt_flag. */ #define MNT_RDONLY 0x0000000000000001ULL /* read only filesystem */ #define MNT_SYNCHRONOUS 0x0000000000000002ULL /* fs written synchronously */ #define MNT_NOEXEC 0x0000000000000004ULL /* can't exec from filesystem */ #define MNT_NOSUID 0x0000000000000008ULL /* don't honor setuid fs bits */ #define MNT_NFS4ACLS 0x0000000000000010ULL /* enable NFS version 4 ACLs */ #define MNT_UNION 0x0000000000000020ULL /* union with underlying fs */ #define MNT_ASYNC 0x0000000000000040ULL /* fs written asynchronously */ #define MNT_SUIDDIR 0x0000000000100000ULL /* special SUID dir handling */ #define MNT_SOFTDEP 0x0000000000200000ULL /* using soft updates */ #define MNT_NOSYMFOLLOW 0x0000000000400000ULL /* do not follow symlinks */ #define MNT_GJOURNAL 0x0000000002000000ULL /* GEOM journal support enabled */ #define MNT_MULTILABEL 0x0000000004000000ULL /* MAC support for objects */ #define MNT_ACLS 0x0000000008000000ULL /* ACL support enabled */ #define MNT_NOATIME 0x0000000010000000ULL /* dont update file access time */ #define MNT_NOCLUSTERR 0x0000000040000000ULL /* disable cluster read */ #define MNT_NOCLUSTERW 0x0000000080000000ULL /* disable cluster write */ #define MNT_SUJ 0x0000000100000000ULL /* using journaled soft updates */ #define MNT_AUTOMOUNTED 0x0000000200000000ULL /* mounted by automountd(8) */ #define MNT_UNTRUSTED 0x0000000800000000ULL /* filesys metadata untrusted */ /* * NFS export related mount flags. */ #define MNT_EXRDONLY 0x0000000000000080ULL /* exported read only */ #define MNT_EXPORTED 0x0000000000000100ULL /* filesystem is exported */ #define MNT_DEFEXPORTED 0x0000000000000200ULL /* exported to the world */ #define MNT_EXPORTANON 0x0000000000000400ULL /* anon uid mapping for all */ #define MNT_EXKERB 0x0000000000000800ULL /* exported with Kerberos */ #define MNT_EXPUBLIC 0x0000000020000000ULL /* public export (WebNFS) */ /* * Flags set by internal operations, * but visible to the user. * XXX some of these are not quite right.. (I've never seen the root flag set) */ #define MNT_LOCAL 0x0000000000001000ULL /* filesystem is stored locally */ #define MNT_QUOTA 0x0000000000002000ULL /* quotas are enabled on fs */ #define MNT_ROOTFS 0x0000000000004000ULL /* identifies the root fs */ #define MNT_USER 0x0000000000008000ULL /* mounted by a user */ #define MNT_IGNORE 0x0000000000800000ULL /* do not show entry in df */ #define MNT_VERIFIED 0x0000000400000000ULL /* filesystem is verified */ /* * Mask of flags that are visible to statfs(). 
* XXX I think that this could now become (~(MNT_CMDFLAGS)) * but the 'mount' program may need changing to handle this. */ #define MNT_VISFLAGMASK (MNT_RDONLY | MNT_SYNCHRONOUS | MNT_NOEXEC | \ MNT_NOSUID | MNT_UNION | MNT_SUJ | \ MNT_ASYNC | MNT_EXRDONLY | MNT_EXPORTED | \ MNT_DEFEXPORTED | MNT_EXPORTANON| MNT_EXKERB | \ MNT_LOCAL | MNT_USER | MNT_QUOTA | \ MNT_ROOTFS | MNT_NOATIME | MNT_NOCLUSTERR| \ MNT_NOCLUSTERW | MNT_SUIDDIR | MNT_SOFTDEP | \ MNT_IGNORE | MNT_EXPUBLIC | MNT_NOSYMFOLLOW | \ MNT_GJOURNAL | MNT_MULTILABEL | MNT_ACLS | \ MNT_NFS4ACLS | MNT_AUTOMOUNTED | MNT_VERIFIED | \ MNT_UNTRUSTED) /* Mask of flags that can be updated. */ #define MNT_UPDATEMASK (MNT_NOSUID | MNT_NOEXEC | \ MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | \ MNT_NOATIME | \ MNT_NOSYMFOLLOW | MNT_IGNORE | \ MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR | \ MNT_ACLS | MNT_USER | MNT_NFS4ACLS | \ MNT_AUTOMOUNTED | MNT_UNTRUSTED) /* * External filesystem command modifier flags. * Unmount can use the MNT_FORCE flag. * XXX: These are not STATES and really should be somewhere else. * XXX: MNT_BYFSID and MNT_NONBUSY collide with MNT_ACLS and MNT_MULTILABEL, * but because MNT_ACLS and MNT_MULTILABEL are only used for mount(2), * and MNT_BYFSID and MNT_NONBUSY are only used for unmount(2), * it's harmless. */ #define MNT_UPDATE 0x0000000000010000ULL /* not real mount, just update */ #define MNT_DELEXPORT 0x0000000000020000ULL /* delete export host lists */ #define MNT_RELOAD 0x0000000000040000ULL /* reload filesystem data */ #define MNT_FORCE 0x0000000000080000ULL /* force unmount or readonly */ #define MNT_SNAPSHOT 0x0000000001000000ULL /* snapshot the filesystem */ #define MNT_NONBUSY 0x0000000004000000ULL /* check vnode use counts. */ #define MNT_BYFSID 0x0000000008000000ULL /* specify filesystem by ID. */ #define MNT_CMDFLAGS (MNT_UPDATE | MNT_DELEXPORT | MNT_RELOAD | \ MNT_FORCE | MNT_SNAPSHOT | MNT_NONBUSY | \ MNT_BYFSID) /* * Internal filesystem control flags stored in mnt_kern_flag. * * MNTK_UNMOUNT locks the mount entry so that name lookup cannot * proceed past the mount point. This keeps the subtree stable during * mounts and unmounts. When non-forced unmount flushes all vnodes * from the mp queue, the MNTK_UNMOUNT flag prevents insmntque() from * queueing new vnodes. * * MNTK_UNMOUNTF permits filesystems to detect a forced unmount while * dounmount() is still waiting to lock the mountpoint. This allows * the filesystem to cancel operations that might otherwise deadlock * with the unmount attempt (used by NFS). */ #define MNTK_UNMOUNTF 0x00000001 /* forced unmount in progress */ #define MNTK_ASYNC 0x00000002 /* filtered async flag */ #define MNTK_SOFTDEP 0x00000004 /* async disabled by softdep */ #define MNTK_DRAINING 0x00000010 /* lock draining is happening */ #define MNTK_REFEXPIRE 0x00000020 /* refcount expiring is happening */ #define MNTK_EXTENDED_SHARED 0x00000040 /* Allow shared locking for more ops */ #define MNTK_SHARED_WRITES 0x00000080 /* Allow shared locking for writes */ #define MNTK_NO_IOPF 0x00000100 /* Disallow page faults during reads and writes. Filesystem shall properly handle i/o state on EFAULT. */ #define MNTK_VGONE_UPPER 0x00000200 #define MNTK_VGONE_WAITER 0x00000400 #define MNTK_LOOKUP_EXCL_DOTDOT 0x00000800 #define MNTK_MARKER 0x00001000 #define MNTK_UNMAPPED_BUFS 0x00002000 #define MNTK_USES_BCACHE 0x00004000 /* FS uses the buffer cache. 
*/ #define MNTK_TEXT_REFS 0x00008000 /* Keep use ref for text */ #define MNTK_NOASYNC 0x00800000 /* disable async */ #define MNTK_UNMOUNT 0x01000000 /* unmount in progress */ #define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */ #define MNTK_SUSPEND 0x08000000 /* request write suspension */ #define MNTK_SUSPEND2 0x04000000 /* block secondary writes */ #define MNTK_SUSPENDED 0x10000000 /* write operations are suspended */ #define MNTK_NULL_NOCACHE 0x20000000 /* auto disable cache for nullfs mounts over this fs */ #define MNTK_LOOKUP_SHARED 0x40000000 /* FS supports shared lock lookups */ #define MNTK_NOKNOTE 0x80000000 /* Don't send KNOTEs from VOP hooks */ #ifdef _KERNEL static inline int MNT_SHARED_WRITES(struct mount *mp) { return (mp != NULL && (mp->mnt_kern_flag & MNTK_SHARED_WRITES) != 0); } static inline int MNT_EXTENDED_SHARED(struct mount *mp) { return (mp != NULL && (mp->mnt_kern_flag & MNTK_EXTENDED_SHARED) != 0); } #endif /* * Sysctl CTL_VFS definitions. * * Second level identifier specifies which filesystem. Second level * identifier VFS_VFSCONF returns information about all filesystems. * Second level identifier VFS_GENERIC is non-terminal. */ #define VFS_VFSCONF 0 /* get configured filesystems */ #define VFS_GENERIC 0 /* generic filesystem information */ /* * Third level identifiers for VFS_GENERIC are given below; third * level identifiers for specific filesystems are given in their * mount specific header files. */ #define VFS_MAXTYPENUM 1 /* int: highest defined filesystem type */ #define VFS_CONF 2 /* struct: vfsconf for filesystem given as next argument */ /* * Flags for various system call interfaces. * * waitfor flags to vfs_sync() and getfsstat() */ #define MNT_WAIT 1 /* synchronously wait for I/O to complete */ #define MNT_NOWAIT 2 /* start all I/O, but do not wait for it */ #define MNT_LAZY 3 /* push data not written by filesystem syncer */ #define MNT_SUSPEND 4 /* Suspend file system after sync */ /* * Generic file handle */ struct fhandle { fsid_t fh_fsid; /* Filesystem id of mount point */ struct fid fh_fid; /* Filesys specific id */ }; typedef struct fhandle fhandle_t; /* * Old export arguments without security flavor list */ struct oexport_args { int ex_flags; /* export related flags */ uid_t ex_root; /* mapping for root uid */ struct xucred ex_anon; /* mapping for anonymous user */ struct sockaddr *ex_addr; /* net address to which exported */ u_char ex_addrlen; /* and the net address length */ struct sockaddr *ex_mask; /* mask of valid bits in saddr */ u_char ex_masklen; /* and the smask length */ char *ex_indexfile; /* index file for WebNFS URLs */ }; /* * Export arguments for local filesystem mount calls. */ #define MAXSECFLAVORS 5 struct export_args { int ex_flags; /* export related flags */ uid_t ex_root; /* mapping for root uid */ struct xucred ex_anon; /* mapping for anonymous user */ struct sockaddr *ex_addr; /* net address to which exported */ u_char ex_addrlen; /* and the net address length */ struct sockaddr *ex_mask; /* mask of valid bits in saddr */ u_char ex_masklen; /* and the smask length */ char *ex_indexfile; /* index file for WebNFS URLs */ int ex_numsecflavors; /* security flavor count */ int ex_secflavors[MAXSECFLAVORS]; /* list of security flavors */ }; /* * Structure holding information for a publicly exported filesystem * (WebNFS). Currently the specs allow just for one such filesystem. 
*/ struct nfs_public { int np_valid; /* Do we hold valid information */ fhandle_t np_handle; /* Filehandle for pub fs (internal) */ struct mount *np_mount; /* Mountpoint of exported fs */ char *np_index; /* Index file */ }; /* * Filesystem configuration information. One of these exists for each * type of filesystem supported by the kernel. These are searched at * mount time to identify the requested filesystem. * * XXX: Never change the first two arguments! */ struct vfsconf { u_int vfc_version; /* ABI version number */ char vfc_name[MFSNAMELEN]; /* filesystem type name */ struct vfsops *vfc_vfsops; /* filesystem operations vector */ struct vfsops *vfc_vfsops_sd; /* ... signal-deferred */ int vfc_typenum; /* historic filesystem type number */ int vfc_refcount; /* number mounted of this type */ int vfc_flags; /* permanent flags */ int vfc_prison_flag; /* prison allow.mount.* flag */ struct vfsoptdecl *vfc_opts; /* mount options */ TAILQ_ENTRY(vfsconf) vfc_list; /* list of vfscons */ }; /* Userland version of the struct vfsconf. */ struct xvfsconf { struct vfsops *vfc_vfsops; /* filesystem operations vector */ char vfc_name[MFSNAMELEN]; /* filesystem type name */ int vfc_typenum; /* historic filesystem type number */ int vfc_refcount; /* number mounted of this type */ int vfc_flags; /* permanent flags */ struct vfsconf *vfc_next; /* next in list */ }; #ifndef BURN_BRIDGES struct ovfsconf { void *vfc_vfsops; char vfc_name[32]; int vfc_index; int vfc_refcount; int vfc_flags; }; #endif /* * NB: these flags refer to IMPLEMENTATION properties, not properties of * any actual mounts; i.e., it does not make sense to change the flags. */ #define VFCF_STATIC 0x00010000 /* statically compiled into kernel */ #define VFCF_NETWORK 0x00020000 /* may get data over the network */ #define VFCF_READONLY 0x00040000 /* writes are not implemented */ #define VFCF_SYNTHETIC 0x00080000 /* data does not represent real files */ #define VFCF_LOOPBACK 0x00100000 /* aliases some other mounted FS */ #define VFCF_UNICODE 0x00200000 /* stores file names as Unicode */ #define VFCF_JAIL 0x00400000 /* can be mounted from within a jail */ #define VFCF_DELEGADMIN 0x00800000 /* supports delegated administration */ #define VFCF_SBDRY 0x01000000 /* Stop at Boundary: defer stop requests to kernel->user (AST) transition */ typedef uint32_t fsctlop_t; struct vfsidctl { int vc_vers; /* should be VFSIDCTL_VERS1 (below) */ fsid_t vc_fsid; /* fsid to operate on */ char vc_fstypename[MFSNAMELEN]; /* type of fs 'nfs' or '*' */ fsctlop_t vc_op; /* operation VFS_CTL_* (below) */ void *vc_ptr; /* pointer to data structure */ size_t vc_len; /* sizeof said structure */ u_int32_t vc_spare[12]; /* spare (must be zero) */ }; /* vfsidctl API version. */ #define VFS_CTL_VERS1 0x01 /* * New style VFS sysctls, do not reuse/conflict with the namespace for * private sysctls. * All "global" sysctl ops have the 33rd bit set: * 0x...1.... * Private sysctl ops should have the 33rd bit unset. */ #define VFS_CTL_QUERY 0x00010001 /* anything wrong? 
(vfsquery) */ #define VFS_CTL_TIMEO 0x00010002 /* set timeout for vfs notification */ #define VFS_CTL_NOLOCKS 0x00010003 /* disable file locking */ struct vfsquery { u_int32_t vq_flags; u_int32_t vq_spare[31]; }; /* vfsquery flags */ #define VQ_NOTRESP 0x0001 /* server down */ #define VQ_NEEDAUTH 0x0002 /* server bad auth */ #define VQ_LOWDISK 0x0004 /* we're low on space */ #define VQ_MOUNT 0x0008 /* new filesystem arrived */ #define VQ_UNMOUNT 0x0010 /* filesystem has left */ #define VQ_DEAD 0x0020 /* filesystem is dead, needs force unmount */ #define VQ_ASSIST 0x0040 /* filesystem needs assistance from external program */ #define VQ_NOTRESPLOCK 0x0080 /* server lockd down */ #define VQ_FLAG0100 0x0100 /* placeholder */ #define VQ_FLAG0200 0x0200 /* placeholder */ #define VQ_FLAG0400 0x0400 /* placeholder */ #define VQ_FLAG0800 0x0800 /* placeholder */ #define VQ_FLAG1000 0x1000 /* placeholder */ #define VQ_FLAG2000 0x2000 /* placeholder */ #define VQ_FLAG4000 0x4000 /* placeholder */ #define VQ_FLAG8000 0x8000 /* placeholder */ #ifdef _KERNEL /* Point a sysctl request at a vfsidctl's data. */ #define VCTLTOREQ(vc, req) \ do { \ (req)->newptr = (vc)->vc_ptr; \ (req)->newlen = (vc)->vc_len; \ (req)->newidx = 0; \ } while (0) #endif struct iovec; struct uio; #ifdef _KERNEL /* * vfs_busy specific flags and mask. */ #define MBF_NOWAIT 0x01 #define MBF_MNTLSTLOCK 0x02 #define MBF_MASK (MBF_NOWAIT | MBF_MNTLSTLOCK) #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_MOUNT); MALLOC_DECLARE(M_STATFS); #endif extern int maxvfsconf; /* highest defined filesystem type */ TAILQ_HEAD(vfsconfhead, vfsconf); extern struct vfsconfhead vfsconf; /* * Operations supported on mounted filesystem. */ struct mount_args; struct nameidata; struct sysctl_req; struct mntarg; /* * N.B., vfs_cmount is the ancient vfsop invoked by the old mount(2) syscall. * The new way is vfs_mount. * * vfs_cmount implementations typically translate arguments from their * respective old per-FS structures into the key-value list supported by * nmount(2), then use kernel_mount(9) to mimic nmount(2) from kernelspace. * * Filesystems with mounters that use nmount(2) do not need to and should not * implement vfs_cmount. Hopefully a future cleanup can remove vfs_cmount and * mount(2) entirely. 
*/ typedef int vfs_cmount_t(struct mntarg *ma, void *data, uint64_t flags); typedef int vfs_unmount_t(struct mount *mp, int mntflags); typedef int vfs_root_t(struct mount *mp, int flags, struct vnode **vpp); typedef int vfs_quotactl_t(struct mount *mp, int cmds, uid_t uid, void *arg); typedef int vfs_statfs_t(struct mount *mp, struct statfs *sbp); typedef int vfs_sync_t(struct mount *mp, int waitfor); typedef int vfs_vget_t(struct mount *mp, ino_t ino, int flags, struct vnode **vpp); typedef int vfs_fhtovp_t(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp); typedef int vfs_checkexp_t(struct mount *mp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp, int *numsecflavors, int **secflavors); typedef int vfs_init_t(struct vfsconf *); typedef int vfs_uninit_t(struct vfsconf *); typedef int vfs_extattrctl_t(struct mount *mp, int cmd, struct vnode *filename_vp, int attrnamespace, const char *attrname); typedef int vfs_mount_t(struct mount *mp); typedef int vfs_sysctl_t(struct mount *mp, fsctlop_t op, struct sysctl_req *req); typedef void vfs_susp_clean_t(struct mount *mp); typedef void vfs_notify_lowervp_t(struct mount *mp, struct vnode *lowervp); typedef void vfs_purge_t(struct mount *mp); struct vfsops { vfs_mount_t *vfs_mount; vfs_cmount_t *vfs_cmount; vfs_unmount_t *vfs_unmount; vfs_root_t *vfs_root; vfs_quotactl_t *vfs_quotactl; vfs_statfs_t *vfs_statfs; vfs_sync_t *vfs_sync; vfs_vget_t *vfs_vget; vfs_fhtovp_t *vfs_fhtovp; vfs_checkexp_t *vfs_checkexp; vfs_init_t *vfs_init; vfs_uninit_t *vfs_uninit; vfs_extattrctl_t *vfs_extattrctl; vfs_sysctl_t *vfs_sysctl; vfs_susp_clean_t *vfs_susp_clean; vfs_notify_lowervp_t *vfs_reclaim_lowervp; vfs_notify_lowervp_t *vfs_unlink_lowervp; vfs_purge_t *vfs_purge; vfs_mount_t *vfs_spare[6]; /* spares for ABI compat */ }; vfs_statfs_t __vfs_statfs; #define VFS_MOUNT(MP) ({ \ int _rc; \ \ TSRAW(curthread, TS_ENTER, "VFS_MOUNT", (MP)->mnt_vfc->vfc_name);\ _rc = (*(MP)->mnt_op->vfs_mount)(MP); \ TSRAW(curthread, TS_EXIT, "VFS_MOUNT", (MP)->mnt_vfc->vfc_name);\ _rc; }) #define VFS_UNMOUNT(MP, FORCE) ({ \ int _rc; \ \ _rc = (*(MP)->mnt_op->vfs_unmount)(MP, FORCE); \ _rc; }) #define VFS_ROOT(MP, FLAGS, VPP) ({ \ int _rc; \ \ _rc = (*(MP)->mnt_op->vfs_root)(MP, FLAGS, VPP); \ _rc; }) #define VFS_QUOTACTL(MP, C, U, A) ({ \ int _rc; \ \ _rc = (*(MP)->mnt_op->vfs_quotactl)(MP, C, U, A); \ _rc; }) #define VFS_STATFS(MP, SBP) ({ \ int _rc; \ \ _rc = __vfs_statfs((MP), (SBP)); \ _rc; }) #define VFS_SYNC(MP, WAIT) ({ \ int _rc; \ \ _rc = (*(MP)->mnt_op->vfs_sync)(MP, WAIT); \ _rc; }) #define VFS_VGET(MP, INO, FLAGS, VPP) ({ \ int _rc; \ \ _rc = (*(MP)->mnt_op->vfs_vget)(MP, INO, FLAGS, VPP); \ _rc; }) #define VFS_FHTOVP(MP, FIDP, FLAGS, VPP) ({ \ int _rc; \ \ _rc = (*(MP)->mnt_op->vfs_fhtovp)(MP, FIDP, FLAGS, VPP); \ _rc; }) #define VFS_CHECKEXP(MP, NAM, EXFLG, CRED, NUMSEC, SEC) ({ \ int _rc; \ \ _rc = (*(MP)->mnt_op->vfs_checkexp)(MP, NAM, EXFLG, CRED, NUMSEC,\ SEC); \ _rc; }) #define VFS_EXTATTRCTL(MP, C, FN, NS, N) ({ \ int _rc; \ \ _rc = (*(MP)->mnt_op->vfs_extattrctl)(MP, C, FN, NS, N); \ _rc; }) #define VFS_SYSCTL(MP, OP, REQ) ({ \ int _rc; \ \ _rc = (*(MP)->mnt_op->vfs_sysctl)(MP, OP, REQ); \ _rc; }) #define VFS_SUSP_CLEAN(MP) do { \ if (*(MP)->mnt_op->vfs_susp_clean != NULL) { \ (*(MP)->mnt_op->vfs_susp_clean)(MP); \ } \ } while (0) #define VFS_RECLAIM_LOWERVP(MP, VP) do { \ if (*(MP)->mnt_op->vfs_reclaim_lowervp != NULL) { \ (*(MP)->mnt_op->vfs_reclaim_lowervp)((MP), (VP)); \ } \ } while (0) #define VFS_UNLINK_LOWERVP(MP, VP) do { 
\ if (*(MP)->mnt_op->vfs_unlink_lowervp != NULL) { \ (*(MP)->mnt_op->vfs_unlink_lowervp)((MP), (VP)); \ } \ } while (0) #define VFS_PURGE(MP) do { \ if (*(MP)->mnt_op->vfs_purge != NULL) { \ (*(MP)->mnt_op->vfs_purge)(MP); \ } \ } while (0) #define VFS_KNOTE_LOCKED(vp, hint) do \ { \ if (((vp)->v_vflag & VV_NOKNOTE) == 0) \ VN_KNOTE((vp), (hint), KNF_LISTLOCKED); \ } while (0) #define VFS_KNOTE_UNLOCKED(vp, hint) do \ { \ if (((vp)->v_vflag & VV_NOKNOTE) == 0) \ VN_KNOTE((vp), (hint), 0); \ } while (0) #define VFS_NOTIFY_UPPER_RECLAIM 1 #define VFS_NOTIFY_UPPER_UNLINK 2 #include /* * Version numbers. */ #define VFS_VERSION_00 0x19660120 #define VFS_VERSION_01 0x20121030 #define VFS_VERSION_02 0x20180504 #define VFS_VERSION VFS_VERSION_02 #define VFS_SET(vfsops, fsname, flags) \ static struct vfsconf fsname ## _vfsconf = { \ .vfc_version = VFS_VERSION, \ .vfc_name = #fsname, \ .vfc_vfsops = &vfsops, \ .vfc_typenum = -1, \ .vfc_flags = flags, \ }; \ static moduledata_t fsname ## _mod = { \ #fsname, \ vfs_modevent, \ & fsname ## _vfsconf \ }; \ DECLARE_MODULE(fsname, fsname ## _mod, SI_SUB_VFS, SI_ORDER_MIDDLE) /* * exported vnode operations */ int dounmount(struct mount *, int, struct thread *); int kernel_mount(struct mntarg *ma, uint64_t flags); int kernel_vmount(int flags, ...); struct mntarg *mount_arg(struct mntarg *ma, const char *name, const void *val, int len); struct mntarg *mount_argb(struct mntarg *ma, int flag, const char *name); struct mntarg *mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...); struct mntarg *mount_argsu(struct mntarg *ma, const char *name, const void *val, int len); void statfs_scale_blocks(struct statfs *sf, long max_size); struct vfsconf *vfs_byname(const char *); struct vfsconf *vfs_byname_kld(const char *, struct thread *td, int *); void vfs_mount_destroy(struct mount *); void vfs_event_signal(fsid_t *, u_int32_t, intptr_t); void vfs_freeopts(struct vfsoptlist *opts); void vfs_deleteopt(struct vfsoptlist *opts, const char *name); int vfs_buildopts(struct uio *auio, struct vfsoptlist **options); int vfs_flagopt(struct vfsoptlist *opts, const char *name, uint64_t *w, uint64_t val); int vfs_getopt(struct vfsoptlist *, const char *, void **, int *); int vfs_getopt_pos(struct vfsoptlist *opts, const char *name); int vfs_getopt_size(struct vfsoptlist *opts, const char *name, off_t *value); char *vfs_getopts(struct vfsoptlist *, const char *, int *error); int vfs_copyopt(struct vfsoptlist *, const char *, void *, int); int vfs_filteropt(struct vfsoptlist *, const char **legal); void vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...); int vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...); int vfs_setopt(struct vfsoptlist *opts, const char *name, void *value, int len); int vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value, int len); int vfs_setopts(struct vfsoptlist *opts, const char *name, const char *value); int vfs_setpublicfs /* set publicly exported fs */ (struct mount *, struct netexport *, struct export_args *); void vfs_msync(struct mount *, int); int vfs_busy(struct mount *, int); int vfs_export /* process mount export info */ (struct mount *, struct export_args *); void vfs_allocate_syncvnode(struct mount *); void vfs_deallocate_syncvnode(struct mount *); int vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions); void vfs_getnewfsid(struct mount *); struct cdev *vfs_getrootfsid(struct mount *); struct mount *vfs_getvfs(fsid_t *); /* return vfs given fsid */ 
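/*
 * Illustrative sketch (not part of sys/mount.h): one plausible shape for a
 * vfs_cmount hook, following the comment above -- translate the filesystem's
 * legacy mount(2) argument structure into the nmount(2)-style key/value list
 * with the mount_arg*() helpers declared above, then hand it to
 * kernel_mount(9).  The "examplefs" name and its argument structure are
 * hypothetical; real filesystems pass their own option names.
 */
struct examplefs_args {			/* hypothetical old-style arguments */
	char	*fspec;			/* block device or remote path */
	int	 fsflags;		/* filesystem-specific flags */
};

static int
examplefs_cmount(struct mntarg *ma, void *data, uint64_t flags)
{
	struct examplefs_args args;
	int error;

	/* Copy the legacy argument block in from userspace. */
	error = copyin(data, &args, sizeof(args));
	if (error != 0)
		return (error);

	/* Re-express it as nmount(2) key/value options... */
	ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN);
	ma = mount_argf(ma, "fsflags", "%d", args.fsflags);

	/* ...and let the generic mount path invoke this fs's vfs_mount. */
	return (kernel_mount(ma, flags));
}
/*
 * With such a hook installed in the filesystem's struct vfsops, the old
 * mount(2) entry point and nmount(2) converge on the same vfs_mount
 * implementation.
 */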
struct mount *vfs_busyfs(fsid_t *); int vfs_modevent(module_t, int, void *); void vfs_mount_error(struct mount *, const char *, ...); void vfs_mountroot(void); /* mount our root filesystem */ void vfs_mountedfrom(struct mount *, const char *from); void vfs_notify_upper(struct vnode *, int); void vfs_oexport_conv(const struct oexport_args *oexp, struct export_args *exp); void vfs_ref(struct mount *); void vfs_rel(struct mount *); struct mount *vfs_mount_alloc(struct vnode *, struct vfsconf *, const char *, struct ucred *); int vfs_suser(struct mount *, struct thread *); void vfs_unbusy(struct mount *); void vfs_unmountall(void); extern TAILQ_HEAD(mntlist, mount) mountlist; /* mounted filesystem list */ extern struct mtx mountlist_mtx; extern struct nfs_public nfs_pub; extern struct sx vfsconf_sx; #define vfsconf_lock() sx_xlock(&vfsconf_sx) #define vfsconf_unlock() sx_xunlock(&vfsconf_sx) #define vfsconf_slock() sx_slock(&vfsconf_sx) #define vfsconf_sunlock() sx_sunlock(&vfsconf_sx) /* * Declarations for these vfs default operations are located in * kern/vfs_default.c. They will be automatically used to replace * null entries in VFS ops tables when registering a new filesystem * type in the global table. */ vfs_root_t vfs_stdroot; vfs_quotactl_t vfs_stdquotactl; vfs_statfs_t vfs_stdstatfs; vfs_sync_t vfs_stdsync; vfs_sync_t vfs_stdnosync; vfs_vget_t vfs_stdvget; vfs_fhtovp_t vfs_stdfhtovp; vfs_checkexp_t vfs_stdcheckexp; vfs_init_t vfs_stdinit; vfs_uninit_t vfs_stduninit; vfs_extattrctl_t vfs_stdextattrctl; vfs_sysctl_t vfs_stdsysctl; void syncer_suspend(void); void syncer_resume(void); void vfs_op_barrier_wait(struct mount *); void vfs_op_enter(struct mount *); void vfs_op_exit_locked(struct mount *); void vfs_op_exit(struct mount *); #ifdef DIAGNOSTIC void vfs_assert_mount_counters(struct mount *); void vfs_dump_mount_counters(struct mount *); #else #define vfs_assert_mount_counters(mp) do { } while (0) #define vfs_dump_mount_counters(mp) do { } while (0) #endif enum mount_counter { MNT_COUNT_REF, MNT_COUNT_LOCKREF, MNT_COUNT_WRITEOPCOUNT }; int vfs_mount_fetch_counter(struct mount *, enum mount_counter); /* * We mark ourselves as entering the section and post a sequentially consistent * fence, meaning the store is completed before we get into the section and * mnt_vfs_ops is only read afterwards. * * Any thread transitioning the ops counter 0->1 does things in the opposite * order - first bumps the count, posts a sequentially consistent fence and * observes all CPUs not executing within the section. * * This provides an invariant that by the time the last CPU is observed not * executing, everyone else entering will see the counter > 0 and exit. * * Note there is no barrier between vfs_ops and the rest of the code in the * section. It is not necessary as the writer has to wait for everyone to drain * before making any changes or only make changes safe while the section is * executed. 
*/ #define vfs_op_thread_entered(mp) ({ \ MPASS(curthread->td_critnest > 0); \ *(int *)zpcpu_get(mp->mnt_thread_in_ops_pcpu) == 1; \ }) #define vfs_op_thread_enter(mp) ({ \ bool _retval = true; \ critical_enter(); \ MPASS(!vfs_op_thread_entered(mp)); \ *(int *)zpcpu_get(mp->mnt_thread_in_ops_pcpu) = 1; \ atomic_thread_fence_seq_cst(); \ if (__predict_false(mp->mnt_vfs_ops > 0)) { \ vfs_op_thread_exit(mp); \ _retval = false; \ } \ _retval; \ }) #define vfs_op_thread_exit(mp) do { \ MPASS(vfs_op_thread_entered(mp)); \ atomic_thread_fence_rel(); \ *(int *)zpcpu_get(mp->mnt_thread_in_ops_pcpu) = 0; \ critical_exit(); \ } while (0) #define vfs_mp_count_add_pcpu(mp, count, val) do { \ MPASS(vfs_op_thread_entered(mp)); \ (*(int *)zpcpu_get(mp->mnt_##count##_pcpu)) += val; \ } while (0) #define vfs_mp_count_sub_pcpu(mp, count, val) do { \ MPASS(vfs_op_thread_entered(mp)); \ (*(int *)zpcpu_get(mp->mnt_##count##_pcpu)) -= val; \ } while (0) #else /* !_KERNEL */ #include struct stat; __BEGIN_DECLS int fhlink(struct fhandle *, const char *); int fhlinkat(struct fhandle *, int, const char *); int fhopen(const struct fhandle *, int); int fhreadlink(struct fhandle *, char *, size_t); int fhstat(const struct fhandle *, struct stat *); int fhstatfs(const struct fhandle *, struct statfs *); int fstatfs(int, struct statfs *); int getfh(const char *, fhandle_t *); int getfhat(int, char *, struct fhandle *, int); int getfsstat(struct statfs *, long, int); int getmntinfo(struct statfs **, int); int lgetfh(const char *, fhandle_t *); int mount(const char *, const char *, int, void *); int nmount(struct iovec *, unsigned int, int); int statfs(const char *, struct statfs *); int unmount(const char *, int); /* C library stuff */ int getvfsbyname(const char *, struct xvfsconf *); __END_DECLS #endif /* _KERNEL */ #endif /* !_SYS_MOUNT_H_ */ Index: projects/clang900-import/usr.bin/jot/jot.1 =================================================================== --- projects/clang900-import/usr.bin/jot/jot.1 (revision 352586) +++ projects/clang900-import/usr.bin/jot/jot.1 (revision 352587) @@ -1,328 +1,330 @@ .\" Copyright (c) 1993 .\" The Regents of the University of California. All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 3. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" @(#)jot.1 8.1 (Berkeley) 6/6/93 .\" $FreeBSD$ .\" -.Dd April 7, 2015 +.Dd September 21, 2019 .Dt JOT 1 .Os .Sh NAME .Nm jot .Nd print sequential or random data .Sh SYNOPSIS .Nm .Op Fl cnr .Op Fl b Ar word .Op Fl w Ar word .Op Fl s Ar string .Op Fl p Ar precision .Op Ar reps Op Ar begin Op Ar end Op Ar s .Sh DESCRIPTION The .Nm utility is used to print out increasing, decreasing, random, or redundant data, usually numbers, one per line. .Pp The following options are available: .Bl -tag -width indent .It Fl r Generate random data instead of the default sequential data. .It Fl b Ar word Just print .Ar word repetitively. .It Fl w Ar word Print .Ar word with the generated data appended to it. Octal, hexadecimal, exponential, .Tn ASCII , zero padded, and right-adjusted representations are possible by using the appropriate .Xr printf 3 conversion specification inside .Ar word , in which case the data are inserted rather than appended. .It Fl c This is an abbreviation for .Fl w Ar %c . .It Fl s Ar string Print data separated by .Ar string . Normally, newlines separate data. .It Fl n Do not print the final newline normally appended to the output. .It Fl p Ar precision Print only as many digits or characters of the data as indicated by the integer .Ar precision . In the absence of .Fl p , the precision is the greater of the precisions of .Ar begin and .Ar end . The .Fl p option is overridden by whatever appears in a .Xr printf 3 conversion following .Fl w . .El .Pp The last four arguments indicate, respectively, the number of data, the lower bound, the upper bound, and the step size or, for random data, the seed. While at least one of them must appear, any of the other three may be omitted, and will be considered as such if given as .Fl "" or as an empty string. Any three of these arguments determines the fourth. If four are specified and the given and computed values of .Ar reps conflict, the lower value is used. -If fewer than three are specified, defaults are assigned -left to right, except for +If one or two are specified, defaults are assigned +starting with .Ar s , -which assumes a default of 1 or -1 if both +which assumes a default of 1 (or -1 if .Ar begin and .Ar end -are given. +specify a descending range). +Then the default values are assigned to the leftmost omitted arguments until +three arguments are set. .Pp Defaults for the four arguments are, respectively, 100, 1, 100, and 1, except that when random data are requested, the seed, .Ar s , is picked randomly. The .Ar reps argument is expected to be an unsigned integer, and if given as zero is taken to be infinite. The .Ar begin and .Ar end arguments may be given as real numbers or as characters representing the corresponding value in .Tn ASCII . The last argument must be a real number. .Pp Random numbers are obtained through .Xr arc4random 3 when no seed is specified, and through .Xr random 3 when a seed is given. 
When .Nm is asked to generate random integers or characters with begin and end values in the range of the random number generator function and no format is specified with one of the .Fl w , .Fl b , or .Fl p options, .Nm will arrange for all the values in the range to appear in the output with an equal probability. In all other cases be careful to ensure that the output format's rounding or truncation will not skew the distribution of output values in an unintended way. .Pp The name .Nm derives in part from .Nm iota , a function in APL. .Ss Rounding and truncation The .Nm utility uses double precision floating point arithmetic internally. Before printing a number, it is converted depending on the output format used. .Pp If no output format is specified or the output format is a floating point format .Po .Sq E , .Sq G , .Sq e , .Sq f , or .Sq g .Pc , the value is rounded using the .Xr printf 3 function, taking into account the requested precision. .Pp If the output format is an integer format .Po .Sq D , .Sq O , .Sq U , .Sq X , .Sq c , .Sq d , .Sq i , .Sq o , .Sq u , or .Sq x .Pc , the value is converted to an integer value by truncation. .Pp As an illustration, consider the following command: .Bd -literal -offset indent $ jot 6 1 10 0.5 1 2 2 2 3 4 .Ed .Pp By requesting an explicit precision of 1, the values generated before rounding can be seen. The .5 values are rounded down if the integer part is even, up otherwise. .Bd -literal -offset indent $ jot -p 1 6 1 10 0.5 1.0 1.5 2.0 2.5 3.0 3.5 .Ed .Pp By offsetting the values slightly, the values generated by the following command are always rounded down: .Bd -literal -offset indent $ jot -p 0 6 .9999999999 10 0.5 1 1 2 2 3 3 .Ed .Pp Another way of achieving the same result is to force truncation by specifying an integer format: .Bd -literal -offset indent $ jot -w %d 6 1 10 0.5 .Ed .Sh EXIT STATUS .Ex -std .Sh EXAMPLES The command .Dl jot - 1 10 .Pp prints the integers from 1 to 10, while the command .Dl jot 21 -1 1.00 .Pp prints 21 evenly spaced numbers increasing from -1 to 1. The .Tn ASCII character set is generated with .Dl jot -c 128 0 .Pp and the strings xaa through xaz with .Dl jot -w xa%c 26 a .Pp while 20 random 8-letter strings are produced with .Dl "jot -r -c 160 a z | rs -g 0 8" .Pp Infinitely many .Em yes Ns 's may be obtained through .Dl jot -b yes 0 .Pp and thirty .Xr ed 1 substitution commands applying to lines 2, 7, 12, etc.\& is the result of .Dl jot -w %ds/old/new/ 30 2 - 5 .Pp The stuttering sequence 9, 9, 8, 8, 7, etc.\& can be produced by truncating the output precision and a suitable choice of step size, as in .Dl jot -w %d - 9.5 0 -.5 .Pp and a file containing exactly 1024 bytes is created with .Dl jot -b x 512 > block .Pp Finally, to set tabs four spaces apart starting from column 10 and ending in column 132, use .Dl expand -`jot -s, - 10 132 4` .Pp and to print all lines 80 characters or longer, .Dl grep `jot -s \&"\&" -b \&. 80` .Sh DIAGNOSTICS The following diagnostic messages deserve special explanation: .Bl -diag .It "illegal or unsupported format '%s'" The requested conversion format specifier for .Xr printf 3 was not of the form .Dl %[#][ ][{+,-}][0-9]*[.[0-9]*]? where .Dq ?\& must be one of .Dl [l]{d,i,o,u,x} or .Dl {c,e,f,g,D,E,G,O,U,X} .It "range error in conversion" A value to be printed fell outside the range of the data type associated with the requested output format. .It "too many conversions" More than one conversion format specifier has been supplied, but only one is allowed. 
.El .Sh SEE ALSO .Xr ed 1 , .Xr expand 1 , .Xr rs 1 , .Xr seq 1 , .Xr yes 1 , .Xr arc4random 3 , .Xr printf 3 , .Xr random 3 .Sh HISTORY The .Nm utility first appeared in .Bx 4.2 . .Sh AUTHORS .An John A. Kunze Index: projects/clang900-import/usr.bin/quota/quota.c =================================================================== --- projects/clang900-import/usr.bin/quota/quota.c (revision 352586) +++ projects/clang900-import/usr.bin/quota/quota.c (revision 352587) @@ -1,699 +1,699 @@ /* * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1980, 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Robert Elz at The University of Melbourne. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint static const char copyright[] = "@(#) Copyright (c) 1980, 1990, 1993\n\ The Regents of the University of California. All rights reserved.\n"; #endif #ifndef lint static const char sccsid[] = "from: @(#)quota.c 8.1 (Berkeley) 6/6/93"; #endif /* not lint */ /* * Disk quota reporting program. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static const char *qfextension[] = INITQFNAMES; struct quotause { struct quotause *next; long flags; struct dqblk dqblk; char fsname[MAXPATHLEN + 1]; }; static char *timeprt(int64_t seconds); static struct quotause *getprivs(long id, int quotatype); static void usage(void); static int showuid(u_long uid); static int showgid(u_long gid); static int showusrname(char *name); static int showgrpname(char *name); static int showquotas(int type, u_long id, const char *name); static void showrawquotas(int type, u_long id, struct quotause *qup); static void heading(int type, u_long id, const char *name, const char *tag); static int getufsquota(struct fstab *fs, struct quotause *qup, long id, int quotatype); static int getnfsquota(struct statfs *fst, struct quotause *qup, long id, int quotatype); static enum clnt_stat callaurpc(char *host, int prognum, int versnum, int procnum, xdrproc_t inproc, char *in, xdrproc_t outproc, char *out); static int alldigits(char *s); static int hflag; static int lflag; static int rflag; static int qflag; static int vflag; static char *filename = NULL; int main(int argc, char *argv[]) { int ngroups; gid_t mygid, gidset[NGROUPS]; int i, ch, gflag = 0, uflag = 0, errflag = 0; while ((ch = getopt(argc, argv, "f:ghlrquv")) != -1) { switch(ch) { case 'f': filename = optarg; break; case 'g': gflag++; break; case 'h': hflag++; break; case 'l': lflag++; break; case 'q': qflag++; break; case 'r': rflag++; break; case 'u': uflag++; break; case 'v': vflag++; break; default: usage(); } } argc -= optind; argv += optind; if (!uflag && !gflag) uflag++; if (argc == 0) { if (uflag) errflag += showuid(getuid()); if (gflag) { mygid = getgid(); ngroups = getgroups(NGROUPS, gidset); if (ngroups < 0) err(1, "getgroups"); errflag += showgid(mygid); for (i = 0; i < ngroups; i++) if (gidset[i] != mygid) errflag += showgid(gidset[i]); } return(errflag); } if (uflag && gflag) usage(); if (uflag) { for (; argc > 0; argc--, argv++) { if (alldigits(*argv)) errflag += showuid(atoi(*argv)); else errflag += showusrname(*argv); } return(errflag); } if (gflag) { for (; argc > 0; argc--, argv++) { if (alldigits(*argv)) errflag += showgid(atoi(*argv)); else errflag += showgrpname(*argv); } } return(errflag); } static void usage(void) { fprintf(stderr, "%s\n%s\n%s\n", "usage: quota [-ghlu] [-f path] [-v | -q | -r]", " quota [-hlu] [-f path] [-v | -q | -r] user ...", " quota -g [-hl] [-f path] [-v | -q | -r] group ..."); exit(1); } /* * Print out quotas for a specified user identifier. */ static int showuid(u_long uid) { struct passwd *pwd = getpwuid(uid); const char *name; if (pwd == NULL) name = "(no account)"; else name = pwd->pw_name; return(showquotas(USRQUOTA, uid, name)); } /* * Print out quotas for a specifed user name. */ static int showusrname(char *name) { struct passwd *pwd = getpwnam(name); if (pwd == NULL) { warnx("%s: unknown user", name); return(1); } return(showquotas(USRQUOTA, pwd->pw_uid, name)); } /* * Print out quotas for a specified group identifier. */ static int showgid(u_long gid) { struct group *grp = getgrgid(gid); const char *name; if (grp == NULL) name = "(no entry)"; else name = grp->gr_name; return(showquotas(GRPQUOTA, gid, name)); } /* * Print out quotas for a specifed group name. 
*/ static int showgrpname(char *name) { struct group *grp = getgrnam(name); if (grp == NULL) { warnx("%s: unknown group", name); return(1); } return(showquotas(GRPQUOTA, grp->gr_gid, name)); } static void prthumanval(int len, u_int64_t bytes) { char buf[len + 1]; /* * Limit the width to 5 bytes as that is what users expect. */ humanize_number(buf, MIN(sizeof(buf), 5), bytes, "", HN_AUTOSCALE, HN_B | HN_NOSPACE | HN_DECIMAL); (void)printf(" %*s", len, buf); } static int showquotas(int type, u_long id, const char *name) { struct quotause *qup; struct quotause *quplist; const char *msgi, *msgb; const char *nam; char *bgrace = NULL, *igrace = NULL; int lines = 0, overquota = 0; static time_t now; if (now == 0) time(&now); quplist = getprivs(id, type); for (qup = quplist; qup; qup = qup->next) { msgi = NULL; if (qup->dqblk.dqb_ihardlimit && qup->dqblk.dqb_curinodes >= qup->dqblk.dqb_ihardlimit) { overquota++; msgi = "File limit reached on"; } else if (qup->dqblk.dqb_isoftlimit && qup->dqblk.dqb_curinodes >= qup->dqblk.dqb_isoftlimit) { overquota++; if (qup->dqblk.dqb_itime > now) msgi = "In file grace period on"; else msgi = "Over file quota on"; } msgb = NULL; if (qup->dqblk.dqb_bhardlimit && qup->dqblk.dqb_curblocks >= qup->dqblk.dqb_bhardlimit) { overquota++; msgb = "Block limit reached on"; } else if (qup->dqblk.dqb_bsoftlimit && qup->dqblk.dqb_curblocks >= qup->dqblk.dqb_bsoftlimit) { overquota++; if (qup->dqblk.dqb_btime > now) msgb = "In block grace period on"; else msgb = "Over block quota on"; } if (rflag) { showrawquotas(type, id, qup); continue; } if (!vflag && qup->dqblk.dqb_isoftlimit == 0 && qup->dqblk.dqb_ihardlimit == 0 && qup->dqblk.dqb_bsoftlimit == 0 && qup->dqblk.dqb_bhardlimit == 0) continue; if (qflag) { if ((msgi != NULL || msgb != NULL) && lines++ == 0) heading(type, id, name, ""); if (msgi != NULL) printf("\t%s %s\n", msgi, qup->fsname); if (msgb != NULL) printf("\t%s %s\n", msgb, qup->fsname); continue; } if (!vflag && qup->dqblk.dqb_curblocks == 0 && qup->dqblk.dqb_curinodes == 0) continue; if (lines++ == 0) heading(type, id, name, ""); nam = qup->fsname; if (strlen(qup->fsname) > 15) { printf("%s\n", qup->fsname); nam = ""; } printf("%-15s", nam); if (hflag) { prthumanval(7, dbtob(qup->dqblk.dqb_curblocks)); printf("%c", (msgb == NULL) ? ' ' : '*'); prthumanval(7, dbtob(qup->dqblk.dqb_bsoftlimit)); prthumanval(7, dbtob(qup->dqblk.dqb_bhardlimit)); } else { printf(" %7ju%c %7ju %7ju", (uintmax_t)dbtob(qup->dqblk.dqb_curblocks) / 1024, (msgb == NULL) ? ' ' : '*', (uintmax_t)dbtob(qup->dqblk.dqb_bsoftlimit) / 1024, (uintmax_t)dbtob(qup->dqblk.dqb_bhardlimit) / 1024); } if (msgb != NULL) bgrace = timeprt(qup->dqblk.dqb_btime); if (msgi != NULL) igrace = timeprt(qup->dqblk.dqb_itime); printf("%8s %6ju%c %6ju %6ju%8s\n" , (msgb == NULL) ? "" : bgrace , (uintmax_t)qup->dqblk.dqb_curinodes , (msgi == NULL) ? ' ' : '*' , (uintmax_t)qup->dqblk.dqb_isoftlimit , (uintmax_t)qup->dqblk.dqb_ihardlimit , (msgi == NULL) ? "" : igrace ); if (msgb != NULL) free(bgrace); if (msgi != NULL) free(igrace); } if (!qflag && !rflag && lines == 0) heading(type, id, name, "none"); return (overquota); } static void showrawquotas(int type, u_long id, struct quotause *qup) { time_t t; printf("Raw %s quota information for id %lu on %s\n", type == USRQUOTA ? 
"user" : "group", id, qup->fsname); printf("block hard limit: %ju\n", (uintmax_t)qup->dqblk.dqb_bhardlimit); printf("block soft limit: %ju\n", (uintmax_t)qup->dqblk.dqb_bsoftlimit); printf("current block count: %ju\n", (uintmax_t)qup->dqblk.dqb_curblocks); printf("i-node hard limit: %ju\n", (uintmax_t)qup->dqblk.dqb_ihardlimit); printf("i-node soft limit: %ju\n", (uintmax_t)qup->dqblk.dqb_isoftlimit); printf("current i-node count: %ju\n", (uintmax_t)qup->dqblk.dqb_curinodes); printf("block grace time: %jd", (intmax_t)qup->dqblk.dqb_btime); if (qup->dqblk.dqb_btime != 0) { t = qup->dqblk.dqb_btime; printf(" %s", ctime(&t)); } else { printf("\n"); } printf("i-node grace time: %jd", (intmax_t)qup->dqblk.dqb_itime); if (qup->dqblk.dqb_itime != 0) { t = qup->dqblk.dqb_itime; printf(" %s", ctime(&t)); } else { printf("\n"); } } static void heading(int type, u_long id, const char *name, const char *tag) { printf("Disk quotas for %s %s (%cid %lu): %s\n", qfextension[type], name, *qfextension[type], id, tag); if (!qflag && tag[0] == '\0') { printf("%-15s %7s %8s %7s %7s %6s %7s %6s%8s\n" , "Filesystem" , "usage" , "quota" , "limit" , "grace" , "files" , "quota" , "limit" , "grace" ); } } /* * Calculate the grace period and return a printable string for it. */ static char * timeprt(int64_t seconds) { time_t hours, minutes; char *buf; static time_t now; if (now == 0) time(&now); if (now > seconds) { if ((buf = strdup("none")) == NULL) errx(1, "strdup() failed in timeprt()"); return (buf); } seconds -= now; minutes = (seconds + 30) / 60; hours = (minutes + 30) / 60; if (hours >= 36) { if (asprintf(&buf, "%lddays", ((long)hours + 12) / 24) < 0) errx(1, "asprintf() failed in timeprt(1)"); return (buf); } if (minutes >= 60) { if (asprintf(&buf, "%2ld:%ld", (long)minutes / 60, (long)minutes % 60) < 0) errx(1, "asprintf() failed in timeprt(2)"); return (buf); } if (asprintf(&buf, "%2ld", (long)minutes) < 0) errx(1, "asprintf() failed in timeprt(3)"); return (buf); } /* * Collect the requested quota information. */ static struct quotause * getprivs(long id, int quotatype) { struct quotause *qup, *quptail = NULL; struct fstab *fs; struct quotause *quphead; struct statfs *fst; int nfst, i; struct statfs sfb; qup = quphead = (struct quotause *)0; if (filename != NULL && statfs(filename, &sfb) != 0) err(1, "cannot statfs %s", filename); nfst = getmntinfo(&fst, MNT_NOWAIT); if (nfst == 0) errx(2, "no filesystems mounted!"); setfsent(); for (i = 0; i < nfst; i++) { if (qup == NULL) { if ((qup = (struct quotause *)malloc(sizeof *qup)) == NULL) errx(2, "out of memory"); } /* * See if the user requested a specific file system * or specified a file inside a mounted file system. */ if (filename != NULL && strcmp(sfb.f_mntonname, fst[i].f_mntonname) != 0) continue; if (strcmp(fst[i].f_fstypename, "nfs") == 0) { if (lflag) continue; if (getnfsquota(&fst[i], qup, id, quotatype) == 0) continue; } else if (strcmp(fst[i].f_fstypename, "ufs") == 0) { /* * XXX * UFS filesystems must be in /etc/fstab, and must * indicate that they have quotas on (?!) This is quite * unlike SunOS where quotas can be enabled/disabled * on a filesystem independent of /etc/fstab, and it * will still print quotas for them. 
*/ if ((fs = getfsspec(fst[i].f_mntfromname)) == NULL) continue; if (getufsquota(fs, qup, id, quotatype) == 0) continue; } else continue; strcpy(qup->fsname, fst[i].f_mntonname); if (quphead == NULL) quphead = qup; else quptail->next = qup; quptail = qup; quptail->next = 0; qup = NULL; } if (qup) free(qup); endfsent(); return (quphead); } /* * Check to see if a particular quota is available. */ static int getufsquota(struct fstab *fs, struct quotause *qup, long id, int quotatype) { struct quotafile *qf; if ((qf = quota_open(fs, quotatype, O_RDONLY)) == NULL) return (0); if (quota_read(qf, &qup->dqblk, id) != 0) return (0); quota_close(qf); return (1); } static int getnfsquota(struct statfs *fst, struct quotause *qup, long id, int quotatype) { struct ext_getquota_args gq_args; struct getquota_args old_gq_args; struct getquota_rslt gq_rslt; struct dqblk *dqp = &qup->dqblk; struct timeval tv; char *cp, host[NI_MAXHOST]; enum clnt_stat call_stat; if (fst->f_flags & MNT_LOCAL) return (0); /* * must be some form of "hostname:/path" */ cp = fst->f_mntfromname; do { cp = strrchr(cp, ':'); } while (cp != NULL && *(cp + 1) != '/'); if (cp == NULL) { warnx("cannot find hostname for %s", fst->f_mntfromname); return (0); } memset(host, 0, sizeof(host)); memcpy(host, fst->f_mntfromname, cp - fst->f_mntfromname); host[sizeof(host) - 1] = '\0'; /* Avoid attempting the RPC for special amd(8) filesystems. */ if (strncmp(fst->f_mntfromname, "pid", 3) == 0 && strchr(fst->f_mntfromname, '@') != NULL) return (0); gq_args.gqa_pathp = cp + 1; gq_args.gqa_id = id; gq_args.gqa_type = quotatype; call_stat = callaurpc(host, RQUOTAPROG, EXT_RQUOTAVERS, RQUOTAPROC_GETQUOTA, (xdrproc_t)xdr_ext_getquota_args, (char *)&gq_args, (xdrproc_t)xdr_getquota_rslt, (char *)&gq_rslt); - if (call_stat == RPC_PROGVERSMISMATCH) { + if (call_stat == RPC_PROGVERSMISMATCH || call_stat == RPC_PROGNOTREGISTERED) { if (quotatype == USRQUOTA) { old_gq_args.gqa_pathp = cp + 1; old_gq_args.gqa_uid = id; call_stat = callaurpc(host, RQUOTAPROG, RQUOTAVERS, RQUOTAPROC_GETQUOTA, (xdrproc_t)xdr_getquota_args, (char *)&old_gq_args, (xdrproc_t)xdr_getquota_rslt, (char *)&gq_rslt); } else { /* Old rpc quota does not support group type */ return (0); } } if (call_stat != 0) return (call_stat); switch (gq_rslt.status) { case Q_NOQUOTA: break; case Q_EPERM: warnx("quota permission error, host: %s", fst->f_mntfromname); break; case Q_OK: gettimeofday(&tv, NULL); /* blocks*/ dqp->dqb_bhardlimit = ((uint64_t)gq_rslt.getquota_rslt_u.gqr_rquota.rq_bhardlimit * gq_rslt.getquota_rslt_u.gqr_rquota.rq_bsize) / DEV_BSIZE; dqp->dqb_bsoftlimit = ((uint64_t)gq_rslt.getquota_rslt_u.gqr_rquota.rq_bsoftlimit * gq_rslt.getquota_rslt_u.gqr_rquota.rq_bsize) / DEV_BSIZE; dqp->dqb_curblocks = ((uint64_t)gq_rslt.getquota_rslt_u.gqr_rquota.rq_curblocks * gq_rslt.getquota_rslt_u.gqr_rquota.rq_bsize) / DEV_BSIZE; /* inodes */ dqp->dqb_ihardlimit = gq_rslt.getquota_rslt_u.gqr_rquota.rq_fhardlimit; dqp->dqb_isoftlimit = gq_rslt.getquota_rslt_u.gqr_rquota.rq_fsoftlimit; dqp->dqb_curinodes = gq_rslt.getquota_rslt_u.gqr_rquota.rq_curfiles; /* grace times */ dqp->dqb_btime = tv.tv_sec + gq_rslt.getquota_rslt_u.gqr_rquota.rq_btimeleft; dqp->dqb_itime = tv.tv_sec + gq_rslt.getquota_rslt_u.gqr_rquota.rq_ftimeleft; return (1); default: warnx("bad rpc result, host: %s", fst->f_mntfromname); break; } return (0); } static enum clnt_stat callaurpc(char *host, int prognum, int versnum, int procnum, xdrproc_t inproc, char *in, xdrproc_t outproc, char *out) { enum clnt_stat clnt_stat; struct 
timeval timeout, tottimeout; CLIENT *client = NULL; client = clnt_create(host, prognum, versnum, "udp"); if (client == NULL) return ((int)rpc_createerr.cf_stat); timeout.tv_usec = 0; timeout.tv_sec = 6; CLNT_CONTROL(client, CLSET_RETRY_TIMEOUT, (char *)(void *)&timeout); client->cl_auth = authunix_create_default(); tottimeout.tv_sec = 25; tottimeout.tv_usec = 0; clnt_stat = clnt_call(client, procnum, inproc, in, outproc, out, tottimeout); return (clnt_stat); } static int alldigits(char *s) { int c; c = *s++; do { if (!isdigit(c)) return (0); } while ((c = *s++)); return (1); } Index: projects/clang900-import/usr.bin/top/display.c =================================================================== --- projects/clang900-import/usr.bin/top/display.c (revision 352586) +++ projects/clang900-import/usr.bin/top/display.c (revision 352587) @@ -1,1379 +1,1354 @@ /* * Top users/processes display for Unix * Version 3 * * This program may be freely redistributed, * but this entire comment MUST remain intact. * * Copyright (c) 1984, 1989, William LeFebvre, Rice University * Copyright (c) 1989, 1990, 1992, William LeFebvre, Northwestern University * * $FreeBSD$ */ /* * This file contains the routines that display information on the screen. * Each section of the screen has two routines: one for initially writing * all constant and dynamic text, and one for only updating the text that * changes. The prefix "i_" is used on all the "initial" routines and the * prefix "u_" is used for all the "updating" routines. * * ASSUMPTIONS: * None of the "i_" routines use any of the termcap capabilities. * In this way, those routines can be safely used on terminals that * have minimal (or nonexistant) terminal capabilities. * * The routines are called in this order: *_loadave, i_timeofday, * *_procstates, *_cpustates, *_memory, *_message, *_header, * *_process, u_endscreen. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "screen.h" /* interface to screen package */ #include "layout.h" /* defines for screen position layout */ #include "display.h" #include "top.h" #include "machine.h" /* we should eliminate this!!! 
*/ #include "utils.h" #ifdef DEBUG FILE *debug; #endif static int lmpid = 0; static int last_hi = 0; /* used in u_process and u_endscreen */ static int lastline = 0; #define lineindex(l) ((l)*screen_width) /* things initialized by display_init and used thruout */ /* buffer of proc information lines for display updating */ static char *screenbuf = NULL; static const char * const *procstate_names; static const char * const *cpustate_names; static const char * const *memory_names; static const char * const *arc_names; static const char * const *carc_names; static const char * const *swap_names; static int num_procstates; static int num_cpustates; static int num_memory; static int num_swap; static int *lprocstates; static int *lcpustates; static int *lmemory; static int *lswap; static int num_cpus; static int *cpustate_columns; static int cpustate_total_length; static int cpustates_column; static enum { OFF, ON, ERASE } header_status = ON; static void summary_format(char *, int *, const char * const *); static void line_update(char *, char *, int, int); static int setup_buffer_bufsiz = 0; static char * setup_buffer(char *, int); int x_lastpid = 10; int y_lastpid = 0; int x_loadave = 33; int x_loadave_nompid = 15; int y_loadave = 0; int x_procstate = 0; int y_procstate = 1; int x_brkdn = 15; int y_brkdn = 1; int x_mem = 5; int y_mem = 3; int x_arc = 5; int y_arc = 4; int x_carc = 5; int y_carc = 5; int x_swap = 6; int y_swap = 4; int y_message = 5; int x_header = 0; int y_header = 6; int x_idlecursor = 0; int y_idlecursor = 5; int y_procs = 7; int y_cpustates = 2; int Header_lines = 7; int display_resize(void) { int lines; /* first, deallocate any previous buffer that may have been there */ if (screenbuf != NULL) { free(screenbuf); } /* calculate the current dimensions */ /* if operating in "dumb" mode, we only need one line */ lines = smart_terminal ? screen_length - Header_lines : 1; if (lines < 0) lines = 0; /* now, allocate space for the screen buffer */ screenbuf = calloc(lines, screen_width); if (screenbuf == NULL) { /* oops! */ return(-1); } /* return number of lines available */ /* for dumb terminals, pretend like we can show any amount */ return(smart_terminal ? 
lines : Largest); } int display_updatecpus(struct statics *statics) { int lines; int i; /* call resize to do the dirty work */ lines = display_resize(); if (pcpu_stats) num_cpus = statics->ncpus; else num_cpus = 1; cpustates_column = 5; /* CPU: */ if (num_cpus > 1) { cpustates_column += 1 + digits(num_cpus); /* CPU #: */ } /* fill the "last" array with all -1s, to insure correct updating */ for (i = 0; i < num_cpustates * num_cpus; ++i) { lcpustates[i] = -1; } return(lines); } int display_init(struct statics * statics) { int lines; const char * const *pp; int *ip; int i; lines = display_updatecpus(statics); /* only do the rest if we need to */ if (lines > -1) { /* save pointers and allocate space for names */ procstate_names = statics->procstate_names; num_procstates = 8; assert(num_procstates > 0); lprocstates = calloc(num_procstates, sizeof(int)); cpustate_names = statics->cpustate_names; swap_names = statics->swap_names; num_swap = 7; assert(num_swap > 0); lswap = calloc(num_swap, sizeof(int)); num_cpustates = CPUSTATES; assert(num_cpustates > 0); lcpustates = calloc(num_cpustates * sizeof(int), statics->ncpus); cpustate_columns = calloc(num_cpustates, sizeof(int)); memory_names = statics->memory_names; num_memory = 7; assert(num_memory > 0); lmemory = calloc(num_memory, sizeof(int)); arc_names = statics->arc_names; carc_names = statics->carc_names; /* calculate starting columns where needed */ cpustate_total_length = 0; pp = cpustate_names; ip = cpustate_columns; while (*pp != NULL) { *ip++ = cpustate_total_length; if ((i = strlen(*pp++)) > 0) { cpustate_total_length += i + 8; } } } /* return number of lines available */ return(lines); } void i_loadave(int mpid, double avenrun[]) { int i; /* i_loadave also clears the screen, since it is first */ top_clear(); /* mpid == -1 implies this system doesn't have an _mpid */ if (mpid != -1) { printf("last pid: %5d; ", mpid); } printf("load averages"); for (i = 0; i < 3; i++) { printf("%c %5.2f", i == 0 ? ':' : ',', avenrun[i]); } lmpid = mpid; } void u_loadave(int mpid, double *avenrun) { int i; if (mpid != -1) { /* change screen only when value has really changed */ if (mpid != lmpid) { Move_to(x_lastpid, y_lastpid); printf("%5d", mpid); lmpid = mpid; } /* i remembers x coordinate to move to */ i = x_loadave; } else { i = x_loadave_nompid; } /* move into position for load averages */ Move_to(i, y_loadave); /* display new load averages */ /* we should optimize this and only display changes */ for (i = 0; i < 3; i++) { printf("%s%5.2f", i == 0 ? "" : ", ", avenrun[i]); } } void i_timeofday(time_t *tod) { /* * Display the current time. * "ctime" always returns a string that looks like this: * * Sun Sep 16 01:03:52 1973 * 012345678901234567890123 * 1 2 * * We want indices 11 thru 18 (length 8). */ if (smart_terminal) { Move_to(screen_width - 8, 0); } else { fputs(" ", stdout); } #ifdef DEBUG { char *foo; foo = ctime(tod); fputs(foo, stdout); } #endif printf("%-8.8s\n", &(ctime(tod)[11])); lastline = 1; } static int ltotal = 0; static char *procstates_buffer = NULL; /* * *_procstates(total, brkdn, names) - print the process summary line * * Assumptions: cursor is at the beginning of the line on entry * lastline is valid */ void i_procstates(int total, int *brkdn) { int i; procstates_buffer = setup_buffer(procstates_buffer, 0); /* write current number of processes and remember the value */ printf("%d %s:", total, ps.thread ? 
"threads" : "processes"); ltotal = total; /* put out enough spaces to get to column 15 */ i = digits(total); while (i++ < (ps.thread ? 6 : 4)) { putchar(' '); } /* format and print the process state summary */ summary_format(procstates_buffer, brkdn, procstate_names); fputs(procstates_buffer, stdout); /* save the numbers for next time */ memcpy(lprocstates, brkdn, num_procstates * sizeof(int)); } void u_procstates(int total, int *brkdn) { static char *new = NULL; int i; new = setup_buffer(new, 0); /* update number of processes only if it has changed */ if (ltotal != total) { /* move and overwrite */ if (x_procstate == 0) { Move_to(x_procstate, y_procstate); } else { /* cursor is already there...no motion needed */ assert(lastline == 1); } printf("%d", total); /* if number of digits differs, rewrite the label */ if (digits(total) != digits(ltotal)) { printf(" %s:", ps.thread ? "threads" : "processes"); /* put out enough spaces to get to column 15 */ i = digits(total); while (i++ < (ps.thread ? 6 : 4)) { putchar(' '); } /* cursor may end up right where we want it!!! */ } /* save new total */ ltotal = total; } /* see if any of the state numbers has changed */ if (memcmp(lprocstates, brkdn, num_procstates * sizeof(int)) != 0) { /* format and update the line */ summary_format(new, brkdn, procstate_names); line_update(procstates_buffer, new, x_brkdn, y_brkdn); memcpy(lprocstates, brkdn, num_procstates * sizeof(int)); } } void i_cpustates(int *states) { int i = 0; int value; const char * const *names; const char *thisname; int *hstates = states; int cpu; for (cpu = 0; cpu < num_cpus; cpu++) { names = cpustate_names; /* print tag and bump lastline */ if (num_cpus == 1) printf("\nCPU: "); else { value = printf("\nCPU %d: ", cpu); while (value++ <= cpustates_column) printf(" "); } lastline++; /* now walk thru the names and print the line */ while ((thisname = *names++) != NULL) { if (*thisname != '\0') { /* retrieve the value and remember it */ value = *states++; /* if percentage is >= 1000, print it as 100% */ printf((value >= 1000 ? "%s%4.0f%% %s" : "%s%4.1f%% %s"), (i++ % num_cpustates) == 0 ? "" : ", ", ((float)value)/10., thisname); } } } /* copy over values into "last" array */ states = hstates; memcpy(lcpustates, states, num_cpustates * sizeof(int) * num_cpus); } void u_cpustates(int *states) { int value; const char * const *names; const char *thisname; int *hstates = states; int *lp; int *colp; int cpu; for (cpu = 0; cpu < num_cpus; cpu++) { names = cpustate_names; Move_to(cpustates_column, y_cpustates + cpu); lastline = y_cpustates + cpu; lp = lcpustates + (cpu * num_cpustates); colp = cpustate_columns; /* we could be much more optimal about this */ while ((thisname = *names++) != NULL) { if (*thisname != '\0') { /* did the value change since last time? */ if (*lp != *states) { /* yes, move and change */ Move_to(cpustates_column + *colp, y_cpustates + cpu); lastline = y_cpustates + cpu; /* retrieve value and remember it */ value = *states; /* if percentage is >= 1000, print it as 100% */ printf((value >= 1000 ? 
"%4.0f" : "%4.1f"), ((double)value)/10.); /* remember it for next time */ *lp = value; } } /* increment and move on */ lp++; states++; colp++; } } states = hstates; } void z_cpustates(void) { int i = 0; const char * const *names; const char *thisname; int cpu, value; for (cpu = 0; cpu < num_cpus; cpu++) { names = cpustate_names; /* show tag and bump lastline */ if (num_cpus == 1) printf("\nCPU: "); else { value = printf("\nCPU %d: ", cpu); while (value++ <= cpustates_column) printf(" "); } lastline++; while ((thisname = *names++) != NULL) { if (*thisname != '\0') { printf("%s %% %s", (i++ % num_cpustates) == 0 ? "" : ", ", thisname); } } } /* fill the "last" array with all -1s, to insure correct updating */ for (i = 0; i < num_cpustates * num_cpus; ++i) { lcpustates[i] = -1; } } /* * *_memory(stats) - print "Memory: " followed by the memory summary string * * Assumptions: cursor is on "lastline" * for i_memory ONLY: cursor is on the previous line */ static char *memory_buffer = NULL; void i_memory(int *stats) { memory_buffer = setup_buffer(memory_buffer, 0); fputs("\nMem: ", stdout); lastline++; /* format and print the memory summary */ summary_format(memory_buffer, stats, memory_names); fputs(memory_buffer, stdout); } void u_memory(int *stats) { static char *new = NULL; new = setup_buffer(new, 0); /* format the new line */ summary_format(new, stats, memory_names); line_update(memory_buffer, new, x_mem, y_mem); } /* * *_arc(stats) - print "ARC: " followed by the ARC summary string * * Assumptions: cursor is on "lastline" * for i_arc ONLY: cursor is on the previous line */ static char *arc_buffer = NULL; void i_arc(int *stats) { arc_buffer = setup_buffer(arc_buffer, 0); if (arc_names == NULL) return; fputs("\nARC: ", stdout); lastline++; /* format and print the memory summary */ summary_format(arc_buffer, stats, arc_names); fputs(arc_buffer, stdout); } void u_arc(int *stats) { static char *new = NULL; new = setup_buffer(new, 0); if (arc_names == NULL) return; /* format the new line */ summary_format(new, stats, arc_names); line_update(arc_buffer, new, x_arc, y_arc); } /* * *_carc(stats) - print "Compressed ARC: " followed by the summary string * * Assumptions: cursor is on "lastline" * for i_carc ONLY: cursor is on the previous line */ static char *carc_buffer = NULL; void i_carc(int *stats) { carc_buffer = setup_buffer(carc_buffer, 0); if (carc_names == NULL) return; fputs("\n ", stdout); lastline++; /* format and print the memory summary */ summary_format(carc_buffer, stats, carc_names); fputs(carc_buffer, stdout); } void u_carc(int *stats) { static char *new = NULL; new = setup_buffer(new, 0); if (carc_names == NULL) return; /* format the new line */ summary_format(new, stats, carc_names); line_update(carc_buffer, new, x_carc, y_carc); } /* * *_swap(stats) - print "Swap: " followed by the swap summary string * * Assumptions: cursor is on "lastline" * for i_swap ONLY: cursor is on the previous line */ static char *swap_buffer = NULL; void i_swap(int *stats) { swap_buffer = setup_buffer(swap_buffer, 0); if (swap_names == NULL) return; fputs("\nSwap: ", stdout); lastline++; /* format and print the swap summary */ summary_format(swap_buffer, stats, swap_names); fputs(swap_buffer, stdout); } void u_swap(int *stats) { static char *new = NULL; new = setup_buffer(new, 0); if (swap_names == NULL) return; /* format the new line */ summary_format(new, stats, swap_names); line_update(swap_buffer, new, x_swap, y_swap); } /* * *_message() - print the next pending message line, or erase the one * that 
is there. * * Note that u_message is (currently) the same as i_message. * * Assumptions: lastline is consistent */ /* * i_message is funny because it gets its message asynchronously (with * respect to screen updates). */ #define NEXT_MSG_ADDLEN 5 static char *next_msg = NULL; static int msglen = 0; /* Invariant: msglen is always the length of the message currently displayed on the screen (even when next_msg doesn't contain that message). */ void i_message(void) { next_msg = setup_buffer(next_msg, NEXT_MSG_ADDLEN); while (lastline < y_message) { fputc('\n', stdout); lastline++; } if (next_msg[0] != '\0') { top_standout(next_msg); msglen = strlen(next_msg); next_msg[0] = '\0'; } else if (msglen > 0) { (void) clear_eol(msglen); msglen = 0; } } void u_message(void) { i_message(); } static int header_length; /* * Trim a header string to the current display width and return a newly * allocated area with the trimmed header. */ char * trim_header(const char *text) { char *s; int width; s = NULL; width = screen_width; header_length = strlen(text); if (header_length >= width) { s = strndup(text, width); if (s == NULL) return (NULL); } return (s); } /* * *_header(text) - print the header for the process area * * Assumptions: cursor is on the previous line and lastline is consistent */ void i_header(const char *text) { char *s; s = trim_header(text); if (s != NULL) text = s; if (header_status == ON) { putchar('\n'); fputs(text, stdout); lastline++; } else if (header_status == ERASE) { header_status = OFF; } free(s); } void u_header(const char *text __unused) { if (header_status == ERASE) { putchar('\n'); lastline++; clear_eol(header_length); header_status = OFF; } } /* * *_process(line, thisline) - print one process line * * Assumptions: lastline is consistent */ void i_process(int line, char *thisline) { char *p; char *base; /* make sure we are on the correct line */ while (lastline < y_procs + line) { putchar('\n'); lastline++; } /* truncate the line to conform to our current screen width */ int len = strlen(thisline); if (screen_width < len) { thisline[screen_width] = '\0'; } /* write the line out */ fputs(thisline, stdout); /* copy it in to our buffer */ base = smart_terminal ? screenbuf + lineindex(line) : screenbuf; p = stpcpy(base, thisline); /* zero fill the rest of it */ if (p - base < screen_width) { memset(p, 0, screen_width - (p - base)); } } void u_process(int line, char *newline) { char *optr; int screen_line = line + Header_lines; char *bufferline; /* remember a pointer to the current line in the screen buffer */ bufferline = &screenbuf[lineindex(line)]; /* truncate the line to conform to our current screen width */ int len = strlen(newline); if (screen_width < len) { newline[screen_width] = '\0'; } /* is line higher than we went on the last display? 
*/ if (line >= last_hi) { /* yes, just ignore screenbuf and write it out directly */ /* get positioned on the correct line */ if (screen_line - lastline == 1) { putchar('\n'); lastline++; } else { Move_to(0, screen_line); lastline = screen_line; } /* now write the line */ fputs(newline, stdout); /* copy it in to the buffer */ optr = stpcpy(bufferline, newline); /* zero fill the rest of it */ if (optr - bufferline < screen_width) { memset(optr, 0, screen_width - (optr - bufferline)); } } else { line_update(bufferline, newline, 0, line + Header_lines); } } void u_endscreen(int hi) { int screen_line = hi + Header_lines; int i; if (smart_terminal) { if (hi < last_hi) { /* need to blank the remainder of the screen */ /* but only if there is any screen left below this line */ if (lastline + 1 < screen_length) { /* efficiently move to the end of currently displayed info */ if (screen_line - lastline < 5) { while (lastline < screen_line) { putchar('\n'); lastline++; } } else { Move_to(0, screen_line); lastline = screen_line; } if (clear_to_end) { /* we can do this the easy way */ putcap(clear_to_end); } else { /* use clear_eol on each line */ i = hi; while ((void) clear_eol(strlen(&screenbuf[lineindex(i++)])), i < last_hi) { putchar('\n'); } } } } last_hi = hi; /* move the cursor to a pleasant place */ Move_to(x_idlecursor, y_idlecursor); lastline = y_idlecursor; } else { /* separate this display from the next with some vertical room */ fputs("\n\n", stdout); } } void display_header(int t) { if (t) { header_status = ON; } else if (header_status == ON) { header_status = ERASE; } } void new_message(int type, const char *msgfmt, ...) { va_list args; size_t i; va_start(args, msgfmt); /* first, format the message */ vsnprintf(next_msg, setup_buffer_bufsiz + NEXT_MSG_ADDLEN, msgfmt, args); va_end(args); if (msglen > 0) { /* message there already -- can we clear it? */ if (!overstrike) { /* yes -- write it and clear to end */ i = strlen(next_msg); if ((type & MT_delayed) == 0) { if (type & MT_standout) { top_standout(next_msg); } else { fputs(next_msg, stdout); } clear_eol(msglen - i); msglen = i; next_msg[0] = '\0'; } } } else { if ((type & MT_delayed) == 0) { if (type & MT_standout) { top_standout(next_msg); } else { fputs(next_msg, stdout); } msglen = strlen(next_msg); next_msg[0] = '\0'; } } } void clear_message(void) { if (clear_eol(msglen) == 1) { putchar('\r'); } } int readline(char *buffer, int size, int numeric) { char *ptr = buffer; char ch; char cnt = 0; char maxcnt = 0; /* allow room for null terminator */ size -= 1; /* read loop */ while ((fflush(stdout), read(0, ptr, 1) > 0)) { /* newline means we are done */ if ((ch = *ptr) == '\n' || ch == '\r') { break; } /* handle special editing characters */ if (ch == ch_kill) { /* kill line -- account for overstriking */ if (overstrike) { msglen += maxcnt; } /* return null string */ *buffer = '\0'; putchar('\r'); return(-1); } else if (ch == ch_erase) { /* erase previous character */ if (cnt <= 0) { /* none to erase! 
*/ putchar('\7'); } else { fputs("\b \b", stdout); ptr--; cnt--; } } /* check for character validity and buffer overflow */ else if (cnt == size || (numeric && !isdigit(ch)) || !isprint(ch)) { /* not legal */ putchar('\7'); } else { /* echo it and store it in the buffer */ putchar(ch); ptr++; cnt++; if (cnt > maxcnt) { maxcnt = cnt; } } } /* all done -- null terminate the string */ *ptr = '\0'; /* account for the extra characters in the message area */ /* (if terminal overstrikes, remember the furthest they went) */ msglen += overstrike ? maxcnt : cnt; /* return either inputted number or string length */ putchar('\r'); return(cnt == 0 ? -1 : numeric ? atoi(buffer) : cnt); } /* internal support routines */ static void summary_format(char *str, int *numbers, const char * const *names) { char *p; int num; const char *thisname; char rbuf[6]; /* format each number followed by its string */ p = str; while ((thisname = *names++) != NULL) { /* get the number to format */ num = *numbers++; /* display only non-zero numbers */ if (num > 0) { /* is this number in kilobytes? */ if (thisname[0] == 'K') { /* yes: format it as a memory value */ p = stpcpy(p, format_k(num)); /* skip over the K, since it was included by format_k */ p = stpcpy(p, thisname+1); } /* is this number a ratio? */ else if (thisname[0] == ':') { (void) snprintf(rbuf, sizeof(rbuf), "%.2f", (float)*(numbers - 2) / (float)num); p = stpcpy(p, rbuf); p = stpcpy(p, thisname); } else { p = stpcpy(p, itoa(num)); p = stpcpy(p, thisname); } } /* ignore negative numbers, but display corresponding string */ else if (num < 0) { p = stpcpy(p, thisname); } } /* if the last two characters in the string are ", ", delete them */ p -= 2; if (p >= str && p[0] == ',' && p[1] == ' ') { *p = '\0'; } } static void line_update(char *old, char *new, int start, int line) { int ch; int diff; int newcol = start + 1; int lastcol = start; char cursor_on_line = false; char *current; /* compare the two strings and only rewrite what has changed */ current = old; #ifdef DEBUG fprintf(debug, "line_update, starting at %d\n", start); fputs(old, debug); fputc('\n', debug); fputs(new, debug); fputs("\n-\n", debug); #endif /* start things off on the right foot */ /* this is to make sure the invariants get set up right */ if ((ch = *new++) != *old) { if (line - lastline == 1 && start == 0) { putchar('\n'); } else { Move_to(start, line); } cursor_on_line = true; putchar(ch); *old = ch; lastcol = start + 1; } old++; /* * main loop -- check each character. If the old and new aren't the * same, then update the display. When the distance from the * current cursor position to the new change is small enough, * the characters that belong there are written to move the * cursor over. * * Invariants: * lastcol is the column where the cursor currently is sitting * (always one beyond the end of the last mismatch). 
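line_update() below rewrites only the characters that changed since the last refresh: when the next difference is close to the cursor it re-emits the unchanged characters in between, otherwise it addresses the cursor directly. A simplified, self-contained sketch of that decision, with an ANSI cursor-position escape standing in for the termcap-driven Move_to():

#include <stdio.h>

/*
 * Differential line update, simplified.  Both strings are assumed to be
 * the same width (top zero-fills its screen buffer to screen_width); the
 * full screen-buffer bookkeeping is omitted.
 */
static void
update_line(const char *old, const char *new, int row)
{
    int col, cursor = -1;   /* column just past the last character written */

    for (col = 0; new[col] != '\0'; col++) {
        if (old[col] == new[col])
            continue;
        if (cursor >= 0 && col - cursor < 6) {
            /* close enough: overwrite the gap from the old line */
            printf("%.*s", col - cursor, old + cursor);
        } else {
            /* too far (or first change): address the cursor */
            printf("\033[%d;%dH", row + 1, col + 1);
        }
        putchar(new[col]);
        cursor = col + 1;
    }
}

int
main(void)
{
    update_line("load averages: 0.52, 0.41, 0.30",
                "load averages: 0.61, 0.43, 0.30", 0);
    putchar('\n');
    return (0);
}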
*/ do /* yes, a do...while */ { if ((ch = *new++) != *old) { /* new character is different from old */ /* make sure the cursor is on top of this character */ diff = newcol - lastcol; if (diff > 0) { /* some motion is required--figure out which is shorter */ if (diff < 6 && cursor_on_line) { /* overwrite old stuff--get it out of the old buffer */ printf("%.*s", diff, ¤t[lastcol-start]); } else { /* use cursor addressing */ Move_to(newcol, line); cursor_on_line = true; } /* remember where the cursor is */ lastcol = newcol + 1; } else { /* already there, update position */ lastcol++; } /* write what we need to */ if (ch == '\0') { /* at the end--terminate with a clear-to-end-of-line */ (void) clear_eol(strlen(old)); } else { /* write the new character */ putchar(ch); } /* put the new character in the screen buffer */ *old = ch; } /* update working column and screen buffer pointer */ newcol++; old++; } while (ch != '\0'); /* zero out the rest of the line buffer -- MUST BE DONE! */ diff = screen_width - newcol; if (diff > 0) { memset(old, 0, diff); } /* remember where the current line is */ if (cursor_on_line) { lastline = line; } } -/* - * printable(str) - make the string pointed to by "str" into one that is - * printable (i.e.: all ascii), by converting all non-printable - * characters into '?'. Replacements are done in place and a pointer - * to the original buffer is returned. - */ - -char * -printable(char str[]) -{ - char *ptr; - char ch; - - ptr = str; - while ((ch = *ptr) != '\0') - { - if (!isprint(ch)) - { - *ptr = '?'; - } - ptr++; - } - return(str); -} - void i_uptime(struct timeval *bt, time_t *tod) { time_t uptime; int days, hrs, mins, secs; if (bt->tv_sec != -1) { uptime = *tod - bt->tv_sec; days = uptime / 86400; uptime %= 86400; hrs = uptime / 3600; uptime %= 3600; mins = uptime / 60; secs = uptime % 60; /* * Display the uptime. */ if (smart_terminal) { Move_to((screen_width - 24) - (days > 9 ? 1 : 0), 0); } else { fputs(" ", stdout); } printf(" up %d+%02d:%02d:%02d", days, hrs, mins, secs); } } #define SETUPBUFFER_MIN_SCREENWIDTH 80 #define SETUPBUFFER_REQUIRED_ADDBUFSIZ 2 static char * setup_buffer(char *buffer, int addlen) { size_t len, old_len; char *new_buffer; setup_buffer_bufsiz = screen_width; if (setup_buffer_bufsiz < SETUPBUFFER_MIN_SCREENWIDTH) { setup_buffer_bufsiz = SETUPBUFFER_MIN_SCREENWIDTH; } len = setup_buffer_bufsiz + addlen + SETUPBUFFER_REQUIRED_ADDBUFSIZ; new_buffer = calloc(len, sizeof(char)); if (new_buffer == NULL) { errx(4, "can't allocate sufficient memory"); } if (buffer != NULL) { old_len = strlen(buffer); memcpy(new_buffer, buffer, old_len < len - 1 ? 
old_len : len - 1); free(buffer); } return new_buffer; } Index: projects/clang900-import/usr.bin/top/display.h =================================================================== --- projects/clang900-import/usr.bin/top/display.h (revision 352586) +++ projects/clang900-import/usr.bin/top/display.h (revision 352587) @@ -1,42 +1,41 @@ /* $FreeBSD$ */ /* constants needed for display.c */ #define MT_standout 1 #define MT_delayed 2 #include struct statics; int display_updatecpus(struct statics *statics); void clear_message(void); int display_resize(void); void i_header(const char *text); -char *printable(char *string); void display_header(int t); int display_init(struct statics *statics); void i_arc(int *stats); void i_carc(int *stats); void i_cpustates(int *states); void i_loadave(int mpid, double *avenrun); void i_memory(int *stats); void i_message(void); void i_process(int line, char *thisline); void i_procstates(int total, int *brkdn); void i_swap(int *stats); void i_timeofday(time_t *tod); void i_uptime(struct timeval *bt, time_t *tod); void new_message(int type, const char *msgfmt, ...); int readline(char *buffer, int size, int numeric); char *trim_header(const char *text); void u_arc(int *stats); void u_carc(int *stats); void u_cpustates(int *states); void u_endscreen(int hi); void u_header(const char *text); void u_loadave(int mpid, double *avenrun); void u_memory(int *stats); void u_message(void); void u_process(int line, char *newline); void u_procstates(int total, int *brkdn); void u_swap(int *stats); void z_cpustates(void); Index: projects/clang900-import/usr.bin/top/machine.c =================================================================== --- projects/clang900-import/usr.bin/top/machine.c (revision 352586) +++ projects/clang900-import/usr.bin/top/machine.c (revision 352587) @@ -1,1561 +1,1561 @@ /* * top - a top users display for Unix * * DESCRIPTION: * Originally written for BSD4.4 system by Christos Zoulas. * Ported to FreeBSD 2.x by Steven Wallace && Wolfram Schneider * Order support hacked in from top-3.5beta6/machine/m_aix41.c * by Monte Mitzelfelt (for latest top see http://www.groupsys.com/topinfo/) * * AUTHOR: Christos Zoulas * Steven Wallace * Wolfram Schneider * Thomas Moestl * Eitan Adler * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "top.h" #include "display.h" #include "machine.h" #include "loadavg.h" #include "screen.h" #include "utils.h" #include "layout.h" #define GETSYSCTL(name, var) getsysctl(name, &(var), sizeof(var)) extern struct timeval timeout; static int smpmode; enum displaymodes displaymode; static const int namelength = 10; /* TOP_JID_LEN based on max of 999999 */ #define TOP_JID_LEN 6 #define TOP_SWAP_LEN 5 /* get_process_info passes back a handle. This is what it looks like: */ struct handle { struct kinfo_proc **next_proc; /* points to next valid proc pointer */ int remaining; /* number of pointers remaining */ }; /* define what weighted cpu is. */ #define weighted_cpu(pct, pp) ((pp)->ki_swtime == 0 ? 
0.0 : \ ((pct) / (1.0 - exp((pp)->ki_swtime * logcpu)))) /* what we consider to be process size: */ #define PROCSIZE(pp) ((pp)->ki_size / 1024) #define RU(pp) (&(pp)->ki_rusage) #define PCTCPU(pp) (pcpu[pp - pbase]) /* process state names for the "STATE" column of the display */ /* the extra nulls in the string "run" are for adding a slash and the processor number when needed */ static const char *state_abbrev[] = { "", "START", "RUN\0\0\0", "SLEEP", "STOP", "ZOMB", "WAIT", "LOCK" }; static kvm_t *kd; /* values that we stash away in _init and use in later routines */ static double logcpu; /* these are retrieved from the kernel in _init */ static load_avg ccpu; /* these are used in the get_ functions */ static int lastpid; /* these are for calculating cpu state percentages */ static long cp_time[CPUSTATES]; static long cp_old[CPUSTATES]; static long cp_diff[CPUSTATES]; /* these are for detailing the process states */ static const char *procstatenames[] = { "", " starting, ", " running, ", " sleeping, ", " stopped, ", " zombie, ", " waiting, ", " lock, ", NULL }; static int process_states[nitems(procstatenames)]; /* these are for detailing the cpu states */ static int cpu_states[CPUSTATES]; static const char *cpustatenames[] = { "user", "nice", "system", "interrupt", "idle", NULL }; /* these are for detailing the memory statistics */ static const char *memorynames[] = { "K Active, ", "K Inact, ", "K Laundry, ", "K Wired, ", "K Buf, ", "K Free", NULL }; static int memory_stats[nitems(memorynames)]; static const char *arcnames[] = { "K Total, ", "K MFU, ", "K MRU, ", "K Anon, ", "K Header, ", "K Other", NULL }; static int arc_stats[nitems(arcnames)]; static const char *carcnames[] = { "K Compressed, ", "K Uncompressed, ", ":1 Ratio, ", NULL }; static int carc_stats[nitems(carcnames)]; static const char *swapnames[] = { "K Total, ", "K Used, ", "K Free, ", "% Inuse, ", "K In, ", "K Out", NULL }; static int swap_stats[nitems(swapnames)]; static int has_swap; /* these are for keeping track of the proc array */ static int nproc; static int onproc = -1; static int pref_len; static struct kinfo_proc *pbase; static struct kinfo_proc **pref; static struct kinfo_proc *previous_procs; static struct kinfo_proc **previous_pref; static int previous_proc_count = 0; static int previous_proc_count_max = 0; static int previous_thread; /* data used for recalculating pctcpu */ static double *pcpu; static struct timespec proc_uptime; static struct timeval proc_wall_time; static struct timeval previous_wall_time; static uint64_t previous_interval = 0; /* total number of io operations */ static long total_inblock; static long total_oublock; static long total_majflt; /* these are for getting the memory statistics */ static int arc_enabled; static int carc_enabled; static int pageshift; /* log base 2 of the pagesize */ /* define pagetok in terms of pageshift */ #define pagetok(size) ((size) << pageshift) /* swap usage */ #define ki_swap(kip) \ ((kip)->ki_swrss > (kip)->ki_rssize ? (kip)->ki_swrss - (kip)->ki_rssize : 0) /* * Sorting orders. The first element is the default. 
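The weighted_cpu() macro above divides the raw percentage by (1 - exp(ki_swtime * log(ccpu))), i.e. by 1 - ccpu^swtime, so recently started processes are extrapolated upward while long-resident ones converge to their raw value. A numeric illustration; 0.95 is only an assumed, typical magnitude for kern.ccpu after fixed-point conversion, not a value read from a kernel:

#include <math.h>
#include <stdio.h>

int
main(void)
{
    double ccpu = 0.95;     /* assumed magnitude of kern.ccpu, converted */
    double logcpu = log(ccpu);
    double pct = 0.10;      /* 10% raw CPU */
    int swtimes[] = { 5, 20, 60 }, i;

    for (i = 0; i < 3; i++)
        printf("swtime=%2ds  wcpu=%5.1f%%\n", swtimes[i],
            100.0 * pct / (1.0 - exp(swtimes[i] * logcpu)));
    return (0);
}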
*/ static const char *ordernames[] = { "cpu", "size", "res", "time", "pri", "threads", "total", "read", "write", "fault", "vcsw", "ivcsw", "jid", "swap", "pid", NULL }; /* Per-cpu time states */ static int maxcpu; static int maxid; static int ncpus; static unsigned long cpumask; static long *times; static long *pcpu_cp_time; static long *pcpu_cp_old; static long *pcpu_cp_diff; static int *pcpu_cpu_states; static int compare_swap(const void *a, const void *b); static int compare_jid(const void *a, const void *b); static int compare_pid(const void *a, const void *b); static int compare_tid(const void *a, const void *b); static const char *format_nice(const struct kinfo_proc *pp); static void getsysctl(const char *name, void *ptr, size_t len); static int swapmode(int *retavail, int *retfree); static void update_layout(void); static int find_uid(uid_t needle, int *haystack); static int find_uid(uid_t needle, int *haystack) { size_t i = 0; for (; i < TOP_MAX_UIDS; ++i) if ((uid_t)haystack[i] == needle) return 1; return (0); } void toggle_pcpustats(void) { if (ncpus == 1) return; update_layout(); } /* Adjust display based on ncpus and the ARC state. */ static void update_layout(void) { y_mem = 3; y_arc = 4; y_carc = 5; y_swap = 3 + arc_enabled + carc_enabled + has_swap; y_idlecursor = 4 + arc_enabled + carc_enabled + has_swap; y_message = 4 + arc_enabled + carc_enabled + has_swap; y_header = 5 + arc_enabled + carc_enabled + has_swap; y_procs = 6 + arc_enabled + carc_enabled + has_swap; Header_lines = 6 + arc_enabled + carc_enabled + has_swap; if (pcpu_stats) { y_mem += ncpus - 1; y_arc += ncpus - 1; y_carc += ncpus - 1; y_swap += ncpus - 1; y_idlecursor += ncpus - 1; y_message += ncpus - 1; y_header += ncpus - 1; y_procs += ncpus - 1; Header_lines += ncpus - 1; } } int machine_init(struct statics *statics) { int i, j, empty, pagesize; uint64_t arc_size; int carc_en, nswapdev; size_t size; size = sizeof(smpmode); if ((sysctlbyname("machdep.smp_active", &smpmode, &size, NULL, 0) != 0 && sysctlbyname("kern.smp.active", &smpmode, &size, NULL, 0) != 0) || size != sizeof(smpmode)) smpmode = 0; size = sizeof(arc_size); if (sysctlbyname("kstat.zfs.misc.arcstats.size", &arc_size, &size, NULL, 0) == 0 && arc_size != 0) arc_enabled = 1; size = sizeof(carc_en); if (arc_enabled && sysctlbyname("vfs.zfs.compressed_arc_enabled", &carc_en, &size, NULL, 0) == 0 && carc_en == 1) carc_enabled = 1; kd = kvm_open(NULL, _PATH_DEVNULL, NULL, O_RDONLY, "kvm_open"); if (kd == NULL) return (-1); size = sizeof(nswapdev); if (sysctlbyname("vm.nswapdev", &nswapdev, &size, NULL, 0) == 0 && nswapdev != 0) has_swap = 1; GETSYSCTL("kern.ccpu", ccpu); /* this is used in calculating WCPU -- calculate it ahead of time */ logcpu = log(loaddouble(ccpu)); pbase = NULL; pref = NULL; pcpu = NULL; nproc = 0; onproc = -1; /* get the page size and calculate pageshift from it */ pagesize = getpagesize(); pageshift = 0; while (pagesize > 1) { pageshift++; pagesize >>= 1; } /* we only need the amount of log(2)1024 for our conversion */ pageshift -= LOG1024; /* fill in the statics information */ statics->procstate_names = procstatenames; statics->cpustate_names = cpustatenames; statics->memory_names = memorynames; if (arc_enabled) statics->arc_names = arcnames; else statics->arc_names = NULL; if (carc_enabled) statics->carc_names = carcnames; else statics->carc_names = NULL; if (has_swap) statics->swap_names = swapnames; else statics->swap_names = NULL; statics->order_names = ordernames; /* Allocate state for per-CPU stats. 
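machine_init() above derives pageshift from getpagesize() and subtracts LOG1024 so that pagetok() turns a page count directly into kilobytes. The same derivation as a standalone sketch:

#include <stdio.h>
#include <unistd.h>

/*
 * Sketch of the pageshift/pagetok arithmetic: log2(page size) minus
 * log2(1024) gives a shift that converts a page count straight to KiB.
 */
int
main(void)
{
    int pagesize = getpagesize();
    int pageshift = 0;

    while (pagesize > 1) {
        pageshift++;
        pagesize >>= 1;
    }
    pageshift -= 10;        /* LOG1024 */

    /* e.g. with 4 KiB pages: pageshift == 2, so 25000 pages -> 100000 KiB */
    printf("pageshift=%d, 25000 pages = %ld KiB\n", pageshift,
        25000L << pageshift);
    return (0);
}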
*/ cpumask = 0; ncpus = 0; GETSYSCTL("kern.smp.maxcpus", maxcpu); times = calloc(maxcpu * CPUSTATES, sizeof(long)); if (times == NULL) err(1, "calloc for kern.smp.maxcpus"); size = sizeof(long) * maxcpu * CPUSTATES; if (sysctlbyname("kern.cp_times", times, &size, NULL, 0) == -1) err(1, "sysctlbyname kern.cp_times"); pcpu_cp_time = calloc(1, size); maxid = (size / CPUSTATES / sizeof(long)) - 1; for (i = 0; i <= maxid; i++) { empty = 1; for (j = 0; empty && j < CPUSTATES; j++) { if (times[i * CPUSTATES + j] != 0) empty = 0; } if (!empty) { cpumask |= (1ul << i); ncpus++; } } assert(ncpus > 0); pcpu_cp_old = calloc(ncpus * CPUSTATES, sizeof(long)); pcpu_cp_diff = calloc(ncpus * CPUSTATES, sizeof(long)); pcpu_cpu_states = calloc(ncpus * CPUSTATES, sizeof(int)); statics->ncpus = ncpus; update_layout(); /* all done! */ return (0); } char * format_header(const char *uname_field) { static struct sbuf* header = NULL; /* clean up from last time. */ if (header != NULL) { sbuf_clear(header); } else { header = sbuf_new_auto(); } switch (displaymode) { case DISP_CPU: { sbuf_printf(header, " %s", ps.thread_id ? " THR" : "PID"); sbuf_printf(header, "%*s", ps.jail ? TOP_JID_LEN : 0, ps.jail ? " JID" : ""); sbuf_printf(header, " %-*.*s ", namelength, namelength, uname_field); if (!ps.thread) { sbuf_cat(header, "THR "); } sbuf_cat(header, "PRI NICE SIZE RES "); if (ps.swap) { sbuf_printf(header, "%*s ", TOP_SWAP_LEN - 1, "SWAP"); } sbuf_cat(header, "STATE "); if (smpmode) { sbuf_cat(header, "C "); } sbuf_cat(header, "TIME "); sbuf_printf(header, " %6s ", ps.wcpu ? "WCPU" : "CPU"); sbuf_cat(header, "COMMAND"); sbuf_finish(header); break; } case DISP_IO: { sbuf_printf(header, " %s%*s %-*.*s", ps.thread_id ? " THR" : "PID", ps.jail ? TOP_JID_LEN : 0, ps.jail ? " JID" : "", namelength, namelength, uname_field); sbuf_cat(header, " VCSW IVCSW READ WRITE FAULT TOTAL PERCENT COMMAND"); sbuf_finish(header); break; } case DISP_MAX: assert("displaymode must not be set to DISP_MAX"); } return sbuf_data(header); } static int swappgsin = -1; static int swappgsout = -1; void get_system_info(struct system_info *si) { struct loadavg sysload; int mib[2]; struct timeval boottime; uint64_t arc_stat, arc_stat2; int i, j; size_t size; /* get the CPU stats */ size = (maxid + 1) * CPUSTATES * sizeof(long); if (sysctlbyname("kern.cp_times", pcpu_cp_time, &size, NULL, 0) == -1) err(1, "sysctlbyname kern.cp_times"); GETSYSCTL("kern.cp_time", cp_time); GETSYSCTL("vm.loadavg", sysload); GETSYSCTL("kern.lastpid", lastpid); /* convert load averages to doubles */ for (i = 0; i < 3; i++) si->load_avg[i] = (double)sysload.ldavg[i] / sysload.fscale; /* convert cp_time counts to percentages */ for (i = j = 0; i <= maxid; i++) { if ((cpumask & (1ul << i)) == 0) continue; percentages(CPUSTATES, &pcpu_cpu_states[j * CPUSTATES], &pcpu_cp_time[j * CPUSTATES], &pcpu_cp_old[j * CPUSTATES], &pcpu_cp_diff[j * CPUSTATES]); j++; } percentages(CPUSTATES, cpu_states, cp_time, cp_old, cp_diff); /* sum memory & swap statistics */ { static unsigned int swap_delay = 0; static int swapavail = 0; static int swapfree = 0; static long bufspace = 0; static uint64_t nspgsin, nspgsout; GETSYSCTL("vfs.bufspace", bufspace); GETSYSCTL("vm.stats.vm.v_active_count", memory_stats[0]); GETSYSCTL("vm.stats.vm.v_inactive_count", memory_stats[1]); GETSYSCTL("vm.stats.vm.v_laundry_count", memory_stats[2]); GETSYSCTL("vm.stats.vm.v_wire_count", memory_stats[3]); GETSYSCTL("vm.stats.vm.v_free_count", memory_stats[5]); GETSYSCTL("vm.stats.vm.v_swappgsin", nspgsin); 
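The swap "In"/"Out" figures are built from the cumulative v_swappgsin/v_swappgsout counters: the previous sample is remembered and only the difference is reported, with the first interval treated as zero. A FreeBSD-specific sketch of that snapshot/delta pattern (the 4 KiB page size in the conversion is an assumption for the example only):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    uint64_t cur, prev = 0;
    size_t len;
    int first = 1, i;

    for (i = 0; i < 3; i++) {
        len = sizeof(cur);
        if (sysctlbyname("vm.stats.vm.v_swappgsin", &cur, &len,
            NULL, 0) == -1 || len != sizeof(cur)) {
            perror("sysctlbyname");
            return (1);
        }
        /* cumulative counter: report growth since the previous sample */
        printf("pages swapped in since last sample: %llu (%llu KiB)\n",
            first ? 0ULL : (unsigned long long)(cur - prev),
            first ? 0ULL : (unsigned long long)(cur - prev) * 4);
        prev = cur;
        first = 0;
        sleep(1);
    }
    return (0);
}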
GETSYSCTL("vm.stats.vm.v_swappgsout", nspgsout); /* convert memory stats to Kbytes */ memory_stats[0] = pagetok(memory_stats[0]); memory_stats[1] = pagetok(memory_stats[1]); memory_stats[2] = pagetok(memory_stats[2]); memory_stats[3] = pagetok(memory_stats[3]); memory_stats[4] = bufspace / 1024; memory_stats[5] = pagetok(memory_stats[5]); memory_stats[6] = -1; /* first interval */ if (swappgsin < 0) { swap_stats[4] = 0; swap_stats[5] = 0; } /* compute differences between old and new swap statistic */ else { swap_stats[4] = pagetok(((nspgsin - swappgsin))); swap_stats[5] = pagetok(((nspgsout - swappgsout))); } swappgsin = nspgsin; swappgsout = nspgsout; /* call CPU heavy swapmode() only for changes */ if (swap_stats[4] > 0 || swap_stats[5] > 0 || swap_delay == 0) { swap_stats[3] = swapmode(&swapavail, &swapfree); swap_stats[0] = swapavail; swap_stats[1] = swapavail - swapfree; swap_stats[2] = swapfree; } swap_delay = 1; swap_stats[6] = -1; } if (arc_enabled) { GETSYSCTL("kstat.zfs.misc.arcstats.size", arc_stat); arc_stats[0] = arc_stat >> 10; GETSYSCTL("vfs.zfs.mfu_size", arc_stat); arc_stats[1] = arc_stat >> 10; GETSYSCTL("vfs.zfs.mru_size", arc_stat); arc_stats[2] = arc_stat >> 10; GETSYSCTL("vfs.zfs.anon_size", arc_stat); arc_stats[3] = arc_stat >> 10; GETSYSCTL("kstat.zfs.misc.arcstats.hdr_size", arc_stat); GETSYSCTL("kstat.zfs.misc.arcstats.l2_hdr_size", arc_stat2); arc_stats[4] = (arc_stat + arc_stat2) >> 10; GETSYSCTL("kstat.zfs.misc.arcstats.bonus_size", arc_stat); arc_stats[5] = arc_stat >> 10; GETSYSCTL("kstat.zfs.misc.arcstats.dnode_size", arc_stat); arc_stats[5] += arc_stat >> 10; GETSYSCTL("kstat.zfs.misc.arcstats.dbuf_size", arc_stat); arc_stats[5] += arc_stat >> 10; si->arc = arc_stats; } if (carc_enabled) { GETSYSCTL("kstat.zfs.misc.arcstats.compressed_size", arc_stat); carc_stats[0] = arc_stat >> 10; carc_stats[2] = arc_stat >> 10; /* For ratio */ GETSYSCTL("kstat.zfs.misc.arcstats.uncompressed_size", arc_stat); carc_stats[1] = arc_stat >> 10; si->carc = carc_stats; } /* set arrays and strings */ if (pcpu_stats) { si->cpustates = pcpu_cpu_states; si->ncpus = ncpus; } else { si->cpustates = cpu_states; si->ncpus = 1; } si->memory = memory_stats; si->swap = swap_stats; if (lastpid > 0) { si->last_pid = lastpid; } else { si->last_pid = -1; } /* * Print how long system has been up. * (Found by looking getting "boottime" from the kernel) */ mib[0] = CTL_KERN; mib[1] = KERN_BOOTTIME; size = sizeof(boottime); if (sysctl(mib, nitems(mib), &boottime, &size, NULL, 0) != -1 && boottime.tv_sec != 0) { si->boottime = boottime; } else { si->boottime.tv_sec = -1; } } #define NOPROC ((void *)-1) /* * We need to compare data from the old process entry with the new * process entry. * To facilitate doing this quickly we stash a pointer in the kinfo_proc * structure to cache the mapping. We also use a negative cache pointer * of NOPROC to avoid duplicate lookups. * XXX: this could be done when the actual processes are fetched, we do * it here out of laziness. */ static const struct kinfo_proc * get_old_proc(struct kinfo_proc *pp) { const struct kinfo_proc * const *oldpp, *oldp; /* * If this is the first fetch of the kinfo_procs then we don't have * any previous entries. */ if (previous_proc_count == 0) return (NULL); /* negative cache? */ if (pp->ki_udata == NOPROC) return (NULL); /* cached? */ if (pp->ki_udata != NULL) return (pp->ki_udata); /* * Not cached, * 1) look up based on pid. * 2) compare process start. * If we fail here, then setup a negative cache entry, otherwise * cache it. 
*/ oldpp = bsearch(&pp, previous_pref, previous_proc_count, sizeof(*previous_pref), ps.thread ? compare_tid : compare_pid); if (oldpp == NULL) { pp->ki_udata = NOPROC; return (NULL); } oldp = *oldpp; if (memcmp(&oldp->ki_start, &pp->ki_start, sizeof(pp->ki_start)) != 0) { pp->ki_udata = NOPROC; return (NULL); } pp->ki_udata = __DECONST(void *, oldp); return (oldp); } /* * Return the total amount of IO done in blocks in/out and faults. * store the values individually in the pointers passed in. */ static long get_io_stats(const struct kinfo_proc *pp, long *inp, long *oup, long *flp, long *vcsw, long *ivcsw) { const struct kinfo_proc *oldp; static struct kinfo_proc dummy; long ret; oldp = get_old_proc(__DECONST(struct kinfo_proc *, pp)); if (oldp == NULL) { memset(&dummy, 0, sizeof(dummy)); oldp = &dummy; } *inp = RU(pp)->ru_inblock - RU(oldp)->ru_inblock; *oup = RU(pp)->ru_oublock - RU(oldp)->ru_oublock; *flp = RU(pp)->ru_majflt - RU(oldp)->ru_majflt; *vcsw = RU(pp)->ru_nvcsw - RU(oldp)->ru_nvcsw; *ivcsw = RU(pp)->ru_nivcsw - RU(oldp)->ru_nivcsw; ret = (RU(pp)->ru_inblock - RU(oldp)->ru_inblock) + (RU(pp)->ru_oublock - RU(oldp)->ru_oublock) + (RU(pp)->ru_majflt - RU(oldp)->ru_majflt); return (ret); } /* * If there was a previous update, use the delta in ki_runtime over * the previous interval to calculate pctcpu. Otherwise, fall back * to using the kernel's ki_pctcpu. */ static double proc_calc_pctcpu(struct kinfo_proc *pp) { const struct kinfo_proc *oldp; if (previous_interval != 0) { oldp = get_old_proc(pp); if (oldp != NULL) return ((double)(pp->ki_runtime - oldp->ki_runtime) / previous_interval); /* * If this process/thread was created during the previous * interval, charge it's total runtime to the previous * interval. */ else if (pp->ki_start.tv_sec > previous_wall_time.tv_sec || (pp->ki_start.tv_sec == previous_wall_time.tv_sec && pp->ki_start.tv_usec >= previous_wall_time.tv_usec)) return ((double)pp->ki_runtime / previous_interval); } return (pctdouble(pp->ki_pctcpu)); } /* * Return true if this process has used any CPU time since the * previous update. */ static int proc_used_cpu(struct kinfo_proc *pp) { const struct kinfo_proc *oldp; oldp = get_old_proc(pp); if (oldp == NULL) return (PCTCPU(pp) != 0); return (pp->ki_runtime != oldp->ki_runtime || RU(pp)->ru_nvcsw != RU(oldp)->ru_nvcsw || RU(pp)->ru_nivcsw != RU(oldp)->ru_nivcsw); } /* * Return the total number of block in/out and faults by a process. */ static long get_io_total(const struct kinfo_proc *pp) { long dummy; return (get_io_stats(pp, &dummy, &dummy, &dummy, &dummy, &dummy)); } static struct handle handle; void * get_process_info(struct system_info *si, struct process_select *sel, int (*compare)(const void *, const void *)) { int i; int total_procs; long p_io; long p_inblock, p_oublock, p_majflt, p_vcsw, p_ivcsw; long nsec; int active_procs; struct kinfo_proc **prefp; struct kinfo_proc *pp; struct timespec previous_proc_uptime; /* * If thread state was toggled, don't cache the previous processes. */ if (previous_thread != sel->thread) nproc = 0; previous_thread = sel->thread; /* * Save the previous process info. 
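proc_calc_pctcpu() divides the growth in ki_runtime by the elapsed uptime interval, both in microseconds, and falls back to the kernel's ki_pctcpu only when there is no previous sample. A small numeric illustration with made-up figures:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint64_t runtime_prev = 12 * 1000000;           /* 12 s of CPU so far */
    uint64_t runtime_now  = 12 * 1000000 + 480000;  /* +0.48 s this pass */
    uint64_t interval_us  = 2 * 1000000;            /* 2 s between samples */

    /* fraction of one CPU used during the interval */
    printf("%.1f%% of a CPU\n",
        100.0 * (runtime_now - runtime_prev) / interval_us);
    return (0);
}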
*/ if (previous_proc_count_max < nproc) { free(previous_procs); previous_procs = calloc(nproc, sizeof(*previous_procs)); free(previous_pref); previous_pref = calloc(nproc, sizeof(*previous_pref)); if (previous_procs == NULL || previous_pref == NULL) { fprintf(stderr, "top: Out of memory.\n"); quit(TOP_EX_SYS_ERROR); } previous_proc_count_max = nproc; } if (nproc) { for (i = 0; i < nproc; i++) previous_pref[i] = &previous_procs[i]; memcpy(previous_procs, pbase, nproc * sizeof(*previous_procs)); qsort(previous_pref, nproc, sizeof(*previous_pref), ps.thread ? compare_tid : compare_pid); } previous_proc_count = nproc; previous_proc_uptime = proc_uptime; previous_wall_time = proc_wall_time; previous_interval = 0; pbase = kvm_getprocs(kd, sel->thread ? KERN_PROC_ALL : KERN_PROC_PROC, 0, &nproc); gettimeofday(&proc_wall_time, NULL); if (clock_gettime(CLOCK_UPTIME, &proc_uptime) != 0) memset(&proc_uptime, 0, sizeof(proc_uptime)); else if (previous_proc_uptime.tv_sec != 0 && previous_proc_uptime.tv_nsec != 0) { previous_interval = (proc_uptime.tv_sec - previous_proc_uptime.tv_sec) * 1000000; nsec = proc_uptime.tv_nsec - previous_proc_uptime.tv_nsec; if (nsec < 0) { previous_interval -= 1000000; nsec += 1000000000; } previous_interval += nsec / 1000; } if (nproc > onproc) { pref = realloc(pref, sizeof(*pref) * nproc); pcpu = realloc(pcpu, sizeof(*pcpu) * nproc); onproc = nproc; } if (pref == NULL || pbase == NULL || pcpu == NULL) { fprintf(stderr, "top: Out of memory.\n"); quit(TOP_EX_SYS_ERROR); } /* get a pointer to the states summary array */ si->procstates = process_states; /* count up process states and get pointers to interesting procs */ total_procs = 0; active_procs = 0; total_inblock = 0; total_oublock = 0; total_majflt = 0; memset(process_states, 0, sizeof(process_states)); prefp = pref; for (pp = pbase, i = 0; i < nproc; pp++, i++) { if (pp->ki_stat == 0) /* not in use */ continue; if (!sel->self && pp->ki_pid == mypid && sel->pid == -1) /* skip self */ continue; if (!sel->system && (pp->ki_flag & P_SYSTEM) && sel->pid == -1) /* skip system process */ continue; p_io = get_io_stats(pp, &p_inblock, &p_oublock, &p_majflt, &p_vcsw, &p_ivcsw); total_inblock += p_inblock; total_oublock += p_oublock; total_majflt += p_majflt; total_procs++; process_states[(unsigned char)pp->ki_stat]++; if (pp->ki_stat == SZOMB) /* skip zombies */ continue; if (!sel->kidle && pp->ki_tdflags & TDF_IDLETD && sel->pid == -1) /* skip kernel idle process */ continue; PCTCPU(pp) = proc_calc_pctcpu(pp); if (sel->thread && PCTCPU(pp) > 1.0) PCTCPU(pp) = 1.0; if (displaymode == DISP_CPU && !sel->idle && (!proc_used_cpu(pp) || pp->ki_stat == SSTOP || pp->ki_stat == SIDL)) /* skip idle or non-running processes */ continue; if (displaymode == DISP_IO && !sel->idle && p_io == 0) /* skip processes that aren't doing I/O */ continue; if (sel->jid != -1 && pp->ki_jid != sel->jid) /* skip proc. that don't belong to the selected JID */ continue; if (sel->uid[0] != -1 && !find_uid(pp->ki_ruid, sel->uid)) /* skip proc. 
that don't belong to the selected UID */ continue; if (sel->pid != -1 && pp->ki_pid != sel->pid) continue; *prefp++ = pp; active_procs++; } /* if requested, sort the "interesting" processes */ if (compare != NULL) qsort(pref, active_procs, sizeof(*pref), compare); /* remember active and total counts */ si->p_total = total_procs; si->p_pactive = pref_len = active_procs; /* pass back a handle */ handle.next_proc = pref; handle.remaining = active_procs; return (&handle); } char * format_next_process(struct handle * xhandle, char *(*get_userid)(int), int flags) { struct kinfo_proc *pp; const struct kinfo_proc *oldp; long cputime; char status[22]; size_t state; struct rusage ru, *rup; long p_tot, s_tot; char *cmdbuf = NULL; char **args; static struct sbuf* procbuf = NULL; /* clean up from last time. */ if (procbuf != NULL) { sbuf_clear(procbuf); } else { procbuf = sbuf_new_auto(); } /* find and remember the next proc structure */ pp = *(xhandle->next_proc++); xhandle->remaining--; /* get the process's command name */ if ((pp->ki_flag & P_INMEM) == 0) { /* * Print swapped processes as */ size_t len; len = strlen(pp->ki_comm); if (len > sizeof(pp->ki_comm) - 3) len = sizeof(pp->ki_comm) - 3; memmove(pp->ki_comm + 1, pp->ki_comm, len); pp->ki_comm[0] = '<'; pp->ki_comm[len + 1] = '>'; pp->ki_comm[len + 2] = '\0'; } /* * Convert the process's runtime from microseconds to seconds. This * time includes the interrupt time although that is not wanted here. * ps(1) is similarly sloppy. */ cputime = (pp->ki_runtime + 500000) / 1000000; /* generate "STATE" field */ switch (state = pp->ki_stat) { case SRUN: if (smpmode && pp->ki_oncpu != NOCPU) sprintf(status, "CPU%d", pp->ki_oncpu); else strcpy(status, "RUN"); break; case SLOCK: if (pp->ki_kiflag & KI_LOCKBLOCK) { sprintf(status, "*%.6s", pp->ki_lockname); break; } /* fall through */ case SSLEEP: sprintf(status, "%.6s", pp->ki_wmesg); break; default: if (state < nitems(state_abbrev)) { sprintf(status, "%.6s", state_abbrev[state]); } else { sprintf(status, "?%5zu", state); } break; } cmdbuf = calloc(screen_width + 1, 1); if (cmdbuf == NULL) { warn("calloc(%d)", screen_width + 1); return NULL; } if (!(flags & FMT_SHOWARGS)) { if (ps.thread && pp->ki_flag & P_HADTHREADS && pp->ki_tdname[0]) { snprintf(cmdbuf, screen_width, "%s{%s%s}", pp->ki_comm, pp->ki_tdname, pp->ki_moretdname); } else { snprintf(cmdbuf, screen_width, "%s", pp->ki_comm); } } else { if (pp->ki_flag & P_SYSTEM || (args = kvm_getargv(kd, pp, screen_width)) == NULL || !(*args)) { if (ps.thread && pp->ki_flag & P_HADTHREADS && pp->ki_tdname[0]) { snprintf(cmdbuf, screen_width, "[%s{%s%s}]", pp->ki_comm, pp->ki_tdname, pp->ki_moretdname); } else { snprintf(cmdbuf, screen_width, "[%s]", pp->ki_comm); } } else { const char *src; char *dst, *argbuf; const char *cmd; size_t argbuflen; size_t len; argbuflen = screen_width * 4; argbuf = calloc(argbuflen + 1, 1); if (argbuf == NULL) { warn("calloc(%zu)", argbuflen + 1); free(cmdbuf); return NULL; } dst = argbuf; /* Extract cmd name from argv */ cmd = basename(*args); for (; (src = *args++) != NULL; ) { if (*src == '\0') continue; len = (argbuflen - (dst - argbuf) - 1) / 4; strvisx(dst, src, MIN(strlen(src), len), - VIS_NL | VIS_CSTYLE); + VIS_NL | VIS_CSTYLE | VIS_OCTAL | VIS_SAFE); while (*dst != '\0') dst++; if ((argbuflen - (dst - argbuf) - 1) / 4 > 0) *dst++ = ' '; /* add delimiting space */ } if (dst != argbuf && dst[-1] == ' ') dst--; *dst = '\0'; if (strcmp(cmd, pp->ki_comm) != 0) { if (ps.thread && pp->ki_flag & P_HADTHREADS && 
pp->ki_tdname[0]) snprintf(cmdbuf, screen_width, "%s (%s){%s%s}", argbuf, pp->ki_comm, pp->ki_tdname, pp->ki_moretdname); else snprintf(cmdbuf, screen_width, "%s (%s)", argbuf, pp->ki_comm); } else { if (ps.thread && pp->ki_flag & P_HADTHREADS && pp->ki_tdname[0]) snprintf(cmdbuf, screen_width, "%s{%s%s}", argbuf, pp->ki_tdname, pp->ki_moretdname); else strlcpy(cmdbuf, argbuf, screen_width); } free(argbuf); } } if (displaymode == DISP_IO) { oldp = get_old_proc(pp); if (oldp != NULL) { ru.ru_inblock = RU(pp)->ru_inblock - RU(oldp)->ru_inblock; ru.ru_oublock = RU(pp)->ru_oublock - RU(oldp)->ru_oublock; ru.ru_majflt = RU(pp)->ru_majflt - RU(oldp)->ru_majflt; ru.ru_nvcsw = RU(pp)->ru_nvcsw - RU(oldp)->ru_nvcsw; ru.ru_nivcsw = RU(pp)->ru_nivcsw - RU(oldp)->ru_nivcsw; rup = &ru; } else { rup = RU(pp); } p_tot = rup->ru_inblock + rup->ru_oublock + rup->ru_majflt; s_tot = total_inblock + total_oublock + total_majflt; sbuf_printf(procbuf, "%5d ", (ps.thread_id) ? pp->ki_tid : pp->ki_pid); if (ps.jail) { sbuf_printf(procbuf, "%*d ", TOP_JID_LEN - 1, pp->ki_jid); } sbuf_printf(procbuf, "%-*.*s", namelength, namelength, (*get_userid)(pp->ki_ruid)); sbuf_printf(procbuf, "%6ld ", rup->ru_nvcsw); sbuf_printf(procbuf, "%6ld ", rup->ru_nivcsw); sbuf_printf(procbuf, "%6ld ", rup->ru_inblock); sbuf_printf(procbuf, "%6ld ", rup->ru_oublock); sbuf_printf(procbuf, "%6ld ", rup->ru_majflt); sbuf_printf(procbuf, "%6ld ", p_tot); sbuf_printf(procbuf, "%6.2f%% ", s_tot == 0 ? 0.0 : (p_tot * 100.0 / s_tot)); } else { sbuf_printf(procbuf, "%5d ", (ps.thread_id) ? pp->ki_tid : pp->ki_pid); if (ps.jail) { sbuf_printf(procbuf, "%*d ", TOP_JID_LEN - 1, pp->ki_jid); } sbuf_printf(procbuf, "%-*.*s ", namelength, namelength, (*get_userid)(pp->ki_ruid)); if (!ps.thread) { sbuf_printf(procbuf, "%4d ", pp->ki_numthreads); } else { sbuf_printf(procbuf, " "); } sbuf_printf(procbuf, "%3d ", pp->ki_pri.pri_level - PZERO); sbuf_printf(procbuf, "%4s", format_nice(pp)); sbuf_printf(procbuf, "%7s ", format_k(PROCSIZE(pp))); sbuf_printf(procbuf, "%6s ", format_k(pagetok(pp->ki_rssize))); if (ps.swap) { sbuf_printf(procbuf, "%*s ", TOP_SWAP_LEN - 1, format_k(pagetok(ki_swap(pp)))); } sbuf_printf(procbuf, "%-6.6s ", status); if (smpmode) { int cpu; if (state == SRUN && pp->ki_oncpu != NOCPU) { cpu = pp->ki_oncpu; } else { cpu = pp->ki_lastcpu; } sbuf_printf(procbuf, "%3d ", cpu); } sbuf_printf(procbuf, "%6s ", format_time(cputime)); sbuf_printf(procbuf, "%6.2f%% ", ps.wcpu ? 100.0 * weighted_cpu(PCTCPU(pp), pp) : 100.0 * PCTCPU(pp)); } - sbuf_printf(procbuf, "%s", printable(cmdbuf)); + sbuf_printf(procbuf, "%s", cmdbuf); free(cmdbuf); return (sbuf_data(procbuf)); } static void getsysctl(const char *name, void *ptr, size_t len) { size_t nlen = len; if (sysctlbyname(name, ptr, &nlen, NULL, 0) == -1) { fprintf(stderr, "top: sysctl(%s...) failed: %s\n", name, strerror(errno)); quit(TOP_EX_SYS_ERROR); } if (nlen != len) { fprintf(stderr, "top: sysctl(%s...) expected %lu, got %lu\n", name, (unsigned long)len, (unsigned long)nlen); quit(TOP_EX_SYS_ERROR); } } static const char * format_nice(const struct kinfo_proc *pp) { const char *fifo, *kproc; int rtpri; static char nicebuf[4 + 1]; fifo = PRI_NEED_RR(pp->ki_pri.pri_class) ? "" : "F"; kproc = (pp->ki_flag & P_KPROC) ? 
"k" : ""; switch (PRI_BASE(pp->ki_pri.pri_class)) { case PRI_ITHD: return ("-"); case PRI_REALTIME: /* * XXX: the kernel doesn't tell us the original rtprio and * doesn't really know what it was, so to recover it we * must be more chummy with the implementation than the * implementation is with itself. pri_user gives a * constant "base" priority, but is only initialized * properly for user threads. pri_native gives what the * kernel calls the "base" priority, but it isn't constant * since it is changed by priority propagation. pri_native * also isn't properly initialized for all threads, but it * is properly initialized for kernel realtime and idletime * threads. Thus we use pri_user for the base priority of * user threads (it is always correct) and pri_native for * the base priority of kernel realtime and idletime threads * (there is nothing better, and it is usually correct). * * The field width and thus the buffer are too small for * values like "kr31F", but such values shouldn't occur, * and if they do then the tailing "F" is not displayed. */ rtpri = ((pp->ki_flag & P_KPROC) ? pp->ki_pri.pri_native : pp->ki_pri.pri_user) - PRI_MIN_REALTIME; snprintf(nicebuf, sizeof(nicebuf), "%sr%d%s", kproc, rtpri, fifo); break; case PRI_TIMESHARE: if (pp->ki_flag & P_KPROC) return ("-"); snprintf(nicebuf, sizeof(nicebuf), "%d", pp->ki_nice - NZERO); break; case PRI_IDLE: /* XXX: as above. */ rtpri = ((pp->ki_flag & P_KPROC) ? pp->ki_pri.pri_native : pp->ki_pri.pri_user) - PRI_MIN_IDLE; snprintf(nicebuf, sizeof(nicebuf), "%si%d%s", kproc, rtpri, fifo); break; default: return ("?"); } return (nicebuf); } /* comparison routines for qsort */ static int compare_pid(const void *p1, const void *p2) { const struct kinfo_proc * const *pp1 = p1; const struct kinfo_proc * const *pp2 = p2; assert((*pp2)->ki_pid >= 0 && (*pp1)->ki_pid >= 0); return ((*pp1)->ki_pid - (*pp2)->ki_pid); } static int compare_tid(const void *p1, const void *p2) { const struct kinfo_proc * const *pp1 = p1; const struct kinfo_proc * const *pp2 = p2; assert((*pp2)->ki_tid >= 0 && (*pp1)->ki_tid >= 0); return ((*pp1)->ki_tid - (*pp2)->ki_tid); } /* * proc_compare - comparison function for "qsort" * Compares the resource consumption of two processes using five * distinct keys. The keys (in descending order of importance) are: * percent cpu, cpu ticks, state, resident set size, total virtual * memory usage. The process states are ordered as follows (from least * to most important): WAIT, zombie, sleep, stop, start, run. The * array declaration below maps a process state index into a number * that reflects this ordering. */ static int sorted_state[] = { 0, /* not used */ 3, /* sleep */ 1, /* ABANDONED (WAIT) */ 6, /* run */ 5, /* start */ 2, /* zombie */ 4 /* stop */ }; #define ORDERKEY_PCTCPU(a, b) do { \ double diff; \ if (ps.wcpu) \ diff = weighted_cpu(PCTCPU((b)), (b)) - \ weighted_cpu(PCTCPU((a)), (a)); \ else \ diff = PCTCPU((b)) - PCTCPU((a)); \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_CPTICKS(a, b) do { \ int64_t diff = (int64_t)(b)->ki_runtime - (int64_t)(a)->ki_runtime; \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_STATE(a, b) do { \ int diff = sorted_state[(unsigned char)(b)->ki_stat] - sorted_state[(unsigned char)(a)->ki_stat]; \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_PRIO(a, b) do { \ int diff = (int)(b)->ki_pri.pri_level - (int)(a)->ki_pri.pri_level; \ if (diff != 0) \ return (diff > 0 ? 
1 : -1); \ } while (0) #define ORDERKEY_THREADS(a, b) do { \ int diff = (int)(b)->ki_numthreads - (int)(a)->ki_numthreads; \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_RSSIZE(a, b) do { \ long diff = (long)(b)->ki_rssize - (long)(a)->ki_rssize; \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_MEM(a, b) do { \ long diff = (long)PROCSIZE((b)) - (long)PROCSIZE((a)); \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_JID(a, b) do { \ int diff = (int)(b)->ki_jid - (int)(a)->ki_jid; \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_SWAP(a, b) do { \ int diff = (int)ki_swap(b) - (int)ki_swap(a); \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) /* compare_cpu - the comparison function for sorting by cpu percentage */ static int compare_cpu(const void *arg1, const void *arg2) { const struct kinfo_proc *p1 = *(const struct kinfo_proc * const *)arg1; const struct kinfo_proc *p2 = *(const struct kinfo_proc * const *)arg2; ORDERKEY_PCTCPU(p1, p2); ORDERKEY_CPTICKS(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_PRIO(p1, p2); ORDERKEY_RSSIZE(p1, p2); ORDERKEY_MEM(p1, p2); return (0); } /* compare_size - the comparison function for sorting by total memory usage */ static int compare_size(const void *arg1, const void *arg2) { const struct kinfo_proc *p1 = *(const struct kinfo_proc * const *)arg1; const struct kinfo_proc *p2 = *(const struct kinfo_proc * const *)arg2; ORDERKEY_MEM(p1, p2); ORDERKEY_RSSIZE(p1, p2); ORDERKEY_PCTCPU(p1, p2); ORDERKEY_CPTICKS(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_PRIO(p1, p2); return (0); } /* compare_res - the comparison function for sorting by resident set size */ static int compare_res(const void *arg1, const void *arg2) { const struct kinfo_proc *p1 = *(const struct kinfo_proc * const *)arg1; const struct kinfo_proc *p2 = *(const struct kinfo_proc * const *)arg2; ORDERKEY_RSSIZE(p1, p2); ORDERKEY_MEM(p1, p2); ORDERKEY_PCTCPU(p1, p2); ORDERKEY_CPTICKS(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_PRIO(p1, p2); return (0); } /* compare_time - the comparison function for sorting by total cpu time */ static int compare_time(const void *arg1, const void *arg2) { const struct kinfo_proc *p1 = *(const struct kinfo_proc * const *)arg1; const struct kinfo_proc *p2 = *(const struct kinfo_proc * const *) arg2; ORDERKEY_CPTICKS(p1, p2); ORDERKEY_PCTCPU(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_PRIO(p1, p2); ORDERKEY_RSSIZE(p1, p2); ORDERKEY_MEM(p1, p2); return (0); } /* compare_prio - the comparison function for sorting by priority */ static int compare_prio(const void *arg1, const void *arg2) { const struct kinfo_proc *p1 = *(const struct kinfo_proc * const *)arg1; const struct kinfo_proc *p2 = *(const struct kinfo_proc * const *)arg2; ORDERKEY_PRIO(p1, p2); ORDERKEY_CPTICKS(p1, p2); ORDERKEY_PCTCPU(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_RSSIZE(p1, p2); ORDERKEY_MEM(p1, p2); return (0); } /* compare_threads - the comparison function for sorting by threads */ static int compare_threads(const void *arg1, const void *arg2) { const struct kinfo_proc *p1 = *(const struct kinfo_proc * const *)arg1; const struct kinfo_proc *p2 = *(const struct kinfo_proc * const *)arg2; ORDERKEY_THREADS(p1, p2); ORDERKEY_PCTCPU(p1, p2); ORDERKEY_CPTICKS(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_PRIO(p1, p2); ORDERKEY_RSSIZE(p1, p2); ORDERKEY_MEM(p1, p2); return (0); } /* compare_jid - the comparison function for sorting by jid */ static int compare_jid(const void *arg1, const void 
*arg2) { const struct kinfo_proc *p1 = *(const struct kinfo_proc * const *)arg1; const struct kinfo_proc *p2 = *(const struct kinfo_proc * const *)arg2; ORDERKEY_JID(p1, p2); ORDERKEY_PCTCPU(p1, p2); ORDERKEY_CPTICKS(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_PRIO(p1, p2); ORDERKEY_RSSIZE(p1, p2); ORDERKEY_MEM(p1, p2); return (0); } /* compare_swap - the comparison function for sorting by swap */ static int compare_swap(const void *arg1, const void *arg2) { const struct kinfo_proc *p1 = *(const struct kinfo_proc * const *)arg1; const struct kinfo_proc *p2 = *(const struct kinfo_proc * const *)arg2; ORDERKEY_SWAP(p1, p2); ORDERKEY_PCTCPU(p1, p2); ORDERKEY_CPTICKS(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_PRIO(p1, p2); ORDERKEY_RSSIZE(p1, p2); ORDERKEY_MEM(p1, p2); return (0); } /* assorted comparison functions for sorting by i/o */ static int compare_iototal(const void *arg1, const void *arg2) { const struct kinfo_proc * const p1 = *(const struct kinfo_proc * const *)arg1; const struct kinfo_proc * const p2 = *(const struct kinfo_proc * const *)arg2; return (get_io_total(p2) - get_io_total(p1)); } static int compare_ioread(const void *arg1, const void *arg2) { const struct kinfo_proc *p1 = *(const struct kinfo_proc * const *)arg1; const struct kinfo_proc *p2 = *(const struct kinfo_proc * const *)arg2; long dummy, inp1, inp2; (void) get_io_stats(p1, &inp1, &dummy, &dummy, &dummy, &dummy); (void) get_io_stats(p2, &inp2, &dummy, &dummy, &dummy, &dummy); return (inp2 - inp1); } static int compare_iowrite(const void *arg1, const void *arg2) { const struct kinfo_proc *p1 = *(const struct kinfo_proc * const *)arg1; const struct kinfo_proc *p2 = *(const struct kinfo_proc * const *)arg2; long dummy, oup1, oup2; (void) get_io_stats(p1, &dummy, &oup1, &dummy, &dummy, &dummy); (void) get_io_stats(p2, &dummy, &oup2, &dummy, &dummy, &dummy); return (oup2 - oup1); } static int compare_iofault(const void *arg1, const void *arg2) { const struct kinfo_proc *p1 = *(const struct kinfo_proc * const *)arg1; const struct kinfo_proc *p2 = *(const struct kinfo_proc * const *)arg2; long dummy, flp1, flp2; (void) get_io_stats(p1, &dummy, &dummy, &flp1, &dummy, &dummy); (void) get_io_stats(p2, &dummy, &dummy, &flp2, &dummy, &dummy); return (flp2 - flp1); } static int compare_vcsw(const void *arg1, const void *arg2) { const struct kinfo_proc *p1 = *(const struct kinfo_proc * const *)arg1; const struct kinfo_proc *p2 = *(const struct kinfo_proc * const *)arg2; long dummy, flp1, flp2; (void) get_io_stats(p1, &dummy, &dummy, &dummy, &flp1, &dummy); (void) get_io_stats(p2, &dummy, &dummy, &dummy, &flp2, &dummy); return (flp2 - flp1); } static int compare_ivcsw(const void *arg1, const void *arg2) { const struct kinfo_proc *p1 = *(const struct kinfo_proc * const *)arg1; const struct kinfo_proc *p2 = *(const struct kinfo_proc * const *)arg2; long dummy, flp1, flp2; (void) get_io_stats(p1, &dummy, &dummy, &dummy, &dummy, &flp1); (void) get_io_stats(p2, &dummy, &dummy, &dummy, &dummy, &flp2); return (flp2 - flp1); } int (*compares[])(const void *arg1, const void *arg2) = { compare_cpu, compare_size, compare_res, compare_time, compare_prio, compare_threads, compare_iototal, compare_ioread, compare_iowrite, compare_iofault, compare_vcsw, compare_ivcsw, compare_jid, compare_swap, NULL }; static int swapmode(int *retavail, int *retfree) { int n; struct kvm_swap swapary[1]; static int pagesize = 0; static unsigned long swap_maxpages = 0; *retavail = 0; *retfree = 0; #define CONVERT(v) ((quad_t)(v) * pagesize / 1024) n = 
kvm_getswapinfo(kd, swapary, 1, 0); if (n < 0 || swapary[0].ksw_total == 0) return (0); if (pagesize == 0) pagesize = getpagesize(); if (swap_maxpages == 0) GETSYSCTL("vm.swap_maxpages", swap_maxpages); /* ksw_total contains the total size of swap all devices which may exceed the maximum swap size allocatable in the system */ if ( swapary[0].ksw_total > swap_maxpages ) swapary[0].ksw_total = swap_maxpages; *retavail = CONVERT(swapary[0].ksw_total); *retfree = CONVERT(swapary[0].ksw_total - swapary[0].ksw_used); #undef CONVERT n = (int)(swapary[0].ksw_used * 100.0 / swapary[0].ksw_total); return (n); } Index: projects/clang900-import/usr.bin/top/top.1 =================================================================== --- projects/clang900-import/usr.bin/top/top.1 (revision 352586) +++ projects/clang900-import/usr.bin/top/top.1 (revision 352587) @@ -1,445 +1,454 @@ .\" $FreeBSD$ -.Dd October 2, 2018 +.Dd September 21, 2019 .Dt TOP 1 .Os .Sh NAME .Nm top .Nd display and update information about the top cpu processes .Sh SYNOPSIS .Nm .Op Fl CHIPSTabijnpqtuvxz .Op Fl J Ar jail .Op Fl U Ar uid .Op Fl d Ar count .Op Fl m Ar cpu|io .Op Fl s Ar time .Op Fl o Ar field .Op Fl p Ar pid .Op Ar count .Sh DESCRIPTION .Nm displays the top processes on the system and periodically updates this information. If standard output is an intelligent terminal (see below) then as many processes as will fit on the terminal screen are displayed by default. Otherwise, a good number of them are shown (around 20). Raw cpu percentage is used to rank the processes. If .Ar number is given, then the top .Ar number processes will be displayed instead of the default. .Pp .Nm makes a distinction between terminals that support advanced capabilities and those that do not. This distinction affects the choice of defaults for certain options. In the remainder of this document, an \*(lqintelligent\*(rq terminal is one that supports cursor addressing, clear screen, and clear to end of line. Conversely, a \*(lqdumb\*(rq terminal is one that does not support such features. If the output of .Nm is redirected to a file, it acts as if it were being run on a dumb terminal. .Bl -tag -width indent -compact .It Fl C Toggle CPU display mode. By default top displays the weighted CPU percentage in the WCPU column (this is the same value that .Xr ps 1 displays as CPU). Each time .Fl C flag is passed it toggles between \*(lqraw cpu\*(rq mode and \*(lqweighted cpu\*(rq mode, showing the \*(lqCPU\*(rq or the \*(lqWCPU\*(rq column respectively. .It Fl S Show system processes in the display. Normally, system processes such as the pager and the swapper are not shown. This option makes them visible. .It Fl a Display command names derived from the argv[] vector, rather than real executable name. It it useful when you want to watch applications, that puts their status information there. If the real name differs from argv[0], it will be displayed in parenthesis. +Non-printable characters in the command line are +encoded in C-style backslash sequences or +a three digit octal sequences. .It Fl b Use \*(lqbatch\*(rq mode. In this mode, all input from the terminal is ignored. Interrupt characters (such as ^C and ^\e) still have an effect. This is the default on a dumb terminal, or when the output is not a terminal. .It Fl H Display each thread for a multithreaded process individually. By default a single summary line is displayed for each process. .It Fl i Use \*(lqinteractive\*(rq mode. In this mode, any input is immediately read for processing. 
See the section on \*(lqInteractive Mode\*(rq for an explanation of which keys perform what functions. After the command is processed, the screen will immediately be updated, even if the command was not understood. This mode is the default when standard output is an intelligent terminal. .It Fl I Do not display idle processes. By default, top displays both active and idle processes. .It Fl j Display the .Xr jail 8 ID. .It Fl T Toggle displaying thread ID (tid) instead of process id (pid). .It Fl t Do not display the .Nm process itself. .It Fl display Display either 'cpu' or 'io' statistics. Default is 'cpu'. .It Fl n Use \*(lqnon-interactive\*(rq mode. This is identical to \*(lqbatch\*(rq mode. .It Fl P Display per-cpu CPU usage statistics. .It Fl q Renice .Nm to -20 so that it will run faster. This can be used when the system is being very sluggish to improve the possibility of discovering the problem. This option can only be used by root. .It Fl u Do not map uid numbers to usernames. Normally, .Nm will read as much of the file \*(lq/etc/passwd\*(rq as is necessary to map all the user id numbers it encounters into login names. This option disables all that, while possibly decreasing execution time. The uid numbers are displayed instead of the names. .It Fl v Write version number information to stderr then exit immediately. .It Fl w Display approximate swap usage for each process. .It Fl z Do not display the system idle process. .It Fl d Ar count Show only .Ar count displays, then exit. A display is considered to be one update of the screen. The default is 1 for dumb terminals. Note that for .Ar count = 1 no information is available about the percentage of time spent by the CPU in every state. .It Fl s Ar time Set the delay between screen updates to .Ar time seconds. The default delay between updates is 1 second. .It Fl o Ar field Sort the process display area on the specified field. The field name is the name of the column as seen in the output, but in lower case: \*(lqcpu\*(lq, \*(rqsize\*(lq, \*(rqres\*(lq, \*(rqtime\*(lq, \*(rqpri\*(lq, \*(rqthreads\*(lq, \*(lqtotal\*(lq, \*(rqread\*(lq, \*(rqwrite\*(lq, \*(rqfault\*(lq, \*(rqvcsw\*(lq, \*(rqivcsw\*(lq, \*(lqjid\*(lq, \*(rqswap\*(lq or \*(rqpid\*(lq. .It Fl p Ar pid Show only the process .Ar pid . .It Fl J Ar jail Show only those processes owned by .Ar jail . This may be either the .Ar jid or .Ar name of the jail. Use 0 to limit to host processes. Using this option implies .Fl j . .Pp .It Fl U Ar username Show only those processes owned by .Ar username . This option currently only accepts usernames and will not understand uid numbers. .El .Pp Both .Ar count and .Ar number fields can be specified as \*(lqinfinite\*(rq, indicating that they can stretch as far as possible. This is accomplished by using any proper prefix of the keywords \*(lqinfinity\*(rq, \*(lqmaximum\*(rq, or \*(lqall\*(rq. Boolean flags are toggles. A second specification of any of these options will negate the first. .Sh "INTERACTIVE MODE" When .Nm is running in \*(lqinteractive mode\*(rq, it reads commands from the terminal and acts upon them accordingly. In this mode, the terminal is put in \*(lqCBREAK\*(rq, so that a character will be processed as soon as it is typed. Almost always, a key will be pressed when .Nm is between displays; that is, while it is waiting for .Ar time seconds to elapse. If this is the case, the command will be processed and the display will be updated immediately thereafter (reflecting any changes that the command may have specified). 
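As a brief aside to the option list above, a few illustrative invocations; the jail name and the numeric values here are hypothetical examples chosen to match the flags described, not taken from the manual itself:

	# sort by resident set size, refresh every 5 seconds, show 20 processes
	top -o res -s 5 20
	# batch (non-interactive) I/O-mode output, two displays, e.g. for logging
	top -m io -b -d 2
	# show argv[]-derived command names for processes in the jail named "www"
	top -a -J www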
This happens even if the command was incorrect. If a key is pressed while .Nm is in the middle of updating the display, it will finish the update and then process the command. Some commands require additional information, and the user will be prompted accordingly. While typing this information in, the user's erase and kill keys (as set up by the command .Xr stty 1 ) are recognized, and a newline terminates the input. .Pp These commands are currently recognized (^L refers to control-L): .Bl -tag -width indent .It ^L Redraw the screen. .It h Display a summary of the commands (help screen). Version information is included in this display. .It q Quit .Nm .It d Change the number of displays to show (prompt for new number). Remember that the next display counts as one, so typing .It d1 will make .Nm show one final display and then immediately exit. .It m Toggle the display between 'cpu' and 'io' modes. .It n or # Change the number of processes to display (prompt for new number). .It s Change the number of seconds to delay between displays (prompt for new number). .It S Toggle the display of system processes. .It a Toggle the display of process titles. .It k Send a signal (\*(lqkill\*(rq by default) to a list of processes. This acts similarly to the command .Xr kill 1 . .It r Change the priority (the \*(lqnice\*(rq) of a list of processes. This acts similarly to .Xr renice 8 . .It u Display only processes owned by a specific set of usernames (prompt for username). If the username specified is simply \*(lq+\*(rq or \*(lq-\*(rq, then processes belonging to all users will be displayed. Usernames can be added to and removed from the set by prepending them with \*(lq+\*(rq and \*(lq-\*(rq, respectively. .It o Change the order in which the display is sorted. The sort key names include \*(lqcpu\*(rq, \*(lqres\*(rq, \*(lqsize\*(rq, \*(lqtime\*(rq. The default is cpu. .It p Display a specific process (prompt for pid). If the pid specified is simply \*(lq+\*(rq, then show all processes. .It e Display a list of system errors (if any) generated by the last command. .It B H Toggle the display of threads. .It i or I Toggle the display of idle processes. .It j Toggle the display of .Xr jail 8 ID. .It J Display only processes owned by a specific jail (prompt for jail). If the jail specified is simply \*(lq+\*(rq, then processes belonging to all jails and the host will be displayed. This will also enable the display of JID. .It P Toggle the display of per-CPU statistics. .It T Toggle display of TID and PID .It t Toggle the display of the .Nm process. .It w Toggle the display of swap usage. .It z Toggle the display of the system idle process. .El .Sh "THE DISPLAY" The top few lines of the display show general information about the state of the system, including the last process id assigned to a process (on most systems), the three load averages, the current time, the number of existing processes, the number of processes in each state (sleeping, running, starting, zombies, and stopped), and a percentage of time spent in each of the processor states (user, nice, system, and idle). It also includes information about physical and virtual memory allocation. .Pp The remainder of the screen displays information about individual processes. This display is similar in spirit to .Xr ps 1 but it is not exactly the same. 
PID is the process id, JID, when displayed, is the .Xr jail 8 ID corresponding to the process, USERNAME is the name of the process's owner (if .Fl u is specified, a UID column will be substituted for USERNAME), PRI is the current priority of the process, NICE is the .Xr nice 1 amount, SIZE is the total size of the process (text, data, and stack), RES is the current amount of resident memory, SWAP is the approximate amount of swap, if enabled (SIZE, RES and SWAP are given in kilobytes), STATE is the current state (one of \*(lqSTART\*(rq, \*(lqRUN\*(rq (shown as \*(lqCPUn\*(rq on SMP systems), \*(lqSLEEP\*(rq, \*(lqSTOP\*(rq, \*(lqZOMB\*(rq, \*(lqWAIT\*(rq, \*(lqLOCK\*(rq or the event on which the process waits), C is the processor number on which the process is executing (visible only on SMP systems), TIME is the number of system and user cpu seconds that the process has used, WCPU, when displayed, is the weighted cpu percentage (this is the same value that .Xr ps 1 displays as CPU), CPU is the raw percentage and is the field that is sorted to determine the order of the processes, and COMMAND is the name of the command that the process is currently running (if the process is swapped out, this column is marked \*(lq\*(rq). .Pp If a process is in the \*(lqSLEEP\*(rq or \*(lqLOCK\*(rq state, the state column will report the name of the event or lock on which the process is waiting. Lock names are prefixed with an asterisk \*(lq*\*(rq while sleep events are not. .Sh DESCRIPTION OF MEMORY .Bd -literal Mem: 61M Active, 86M Inact, 368K Laundry, 22G Wired, 102G Free ARC: 15G Total, 9303M MFU, 6155M MRU, 1464K Anon, 98M Header, 35M Other 15G Compressed, 27G Uncompressed, 1.75:1 Ratio, 174M Overhead Swap: 4096M Total, 532M Free, 13% Inuse, 80K In, 104K Out .Ed .Ss Physical Memory Stats .Bl -tag -width "Uncompressed" -compact .It Em Active number of bytes active .It Em Inact number of clean bytes inactive .It Em Laundry number of dirty bytes queued for laundering .It Em Wired number of bytes wired down, including IO-level cached file data pages .It Em Buf number of bytes used for IO-level disk caching .It Em Free number of bytes free .El .Ss ZFS ARC Stats These stats are only displayed when the ARC is in use. .Pp .Bl -tag -width "Uncompressed" -compact .It Em Total number of wired bytes used for the ZFS ARC .It Em MRU number of ARC bytes holding most recently used data .It Em MFU number of ARC bytes holding most frequently used data .It Em Anon number of ARC bytes holding in flight data .It Em Header number of ARC bytes holding headers .It Em Other miscellaneous ARC bytes .It Em Compressed bytes of memory used by ARC caches .It Em Uncompressed bytes of data stored in ARC caches before compression .It Em Ratio compression ratio of data cached in the ARC .El .Ss Swap Stats .Bl -tag -width "Uncompressed" -compact .It Em Total total available swap usage .It Em Free total free swap usage .It Em Inuse swap usage .It Em \&In bytes paged in from swap devices (last interval) .It Em Out bytes paged out to swap devices (last interval) .El .Sh ENVIRONMENT .Bl -tag -width "Uncompressed" .It Ev TOP Default set of arguments to .Nm . +.It Ev LC_CTYPE +The locale to use when displaying the +.Va argv +vector when +.Fl a +flag is specified. 
.El .Sh SEE ALSO .Xr kill 1 , .Xr ps 1 , .Xr stty 1 , .Xr getrusage 2 , .Xr humanize_number 3 , .Xr mem 4 , .Xr renice 8 .Sh AUTHORS .An William LeFebvre, EECS Department, Northwestern University .Sh BUGS The command name for swapped processes should be tracked down, but this would make the program run slower. .Pp As with .Xr ps 1 , things can change while .Nm is collecting information for an update. The picture it gives is only a close approximation to reality. Index: projects/clang900-import/usr.bin/top/top.c =================================================================== --- projects/clang900-import/usr.bin/top/top.c (revision 352586) +++ projects/clang900-import/usr.bin/top/top.c (revision 352587) @@ -1,1198 +1,1204 @@ /*- * Top users/processes display for Unix * * This program may be freely redistributed, * but this entire comment MUST remain intact. * * Copyright (c) 1984, 1989, William LeFebvre, Rice University * Copyright (c) 1989 - 1994, William LeFebvre, Northwestern University * Copyright (c) 1994, 1995, William LeFebvre, Argonne National Laboratory * Copyright (c) 1996, William LeFebvre, Group sys Consulting * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include "commands.h" #include "display.h" /* interface to display package */ #include "screen.h" /* interface to screen package */ #include "top.h" #include "machine.h" #include "utils.h" #include "username.h" /* Size of the stdio buffer given to stdout */ #define Buffersize 2048 char copyright[] = "Copyright (c) 1984 through 1996, William LeFebvre"; typedef void sigret_t; /* The buffer that stdio will use */ static char stdoutbuf[Buffersize]; static int fmt_flags = 0; int pcpu_stats = false; /* signal handling routines */ static sigret_t leave(int); static sigret_t tstop(int); static sigret_t top_winch(int); static volatile sig_atomic_t leaveflag; static volatile sig_atomic_t tstopflag; static volatile sig_atomic_t winchflag; /* values which need to be accessed by signal handlers */ static int max_topn; /* maximum displayable processes */ /* miscellaneous things */ struct process_select ps; pid_t mypid; /* pointers to display routines */ static void (*d_loadave)(int mpid, double *avenrun) = i_loadave; static void (*d_procstates)(int total, int *brkdn) = i_procstates; static void (*d_cpustates)(int *states) = i_cpustates; static void (*d_memory)(int *stats) = i_memory; static void (*d_arc)(int *stats) = i_arc; static void (*d_carc)(int *stats) = i_carc; static void (*d_swap)(int *stats) = i_swap; static void (*d_message)(void) = i_message; static void (*d_header)(const char *text) = i_header; static void (*d_process)(int line, char *thisline) = i_process; static void reset_display(void); static const struct option longopts[] = { { "cpu-display-mode", no_argument, NULL, 'C' }, /* differs from orignal */ /* D reserved */ { "thread", no_argument, NULL, 'H' }, { "idle-procs", no_argument, NULL, 'I' }, { "jail", required_argument, NULL, 'J' }, { "per-cpu", no_argument, NULL, 'P' }, { "system-procs", no_argument, NULL, 'S' }, { "thread-id", no_argument, NULL, 'T' }, /* differs from orignal */ { "user", required_argument, NULL, 'U' }, { "all", no_argument, NULL, 'a' }, { "batch", no_argument, NULL, 'b' }, /* c reserved */ { "displays", required_argument, NULL, 'd' }, { "interactive", no_argument, NULL, 'i' }, { "jail-id", no_argument, NULL, 'j' }, { "display-mode", required_argument, NULL, 
'm' }, /* n is identical to batch */ { "sort-order", required_argument, NULL, 'o' }, { "pid", required_argument, NULL, 'p' }, { "quick", no_argument, NULL, 'q' }, { "delay", required_argument, NULL, 's' }, { "threads", no_argument, NULL, 't' }, { "uids", no_argument, NULL, 'u' }, { "version", no_argument, NULL, 'v' }, { "swap", no_argument, NULL, 'w' }, { "system-idle-procs", no_argument, NULL, 'z' }, { NULL, 0, NULL, 0 } }; static void reset_uids(void) { for (size_t i = 0; i < TOP_MAX_UIDS; ++i) ps.uid[i] = -1; } static int add_uid(int uid) { size_t i = 0; /* Add the uid if there's room */ for (; i < TOP_MAX_UIDS; ++i) { if (ps.uid[i] == -1 || ps.uid[i] == uid) { ps.uid[i] = uid; break; } } return (i == TOP_MAX_UIDS); } static void rem_uid(int uid) { size_t i = 0; size_t where = TOP_MAX_UIDS; /* Look for the user to remove - no problem if it's not there */ for (; i < TOP_MAX_UIDS; ++i) { if (ps.uid[i] == -1) break; if (ps.uid[i] == uid) where = i; } /* Make sure we don't leave a hole in the middle */ if (where != TOP_MAX_UIDS) { ps.uid[where] = ps.uid[i-1]; ps.uid[i-1] = -1; } } static int handle_user(char *buf, size_t buflen) { int rc = 0; int uid = -1; char *buf2 = buf; new_message(MT_standout, "Username to show (+ for all): "); if (readline(buf, buflen, false) <= 0) { clear_message(); return (rc); } if (buf[0] == '+' || buf[0] == '-') { if (buf[1] == '\0') { reset_uids(); goto end; } else ++buf2; } if ((uid = userid(buf2)) == -1) { new_message(MT_standout, " %s: unknown user", buf2); rc = 1; goto end; } if (buf2 == buf) { reset_uids(); ps.uid[0] = uid; goto end; } if (buf[0] == '+') { if (add_uid(uid)) { new_message(MT_standout, " too many users, reset with '+'"); rc = 1; goto end; } } else rem_uid(uid); end: putchar('\r'); return (rc); } int main(int argc, const char *argv[]) { int i; int active_procs; struct system_info system_info; struct statics statics; void * processes; static char tempbuf1[50]; static char tempbuf2[50]; sigset_t old_sigmask, new_sigmask; int topn = Infinity; double delay = 2; int displays = 0; /* indicates unspecified */ int sel_ret = 0; time_t curr_time; char *(*get_userid)(int) = username; const char *uname_field = "USERNAME"; const char *header_text; char *env_top; const char **preset_argv; int preset_argc = 0; const char **av = NULL; int ac = -1; bool do_unames = true; char interactive = 2; char warnings = 0; char topn_specified = false; char ch; char no_command = 1; struct timeval timeout; char *order_name = NULL; int order_index = 0; fd_set readfds; char *nptr; /* set the buffer for stdout */ #ifdef DEBUG extern FILE *debug; debug = fopen("debug.run", "w"); setbuffer(stdout, NULL, 0); #else setbuffer(stdout, stdoutbuf, Buffersize); #endif + + if (setlocale(LC_ALL, "") == NULL) { + fprintf(stderr, "invalid locale.\n"); + exit(1); + } mypid = getpid(); /* get our name */ /* initialize some selection options */ ps.idle = true; ps.self = true; ps.system = false; reset_uids(); ps.thread = false; ps.wcpu = 1; ps.jid = -1; ps.jail = false; ps.swap = false; ps.kidle = true; ps.pid = -1; ps.command = NULL; ps.thread_id = false; /* get preset options from the environment */ if ((env_top = getenv("TOP")) != NULL) { av = preset_argv = argparse(env_top, &preset_argc); ac = preset_argc; /* set the dummy argument to an explanatory message, in case getopt encounters a bad argument */ preset_argv[0] = "while processing environment"; } /* process options */ do { /* if we're done doing the presets, then process the real arguments */ if (preset_argc == 0) { ac = argc; av = 
argv; /* this should keep getopt happy... */ optind = 1; } while ((i = getopt_long(ac, __DECONST(char * const *, av), "CSIHPabijJ:nquvzs:d:U:m:o:p:Ttw", longopts, NULL)) != EOF) { switch(i) { case 'v': /* show version number */ errx(0, "version FreeBSD"); break; case 'u': /* toggle uid/username display */ do_unames = !do_unames; break; case 'U': /* display only username's processes */ if ((ps.uid[0] = userid(optarg)) == -1) { errx(1, "%s: unknown user\n", optarg); } break; case 'S': /* show system processes */ ps.system = true; break; case 'I': /* show idle processes */ ps.idle = !ps.idle; break; case 'i': /* go interactive regardless */ interactive = 1; break; case 'n': /* batch, or non-interactive */ case 'b': interactive = 0; break; case 'a': fmt_flags ^= FMT_SHOWARGS; break; case 'd': /* number of displays to show */ if ((i = atoiwi(optarg)) == Invalid || i == 0) { warnx("warning: display count should be positive -- option ignored"); warnings++; } else { displays = i; } break; case 'p': { unsigned long long num; const char *errstr; num = strtonum(optarg, 0, INT_MAX, &errstr); if (errstr != NULL || !find_pid(num)) { fprintf(stderr, "%s: unknown pid\n", optarg); exit(1); } ps.pid = (pid_t)num; ps.system = true; break; } case 's': delay = strtod(optarg, &nptr); if (nptr == optarg) { warnx("warning: invalid delay"); delay = 2; warnings++; } if (delay < 0) { warnx("warning: seconds delay should be positive -- using default"); delay = 2; warnings++; } break; case 'q': /* be quick about it */ errno = 0; i = setpriority(PRIO_PROCESS, 0, PRIO_MIN); if (i == -1 && errno != 0) { warnx("warning: `-q' option failed (%m)"); warnings++; } break; case 'm': /* select display mode */ if (strcmp(optarg, "io") == 0) { displaymode = DISP_IO; } else if (strcmp(optarg, "cpu") == 0) { displaymode = DISP_CPU; } else { errx(1, "warning: `-m' option can only take args 'io' or 'cpu'"); } break; case 'o': /* select sort order */ order_name = optarg; break; case 't': ps.self = !ps.self; break; case 'C': ps.wcpu = !ps.wcpu; break; case 'H': ps.thread = !ps.thread; break; case 'T': ps.thread_id = !ps.thread_id; break; case 'j': ps.jail = !ps.jail; break; case 'J': /* display only jail's processes */ if ((ps.jid = jail_getid(optarg)) == -1) { fprintf(stderr, "%s: unknown jail\n", optarg); exit(1); } ps.jail = 1; break; case 'P': pcpu_stats = !pcpu_stats; break; case 'w': ps.swap = 1; break; case 'z': ps.kidle = !ps.kidle; break; default: errx(1, "[-abCHIijnPqStuvwz] [-d count] [-m io | cpu] [-o field] [-p pid]\n" " [-s time] [-J jail] [-U username] [number]"); } } /* get count of top processes to display (if any) */ if (optind < ac) { if ((topn = atoiwi(av[optind])) == Invalid) { warnx("warning: process display count should be non-negative -- using default"); warnings++; } else { topn_specified = true; } } /* tricky: remember old value of preset_argc & set preset_argc = 0 */ i = preset_argc; preset_argc = 0; /* repeat only if we really did the preset arguments */ } while (i != 0); /* set constants for username/uid display correctly */ if (!do_unames) { uname_field = " UID "; get_userid = itoa7; } /* initialize the kernel memory interface */ if (machine_init(&statics) == -1) { exit(1); } /* determine sorting order index, if necessary */ if (order_name != NULL) { if ((order_index = string_index(order_name, statics.order_names)) == -1) { const char * const *pp; warnx("'%s' is not a recognized sorting order.", order_name); fprintf(stderr, "\tTry one of these:"); pp = statics.order_names; while (*pp != NULL) { 
fprintf(stderr, " %s", *pp++); } fputc('\n', stderr); exit(1); } } /* initialize termcap */ init_termcap(interactive); /* get the string to use for the process area header */ header_text = format_header(uname_field); /* initialize display interface */ if ((max_topn = display_init(&statics)) == -1) { errx(4, "can't allocate sufficient memory"); } /* print warning if user requested more processes than we can display */ if (topn > max_topn) { warnx("warning: this terminal can only display %d processes.", max_topn); warnings++; } /* adjust for topn == Infinity */ if (topn == Infinity) { /* * For smart terminals, infinity really means everything that can * be displayed, or Largest. * On dumb terminals, infinity means every process in the system! * We only really want to do that if it was explicitly specified. * This is always the case when "Default_TOPN != Infinity". But if * topn wasn't explicitly specified and we are on a dumb terminal * and the default is Infinity, then (and only then) we use * "Nominal_TOPN" instead. */ topn = smart_terminal ? Largest : (topn_specified ? Largest : Nominal_TOPN); } /* set header display accordingly */ display_header(topn > 0); /* determine interactive state */ if (interactive == 2) { interactive = smart_terminal; } /* if # of displays not specified, fill it in */ if (displays == 0) { displays = smart_terminal ? Infinity : 1; } /* hold interrupt signals while setting up the screen and the handlers */ sigemptyset(&new_sigmask); sigaddset(&new_sigmask, SIGINT); sigaddset(&new_sigmask, SIGQUIT); sigaddset(&new_sigmask, SIGTSTP); sigprocmask(SIG_BLOCK, &new_sigmask, &old_sigmask); init_screen(); signal(SIGINT, leave); signal(SIGQUIT, leave); signal(SIGTSTP, tstop); signal(SIGWINCH, top_winch); sigprocmask(SIG_SETMASK, &old_sigmask, NULL); if (warnings) { fputs("....", stderr); fflush(stderr); sleep(3 * warnings); fputc('\n', stderr); } restart: /* * main loop -- repeat while display count is positive or while it * indicates infinity (by being -1) */ while ((displays == -1) || (displays-- > 0)) { int (*compare)(const void * const, const void * const); /* get the current stats */ get_system_info(&system_info); compare = compares[order_index]; /* get the current set of processes */ processes = get_process_info(&system_info, &ps, compare); /* display the load averages */ (*d_loadave)(system_info.last_pid, system_info.load_avg); /* display the current time */ /* this method of getting the time SHOULD be fairly portable */ time(&curr_time); i_uptime(&system_info.boottime, &curr_time); i_timeofday(&curr_time); /* display process state breakdown */ (*d_procstates)(system_info.p_total, system_info.procstates); (*d_cpustates)(system_info.cpustates); /* display memory stats */ (*d_memory)(system_info.memory); (*d_arc)(system_info.arc); (*d_carc)(system_info.carc); /* display swap stats */ (*d_swap)(system_info.swap); /* handle message area */ (*d_message)(); /* update the header area */ (*d_header)(header_text); if (topn > 0) { /* determine number of processes to actually display */ /* this number will be the smallest of: active processes, number user requested, number current screen accomodates */ active_procs = system_info.p_pactive; if (active_procs > topn) { active_procs = topn; } if (active_procs > max_topn) { active_procs = max_topn; } /* now show the top "n" processes. 
*/ for (i = 0; i < active_procs; i++) { (*d_process)(i, format_next_process(processes, get_userid, fmt_flags)); } } else { i = 0; } /* do end-screen processing */ u_endscreen(i); /* now, flush the output buffer */ if (fflush(stdout) != 0) { new_message(MT_standout, " Write error on stdout"); putchar('\r'); quit(1); } /* only do the rest if we have more displays to show */ if (displays) { /* switch out for new display on smart terminals */ if (smart_terminal) { if (overstrike) { reset_display(); } else { d_loadave = u_loadave; d_procstates = u_procstates; d_cpustates = u_cpustates; d_memory = u_memory; d_arc = u_arc; d_carc = u_carc; d_swap = u_swap; d_message = u_message; d_header = u_header; d_process = u_process; } } no_command = true; if (!interactive) { usleep(delay * 1e6); if (leaveflag) { end_screen(); exit(0); } } else while (no_command) { /* assume valid command unless told otherwise */ no_command = false; /* set up arguments for select with timeout */ FD_ZERO(&readfds); FD_SET(0, &readfds); /* for standard input */ timeout.tv_sec = delay; timeout.tv_usec = 0; if (leaveflag) { end_screen(); exit(0); } if (tstopflag) { /* move to the lower left */ end_screen(); fflush(stdout); /* default the signal handler action */ signal(SIGTSTP, SIG_DFL); /* unblock the signal and send ourselves one */ sigsetmask(sigblock(0) & ~(1 << (SIGTSTP - 1))); kill(0, SIGTSTP); /* reset the signal handler */ signal(SIGTSTP, tstop); /* reinit screen */ reinit_screen(); reset_display(); tstopflag = 0; goto restart; } if (winchflag) { /* reascertain the screen dimensions */ get_screensize(); /* tell display to resize */ max_topn = display_resize(); /* reset the signal handler */ signal(SIGWINCH, top_winch); reset_display(); winchflag = 0; goto restart; } /* wait for either input or the end of the delay period */ sel_ret = select(2, &readfds, NULL, NULL, &timeout); if (sel_ret < 0 && errno != EINTR) quit(0); if (sel_ret > 0) { int newval; const char *errmsg; const struct command *cptr; /* something to read -- clear the message area first */ clear_message(); /* now read it and convert to command strchr */ /* (use "change" as a temporary to hold strchr) */ if (read(0, &ch, 1) != 1) { /* read error: either 0 or -1 */ new_message(MT_standout, " Read error on stdin"); putchar('\r'); quit(1); } if (ch == '\r' || ch == '\n') { continue; } cptr = all_commands; while (cptr->c != '\0') { if (cptr->c == ch) { break; } cptr++; } if (cptr->c == '\0') { new_message(MT_standout, " Command not understood"); putchar('\r'); no_command = true; } if (overstrike && !cptr->available_to_dumb) { new_message(MT_standout, " Command cannot be handled by this terminal"); putchar('\r'); no_command = true; } if (!no_command) { switch(cptr->id) { case CMD_redraw: /* redraw screen */ reset_display(); break; case CMD_update: /* merely update display */ break; case CMD_quit: quit(0); break; case CMD_help: reset_display(); top_clear(); show_help(); top_standout("Hit any key to continue: "); fflush(stdout); read(0, &ch, 1); break; case CMD_errors: /* show errors */ if (error_count() == 0) { new_message(MT_standout, " Currently no errors to report."); putchar('\r'); no_command = true; } else { reset_display(); top_clear(); show_errors(); top_standout("Hit any key to continue: "); fflush(stdout); read(0, &ch, 1); } break; case CMD_number: new_message(MT_standout, "Number of processes to show: "); newval = readline(tempbuf1, 8, true); if (newval > -1) { if (newval > max_topn) { new_message(MT_standout | MT_delayed, " This terminal can only display 
%d processes.", max_topn); putchar('\r'); } if (newval == 0) { /* inhibit the header */ display_header(false); } else if (newval > topn && topn == 0) { /* redraw the header */ display_header(true); d_header = i_header; } topn = newval; } break; case CMD_delay: /* new seconds delay */ new_message(MT_standout, "Seconds to delay: "); if ((i = readline(tempbuf1, 8, true)) > -1) { if ((delay = i) == 0) { delay = 1; } } clear_message(); break; case CMD_displays: /* change display count */ new_message(MT_standout, "Displays to show (currently %s): ", displays == -1 ? "infinite" : itoa(displays)); if ((i = readline(tempbuf1, 10, true)) > 0) { displays = i; } else if (i == 0) { quit(0); } clear_message(); break; case CMD_kill: /* kill program */ new_message(0, "kill "); if (readline(tempbuf2, sizeof(tempbuf2), false) > 0) { if ((errmsg = kill_procs(tempbuf2)) != NULL) { new_message(MT_standout, "%s", errmsg); putchar('\r'); no_command = true; } } else { clear_message(); } break; case CMD_renice: /* renice program */ new_message(0, "renice "); if (readline(tempbuf2, sizeof(tempbuf2), false) > 0) { if ((errmsg = renice_procs(tempbuf2)) != NULL) { new_message(MT_standout, "%s", errmsg); putchar('\r'); no_command = true; } } else { clear_message(); } break; case CMD_idletog: ps.idle = !ps.idle; new_message(MT_standout | MT_delayed, " %sisplaying idle processes.", ps.idle ? "D" : "Not d"); putchar('\r'); break; case CMD_selftog: ps.self = !ps.self; new_message(MT_standout | MT_delayed, " %sisplaying self.", (ps.self) ? "D" : "Not d"); putchar('\r'); break; case CMD_user: if (handle_user(tempbuf2, sizeof(tempbuf2))) no_command = true; break; case CMD_thrtog: ps.thread = !ps.thread; new_message(MT_standout | MT_delayed, " Displaying threads %s", ps.thread ? "separately" : "as a count"); header_text = format_header(uname_field); reset_display(); putchar('\r'); break; case CMD_toggletid: ps.thread_id = !ps.thread_id; new_message(MT_standout | MT_delayed, " Displaying %s", ps.thread_id ? "tid" : "pid"); header_text = format_header(uname_field); reset_display(); putchar('\r'); break; case CMD_wcputog: ps.wcpu = !ps.wcpu; new_message(MT_standout | MT_delayed, " Displaying %s CPU", ps.wcpu ? "weighted" : "raw"); header_text = format_header(uname_field); reset_display(); putchar('\r'); break; case CMD_viewtog: displaymode = displaymode == DISP_IO ? DISP_CPU : DISP_IO; new_message(MT_standout | MT_delayed, " Displaying %s statistics.", displaymode == DISP_IO ? "IO" : "CPU"); header_text = format_header(uname_field); display_header(true); d_header = i_header; reset_display(); break; case CMD_viewsys: ps.system = !ps.system; new_message(MT_standout | MT_delayed, " %sisplaying system processes.", ps.system ? "D" : "Not d"); break; case CMD_showargs: fmt_flags ^= FMT_SHOWARGS; new_message(MT_standout | MT_delayed, " %sisplaying process arguments.", fmt_flags & FMT_SHOWARGS ? "D" : "Not d"); break; case CMD_order: new_message(MT_standout, "Order to sort: "); if (readline(tempbuf2, sizeof(tempbuf2), false) > 0) { if ((i = string_index(tempbuf2, statics.order_names)) == -1) { new_message(MT_standout, " %s: unrecognized sorting order", tempbuf2); no_command = true; } else { order_index = i; } putchar('\r'); } else { clear_message(); } break; case CMD_jidtog: ps.jail = !ps.jail; new_message(MT_standout | MT_delayed, " %sisplaying jail ID.", ps.jail ? 
"D" : "Not d"); header_text = format_header(uname_field); reset_display(); putchar('\r'); break; case CMD_jail: new_message(MT_standout, "Jail to show (+ for all): "); if (readline(tempbuf2, sizeof(tempbuf2), false) > 0) { if (tempbuf2[0] == '+' && tempbuf2[1] == '\0') { ps.jid = -1; } else if ((i = jail_getid(tempbuf2)) == -1) { new_message(MT_standout, " %s: unknown jail", tempbuf2); no_command = true; } else { ps.jid = i; } if (ps.jail == 0) { ps.jail = 1; new_message(MT_standout | MT_delayed, " Displaying jail " "ID."); header_text = format_header(uname_field); reset_display(); } putchar('\r'); } else { clear_message(); } break; case CMD_kidletog: ps.kidle = !ps.kidle; new_message(MT_standout | MT_delayed, " %sisplaying system idle process.", ps.kidle ? "D" : "Not d"); putchar('\r'); break; case CMD_pcputog: pcpu_stats = !pcpu_stats; new_message(MT_standout | MT_delayed, " Displaying %sCPU statistics.", pcpu_stats ? "per-" : "global "); toggle_pcpustats(); max_topn = display_updatecpus(&statics); reset_display(); putchar('\r'); break; case CMD_swaptog: ps.swap = !ps.swap; new_message(MT_standout | MT_delayed, " %sisplaying per-process swap usage.", ps.swap ? "D" : "Not d"); header_text = format_header(uname_field); reset_display(); putchar('\r'); break; case CMD_pid: new_message(MT_standout, "Process id to show (+ for all): "); if (readline(tempbuf2, sizeof(tempbuf2), false) > 0) { if (tempbuf2[0] == '+' && tempbuf2[1] == '\0') { ps.pid = (pid_t)-1; } else { unsigned long long num; const char *errstr; num = strtonum(tempbuf2, 0, INT_MAX, &errstr); if (errstr != NULL || !find_pid(num)) { new_message(MT_standout, " %s: unknown pid", tempbuf2); no_command = true; } else { ps.pid = (pid_t)num; } } putchar('\r'); } else clear_message(); break; case CMD_NONE: assert(false && "reached switch without command"); } } } /* flush out stuff that may have been written */ fflush(stdout); } } } #ifdef DEBUG fclose(debug); #endif quit(0); } /* * reset_display() - reset all the display routine pointers so that entire * screen will get redrawn. */ static void reset_display(void) { d_loadave = i_loadave; d_procstates = i_procstates; d_cpustates = i_cpustates; d_memory = i_memory; d_arc = i_arc; d_carc = i_carc; d_swap = i_swap; d_message = i_message; d_header = i_header; d_process = i_process; } /* * signal handlers */ static sigret_t leave(int i __unused) /* exit under normal conditions -- INT handler */ { leaveflag = 1; } static sigret_t tstop(int i __unused) /* SIGTSTP handler */ { tstopflag = 1; } static sigret_t top_winch(int i __unused) /* SIGWINCH handler */ { winchflag = 1; } void __dead2 quit(int status) /* exit under duress */ { end_screen(); exit(status); } Index: projects/clang900-import/usr.sbin/freebsd-update/freebsd-update.sh =================================================================== --- projects/clang900-import/usr.sbin/freebsd-update/freebsd-update.sh (revision 352586) +++ projects/clang900-import/usr.sbin/freebsd-update/freebsd-update.sh (revision 352587) @@ -1,3361 +1,3370 @@ #!/bin/sh #- # SPDX-License-Identifier: BSD-2-Clause-FreeBSD # # Copyright 2004-2007 Colin Percival # All rights reserved # # Redistribution and use in source and binary forms, with or without # modification, are permitted providing that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. 
Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # $FreeBSD$ #### Usage function -- called from command-line handling code. # Usage instructions. Options not listed: # --debug -- don't filter output from utilities # --no-stats -- don't show progress statistics while fetching files usage () { cat < ${LINE}" exit 1 fi done < ${CONFFILE} # Merge the settings read from the configuration file with those # provided at the command line. mergeconfig } # Provide some default parameters default_params () { # Save any parameters already configured, and clear the slate saveconfig nullconfig # Default configurations config_WorkDir /var/db/freebsd-update config_MailTo root config_AllowAdd yes config_AllowDelete yes config_KeepModifiedMetadata yes config_BaseDir / config_VerboseLevel stats config_StrictComponents no config_BackupKernel yes config_BackupKernelDir /boot/kernel.old config_BackupKernelSymbolFiles no # Merge these defaults into the earlier-configured settings mergeconfig } # Set utility output filtering options, based on ${VERBOSELEVEL} fetch_setup_verboselevel () { case ${VERBOSELEVEL} in debug) QUIETREDIR="/dev/stderr" QUIETFLAG=" " STATSREDIR="/dev/stderr" DDSTATS=".." XARGST="-t" NDEBUG=" " ;; nostats) QUIETREDIR="" QUIETFLAG="" STATSREDIR="/dev/null" DDSTATS=".." XARGST="" NDEBUG="" ;; stats) QUIETREDIR="/dev/null" QUIETFLAG="-q" STATSREDIR="/dev/stdout" DDSTATS="" XARGST="" NDEBUG="-n" ;; esac } # Perform sanity checks and set some final parameters # in preparation for fetching files. Figure out which # set of updates should be downloaded: If the user is # running *-p[0-9]+, strip off the last part; if the # user is running -SECURITY, call it -RELEASE. Chdir # into the working directory. fetchupgrade_check_params () { export HTTP_USER_AGENT="freebsd-update (${COMMAND}, `uname -r`)" _SERVERNAME_z=\ "SERVERNAME must be given via command line or configuration file." _KEYPRINT_z="Key must be given via -k option or configuration file." _KEYPRINT_bad="Invalid key fingerprint: " _WORKDIR_bad="Directory does not exist or is not writable: " _WORKDIR_bad2="Directory is not on a persistent filesystem: " if [ -z "${SERVERNAME}" ]; then echo -n "`basename $0`: " echo "${_SERVERNAME_z}" exit 1 fi if [ -z "${KEYPRINT}" ]; then echo -n "`basename $0`: " echo "${_KEYPRINT_z}" exit 1 fi if ! echo "${KEYPRINT}" | grep -qE "^[0-9a-f]{64}$"; then echo -n "`basename $0`: " echo -n "${_KEYPRINT_bad}" echo ${KEYPRINT} exit 1 fi if ! 
[ -d "${WORKDIR}" -a -w "${WORKDIR}" ]; then echo -n "`basename $0`: " echo -n "${_WORKDIR_bad}" echo ${WORKDIR} exit 1 fi case `df -T ${WORKDIR}` in */dev/md[0-9]* | *tmpfs*) echo -n "`basename $0`: " echo -n "${_WORKDIR_bad2}" echo ${WORKDIR} exit 1 ;; esac chmod 700 ${WORKDIR} cd ${WORKDIR} || exit 1 # Generate release number. The s/SECURITY/RELEASE/ bit exists # to provide an upgrade path for FreeBSD Update 1.x users, since # the kernels provided by FreeBSD Update 1.x are always labelled # as X.Y-SECURITY. RELNUM=`uname -r | sed -E 's,-p[0-9]+,,' | sed -E 's,-SECURITY,-RELEASE,'` ARCH=`uname -m` FETCHDIR=${RELNUM}/${ARCH} PATCHDIR=${RELNUM}/${ARCH}/bp # Disallow upgrade from a version that is not a release case ${RELNUM} in *-RELEASE | *-ALPHA* | *-BETA* | *-RC*) ;; *) echo -n "`basename $0`: " cat <<- EOF Cannot upgrade from a version that is not a release (including alpha, beta and release candidates) using `basename $0`. Instead, FreeBSD can be directly upgraded by source or upgraded to a RELEASE/RELENG version prior to running `basename $0`. Currently running: ${RELNUM} EOF exit 1 ;; esac # Figure out what directory contains the running kernel BOOTFILE=`sysctl -n kern.bootfile` KERNELDIR=${BOOTFILE%/kernel} if ! [ -d ${KERNELDIR} ]; then echo "Cannot identify running kernel" exit 1 fi # Figure out what kernel configuration is running. We start with # the output of `uname -i`, and then make the following adjustments: # 1. Replace "SMP-GENERIC" with "SMP". Why the SMP kernel config # file says "ident SMP-GENERIC", I don't know... # 2. If the kernel claims to be GENERIC _and_ ${ARCH} is "amd64" # _and_ `sysctl kern.version` contains a line which ends "/SMP", then # we're running an SMP kernel. This mis-identification is a bug # which was fixed in 6.2-STABLE. KERNCONF=`uname -i` if [ ${KERNCONF} = "SMP-GENERIC" ]; then KERNCONF=SMP fi if [ ${KERNCONF} = "GENERIC" ] && [ ${ARCH} = "amd64" ]; then if sysctl kern.version | grep -qE '/SMP$'; then KERNCONF=SMP fi fi # Define some paths BSPATCH=/usr/bin/bspatch SHA256=/sbin/sha256 PHTTPGET=/usr/libexec/phttpget # Set up variables relating to VERBOSELEVEL fetch_setup_verboselevel # Construct a unique name from ${BASEDIR} BDHASH=`echo ${BASEDIR} | sha256 -q` } # Perform sanity checks etc. before fetching updates. fetch_check_params () { fetchupgrade_check_params if ! [ -z "${TARGETRELEASE}" ]; then echo -n "`basename $0`: " echo -n "-r option is meaningless with 'fetch' command. " echo "(Did you mean 'upgrade' instead?)" exit 1 fi # Check that we have updates ready to install if [ -f ${BDHASH}-install/kerneldone -a $FORCEFETCH -eq 0 ]; then echo "You have a partially completed upgrade pending" echo "Run '$0 install' first." echo "Run '$0 fetch -F' to proceed anyway." exit 1 fi } # Perform sanity checks etc. before fetching upgrades. upgrade_check_params () { fetchupgrade_check_params # Unless set otherwise, we're upgrading to the same kernel config. NKERNCONF=${KERNCONF} # We need TARGETRELEASE set _TARGETRELEASE_z="Release target must be specified via -r option." if [ -z "${TARGETRELEASE}" ]; then echo -n "`basename $0`: " echo "${_TARGETRELEASE_z}" exit 1 fi # The target release should be != the current release. if [ "${TARGETRELEASE}" = "${RELNUM}" ]; then echo -n "`basename $0`: " echo "Cannot upgrade from ${RELNUM} to itself" exit 1 fi # Turning off AllowAdd or AllowDelete is a bad idea for upgrades. 
if [ "${ALLOWADD}" = "no" ]; then echo -n "`basename $0`: " echo -n "WARNING: \"AllowAdd no\" is a bad idea " echo "when upgrading between releases." echo fi if [ "${ALLOWDELETE}" = "no" ]; then echo -n "`basename $0`: " echo -n "WARNING: \"AllowDelete no\" is a bad idea " echo "when upgrading between releases." echo fi # Set EDITOR to /usr/bin/vi if it isn't already set : ${EDITOR:='/usr/bin/vi'} } # Perform sanity checks and set some final parameters in # preparation for installing updates. install_check_params () { # Check that we are root. All sorts of things won't work otherwise. if [ `id -u` != 0 ]; then echo "You must be root to run this." exit 1 fi # Check that securelevel <= 0. Otherwise we can't update schg files. if [ `sysctl -n kern.securelevel` -gt 0 ]; then echo "Updates cannot be installed when the system securelevel" echo "is greater than zero." exit 1 fi # Check that we have a working directory _WORKDIR_bad="Directory does not exist or is not writable: " if ! [ -d "${WORKDIR}" -a -w "${WORKDIR}" ]; then echo -n "`basename $0`: " echo -n "${_WORKDIR_bad}" echo ${WORKDIR} exit 1 fi cd ${WORKDIR} || exit 1 # Construct a unique name from ${BASEDIR} BDHASH=`echo ${BASEDIR} | sha256 -q` # Check that we have updates ready to install if ! [ -L ${BDHASH}-install ]; then echo "No updates are available to install." if [ $ISFETCHED -eq 0 ]; then echo "Run '$0 fetch' first." exit 1 fi exit 0 fi if ! [ -f ${BDHASH}-install/INDEX-OLD ] || ! [ -f ${BDHASH}-install/INDEX-NEW ]; then echo "Update manifest is corrupt -- this should never happen." echo "Re-run '$0 fetch'." exit 1 fi # Figure out what directory contains the running kernel BOOTFILE=`sysctl -n kern.bootfile` KERNELDIR=${BOOTFILE%/kernel} if ! [ -d ${KERNELDIR} ]; then echo "Cannot identify running kernel" exit 1 fi } # Perform sanity checks and set some final parameters in # preparation for UNinstalling updates. rollback_check_params () { # Check that we are root. All sorts of things won't work otherwise. if [ `id -u` != 0 ]; then echo "You must be root to run this." exit 1 fi # Check that we have a working directory _WORKDIR_bad="Directory does not exist or is not writable: " if ! [ -d "${WORKDIR}" -a -w "${WORKDIR}" ]; then echo -n "`basename $0`: " echo -n "${_WORKDIR_bad}" echo ${WORKDIR} exit 1 fi cd ${WORKDIR} || exit 1 # Construct a unique name from ${BASEDIR} BDHASH=`echo ${BASEDIR} | sha256 -q` # Check that we have updates ready to rollback if ! [ -L ${BDHASH}-rollback ]; then echo "No rollback directory found." exit 1 fi if ! [ -f ${BDHASH}-rollback/INDEX-OLD ] || ! [ -f ${BDHASH}-rollback/INDEX-NEW ]; then echo "Update manifest is corrupt -- this should never happen." exit 1 fi } # Perform sanity checks and set some final parameters # in preparation for comparing the system against the # published index. Figure out which index we should # compare against: If the user is running *-p[0-9]+, # strip off the last part; if the user is running # -SECURITY, call it -RELEASE. Chdir into the working # directory. IDS_check_params () { export HTTP_USER_AGENT="freebsd-update (${COMMAND}, `uname -r`)" _SERVERNAME_z=\ "SERVERNAME must be given via command line or configuration file." _KEYPRINT_z="Key must be given via -k option or configuration file." 
_KEYPRINT_bad="Invalid key fingerprint: " _WORKDIR_bad="Directory does not exist or is not writable: " if [ -z "${SERVERNAME}" ]; then echo -n "`basename $0`: " echo "${_SERVERNAME_z}" exit 1 fi if [ -z "${KEYPRINT}" ]; then echo -n "`basename $0`: " echo "${_KEYPRINT_z}" exit 1 fi if ! echo "${KEYPRINT}" | grep -qE "^[0-9a-f]{64}$"; then echo -n "`basename $0`: " echo -n "${_KEYPRINT_bad}" echo ${KEYPRINT} exit 1 fi if ! [ -d "${WORKDIR}" -a -w "${WORKDIR}" ]; then echo -n "`basename $0`: " echo -n "${_WORKDIR_bad}" echo ${WORKDIR} exit 1 fi cd ${WORKDIR} || exit 1 # Generate release number. The s/SECURITY/RELEASE/ bit exists # to provide an upgrade path for FreeBSD Update 1.x users, since # the kernels provided by FreeBSD Update 1.x are always labelled # as X.Y-SECURITY. RELNUM=`uname -r | sed -E 's,-p[0-9]+,,' | sed -E 's,-SECURITY,-RELEASE,'` ARCH=`uname -m` FETCHDIR=${RELNUM}/${ARCH} PATCHDIR=${RELNUM}/${ARCH}/bp # Figure out what directory contains the running kernel BOOTFILE=`sysctl -n kern.bootfile` KERNELDIR=${BOOTFILE%/kernel} if ! [ -d ${KERNELDIR} ]; then echo "Cannot identify running kernel" exit 1 fi # Figure out what kernel configuration is running. We start with # the output of `uname -i`, and then make the following adjustments: # 1. Replace "SMP-GENERIC" with "SMP". Why the SMP kernel config # file says "ident SMP-GENERIC", I don't know... # 2. If the kernel claims to be GENERIC _and_ ${ARCH} is "amd64" # _and_ `sysctl kern.version` contains a line which ends "/SMP", then # we're running an SMP kernel. This mis-identification is a bug # which was fixed in 6.2-STABLE. KERNCONF=`uname -i` if [ ${KERNCONF} = "SMP-GENERIC" ]; then KERNCONF=SMP fi if [ ${KERNCONF} = "GENERIC" ] && [ ${ARCH} = "amd64" ]; then if sysctl kern.version | grep -qE '/SMP$'; then KERNCONF=SMP fi fi # Define some paths SHA256=/sbin/sha256 PHTTPGET=/usr/libexec/phttpget # Set up variables relating to VERBOSELEVEL fetch_setup_verboselevel } #### Core functionality -- the actual work gets done here # Use an SRV query to pick a server. If the SRV query doesn't provide # a useful answer, use the server name specified by the user. # Put another way... look up _http._tcp.${SERVERNAME} and pick a server # from that; or if no servers are returned, use ${SERVERNAME}. # This allows a user to specify "portsnap.freebsd.org" (in which case # portsnap will select one of the mirrors) or "portsnap5.tld.freebsd.org" # (in which case portsnap will use that particular server, since there # won't be an SRV entry for that name). # # We ignore the Port field, since we are always going to use port 80. # Fetch the mirror list, but do not pick a mirror yet. Returns 1 if # no mirrors are available for any reason. fetch_pick_server_init () { : > serverlist_tried # Check that host(1) exists (i.e., that the system wasn't built with the # WITHOUT_BIND set) and don't try to find a mirror if it doesn't exist. if ! which -s host; then : > serverlist_full return 1 fi echo -n "Looking up ${SERVERNAME} mirrors... " # Issue the SRV query and pull out the Priority, Weight, and Target fields. # BIND 9 prints "$name has SRV record ..." while BIND 8 prints # "$name server selection ..."; we allow either format. MLIST="_http._tcp.${SERVERNAME}" host -t srv "${MLIST}" | sed -nE "s/${MLIST} (has SRV record|server selection) //Ip" | cut -f 1,2,4 -d ' ' | sed -e 's/\.$//' | sort > serverlist_full # If no records, give up -- we'll just use the server name we were given. if [ `wc -l < serverlist_full` -eq 0 ]; then echo "none found." 
return 1 fi # Report how many mirrors we found. echo `wc -l < serverlist_full` "mirrors found." # Generate a random seed for use in picking mirrors. If HTTP_PROXY # is set, this will be used to generate the seed; otherwise, the seed # will be random. if [ -n "${HTTP_PROXY}${http_proxy}" ]; then RANDVALUE=`sha256 -qs "${HTTP_PROXY}${http_proxy}" | tr -d 'a-f' | cut -c 1-9` else RANDVALUE=`jot -r 1 0 999999999` fi } # Pick a mirror. Returns 1 if we have run out of mirrors to try. fetch_pick_server () { # Generate a list of not-yet-tried mirrors sort serverlist_tried | comm -23 serverlist_full - > serverlist # Have we run out of mirrors? if [ `wc -l < serverlist` -eq 0 ]; then cat <<- EOF No mirrors remaining, giving up. This may be because upgrading from this platform (${ARCH}) or release (${RELNUM}) is unsupported by `basename $0`. Only platforms with Tier 1 support can be upgraded by `basename $0`. See https://www.freebsd.org/platforms/index.html for more info. If unsupported, FreeBSD must be upgraded by source. EOF return 1 fi # Find the highest priority level (lowest numeric value). SRV_PRIORITY=`cut -f 1 -d ' ' serverlist | sort -n | head -1` # Add up the weights of the response lines at that priority level. SRV_WSUM=0; while read X; do case "$X" in ${SRV_PRIORITY}\ *) SRV_W=`echo $X | cut -f 2 -d ' '` SRV_WSUM=$(($SRV_WSUM + $SRV_W)) ;; esac done < serverlist # If all the weights are 0, pretend that they are all 1 instead. if [ ${SRV_WSUM} -eq 0 ]; then SRV_WSUM=`grep -E "^${SRV_PRIORITY} " serverlist | wc -l` SRV_W_ADD=1 else SRV_W_ADD=0 fi # Pick a value between 0 and the sum of the weights - 1 SRV_RND=`expr ${RANDVALUE} % ${SRV_WSUM}` # Read through the list of mirrors and set SERVERNAME. Write the line # corresponding to the mirror we selected into serverlist_tried so that # we won't try it again. while read X; do case "$X" in ${SRV_PRIORITY}\ *) SRV_W=`echo $X | cut -f 2 -d ' '` SRV_W=$(($SRV_W + $SRV_W_ADD)) if [ $SRV_RND -lt $SRV_W ]; then SERVERNAME=`echo $X | cut -f 3 -d ' '` echo "$X" >> serverlist_tried break else SRV_RND=$(($SRV_RND - $SRV_W)) fi ;; esac done < serverlist } # Take a list of ${oldhash}|${newhash} and output a list of needed patches, # i.e., those for which we have ${oldhash} and don't have ${newhash}. fetch_make_patchlist () { grep -vE "^([0-9a-f]{64})\|\1$" | tr '|' ' ' | while read X Y; do if [ -f "files/${Y}.gz" ] || [ ! -f "files/${X}.gz" ]; then continue fi echo "${X}|${Y}" done | sort -u } # Print user-friendly progress statistics fetch_progress () { LNC=0 while read x; do LNC=$(($LNC + 1)) if [ $(($LNC % 10)) = 0 ]; then echo -n $LNC elif [ $(($LNC % 2)) = 0 ]; then echo -n . fi done echo -n " " } # Function for asking the user if everything is ok continuep () { while read -p "Does this look reasonable (y/n)? " CONTINUE; do case "${CONTINUE}" in y*) return 0 ;; n*) return 1 ;; esac done } # Initialize the working directory workdir_init () { mkdir -p files touch tINDEX.present } # Check that we have a public key with an appropriate hash, or # fetch the key if it doesn't exist. Returns 1 if the key has # not yet been fetched. fetch_key () { if [ -r pub.ssl ] && [ `${SHA256} -q pub.ssl` = ${KEYPRINT} ]; then return 0 fi echo -n "Fetching public key from ${SERVERNAME}... " rm -f pub.ssl fetch ${QUIETFLAG} http://${SERVERNAME}/${FETCHDIR}/pub.ssl \ 2>${QUIETREDIR} || true if ! [ -r pub.ssl ]; then echo "failed." return 1 fi if ! [ `${SHA256} -q pub.ssl` = ${KEYPRINT} ]; then echo "key has incorrect hash." rm -f pub.ssl return 1 fi echo "done." 
} # Fetch metadata signature, aka "tag". fetch_tag () { echo -n "Fetching metadata signature " echo ${NDEBUG} "for ${RELNUM} from ${SERVERNAME}... " rm -f latest.ssl fetch ${QUIETFLAG} http://${SERVERNAME}/${FETCHDIR}/latest.ssl \ 2>${QUIETREDIR} || true if ! [ -r latest.ssl ]; then echo "failed." return 1 fi openssl rsautl -pubin -inkey pub.ssl -verify \ < latest.ssl > tag.new 2>${QUIETREDIR} || true rm latest.ssl if ! [ `wc -l < tag.new` = 1 ] || ! grep -qE \ "^freebsd-update\|${ARCH}\|${RELNUM}\|[0-9]+\|[0-9a-f]{64}\|[0-9]{10}" \ tag.new; then echo "invalid signature." return 1 fi echo "done." RELPATCHNUM=`cut -f 4 -d '|' < tag.new` TINDEXHASH=`cut -f 5 -d '|' < tag.new` EOLTIME=`cut -f 6 -d '|' < tag.new` } # Sanity-check the patch number in a tag, to make sure that we're not # going to "update" backwards and to prevent replay attacks. fetch_tagsanity () { # Check that we're not going to move from -pX to -pY with Y < X. RELPX=`uname -r | sed -E 's,.*-,,'` if echo ${RELPX} | grep -qE '^p[0-9]+$'; then RELPX=`echo ${RELPX} | cut -c 2-` else RELPX=0 fi if [ "${RELPATCHNUM}" -lt "${RELPX}" ]; then echo echo -n "Files on mirror (${RELNUM}-p${RELPATCHNUM})" echo " appear older than what" echo "we are currently running (`uname -r`)!" echo "Cowardly refusing to proceed any further." return 1 fi # If "tag" exists and corresponds to ${RELNUM}, make sure that # it contains a patch number <= RELPATCHNUM, in order to protect # against rollback (replay) attacks. if [ -f tag ] && grep -qE \ "^freebsd-update\|${ARCH}\|${RELNUM}\|[0-9]+\|[0-9a-f]{64}\|[0-9]{10}" \ tag; then LASTRELPATCHNUM=`cut -f 4 -d '|' < tag` if [ "${RELPATCHNUM}" -lt "${LASTRELPATCHNUM}" ]; then echo echo -n "Files on mirror (${RELNUM}-p${RELPATCHNUM})" echo " are older than the" echo -n "most recently seen updates" echo " (${RELNUM}-p${LASTRELPATCHNUM})." echo "Cowardly refusing to proceed any further." return 1 fi fi } # Fetch metadata index file fetch_metadata_index () { echo ${NDEBUG} "Fetching metadata index... " rm -f ${TINDEXHASH} fetch ${QUIETFLAG} http://${SERVERNAME}/${FETCHDIR}/t/${TINDEXHASH} 2>${QUIETREDIR} if ! [ -f ${TINDEXHASH} ]; then echo "failed." return 1 fi if [ `${SHA256} -q ${TINDEXHASH}` != ${TINDEXHASH} ]; then echo "update metadata index corrupt." return 1 fi echo "done." } # Print an error message about signed metadata being bogus. fetch_metadata_bogus () { echo echo "The update metadata$1 is correctly signed, but" echo "failed an integrity check." echo "Cowardly refusing to proceed any further." return 1 } # Construct tINDEX.new by merging the lines named in $1 from ${TINDEXHASH} # with the lines not named in $@ from tINDEX.present (if that file exists). fetch_metadata_index_merge () { for METAFILE in $@; do if [ `grep -E "^${METAFILE}\|" ${TINDEXHASH} | wc -l` \ -ne 1 ]; then fetch_metadata_bogus " index" return 1 fi grep -E "${METAFILE}\|" ${TINDEXHASH} done | sort > tINDEX.wanted if [ -f tINDEX.present ]; then join -t '|' -v 2 tINDEX.wanted tINDEX.present | sort -m - tINDEX.wanted > tINDEX.new rm tINDEX.wanted else mv tINDEX.wanted tINDEX.new fi } # Sanity check all the lines of tINDEX.new. Even if more metadata lines # are added by future versions of the server, this won't cause problems, # since the only lines which appear in tINDEX.new are the ones which we # specifically grepped out of ${TINDEXHASH}. fetch_metadata_index_sanity () { if grep -qvE '^[0-9A-Z.-]+\|[0-9a-f]{64}$' tINDEX.new; then fetch_metadata_bogus " index" return 1 fi } # Sanity check the metadata file $1. 
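# For illustration, a well-formed line for a regular file looks roughly
# like this (hypothetical values):
#     world|base|/bin/sh|f|0|0|0555|0|<64-hex sha256>|
# i.e. component, subcomponent, path and type, followed by the four
# numeric metadata fields (owner, group, permissions, flags) and the
# file hash. "d" lines carry the same leading fields but no hash, and
# "L" lines store the symlink target where the hash would be.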
fetch_metadata_sanity () { # Some aliases to save space later: ${P} is a character which can # appear in a path; ${M} is the four numeric metadata fields; and # ${H} is a sha256 hash. P="[-+./:=,%@_[~[:alnum:]]" M="[0-9]+\|[0-9]+\|[0-9]+\|[0-9]+" H="[0-9a-f]{64}" # Check that the first four fields make sense. if gunzip -c < files/$1.gz | grep -qvE "^[a-z]+\|[0-9a-z-]+\|${P}+\|[fdL-]\|"; then fetch_metadata_bogus "" return 1 fi # Remove the first three fields. gunzip -c < files/$1.gz | cut -f 4- -d '|' > sanitycheck.tmp # Sanity check entries with type 'f' if grep -E '^f' sanitycheck.tmp | grep -qvE "^f\|${M}\|${H}\|${P}*\$"; then fetch_metadata_bogus "" return 1 fi # Sanity check entries with type 'd' if grep -E '^d' sanitycheck.tmp | grep -qvE "^d\|${M}\|\|\$"; then fetch_metadata_bogus "" return 1 fi # Sanity check entries with type 'L' if grep -E '^L' sanitycheck.tmp | grep -qvE "^L\|${M}\|${P}*\|\$"; then fetch_metadata_bogus "" return 1 fi # Sanity check entries with type '-' if grep -E '^-' sanitycheck.tmp | grep -qvE "^-\|\|\|\|\|\|"; then fetch_metadata_bogus "" return 1 fi # Clean up rm sanitycheck.tmp } # Fetch the metadata index and metadata files listed in $@, # taking advantage of metadata patches where possible. fetch_metadata () { fetch_metadata_index || return 1 fetch_metadata_index_merge $@ || return 1 fetch_metadata_index_sanity || return 1 # Generate a list of wanted metadata patches join -t '|' -o 1.2,2.2 tINDEX.present tINDEX.new | fetch_make_patchlist > patchlist if [ -s patchlist ]; then # Attempt to fetch metadata patches echo -n "Fetching `wc -l < patchlist | tr -d ' '` " echo ${NDEBUG} "metadata patches.${DDSTATS}" tr '|' '-' < patchlist | lam -s "${FETCHDIR}/tp/" - -s ".gz" | xargs ${XARGST} ${PHTTPGET} ${SERVERNAME} \ 2>${STATSREDIR} | fetch_progress echo "done." # Attempt to apply metadata patches echo -n "Applying metadata patches... " tr '|' ' ' < patchlist | while read X Y; do if [ ! -f "${X}-${Y}.gz" ]; then continue; fi gunzip -c < ${X}-${Y}.gz > diff gunzip -c < files/${X}.gz > diff-OLD # Figure out which lines are being added and removed grep -E '^-' diff | cut -c 2- | while read PREFIX; do look "${PREFIX}" diff-OLD done | sort > diff-rm grep -E '^\+' diff | cut -c 2- > diff-add # Generate the new file comm -23 diff-OLD diff-rm | sort - diff-add > diff-NEW if [ `${SHA256} -q diff-NEW` = ${Y} ]; then mv diff-NEW files/${Y} gzip -n files/${Y} else mv diff-NEW ${Y}.bad fi rm -f ${X}-${Y}.gz diff rm -f diff-OLD diff-NEW diff-add diff-rm done 2>${QUIETREDIR} echo "done." fi # Update metadata without patches cut -f 2 -d '|' < tINDEX.new | while read Y; do if [ ! -f "files/${Y}.gz" ]; then echo ${Y}; fi done | sort -u > filelist if [ -s filelist ]; then echo -n "Fetching `wc -l < filelist | tr -d ' '` " echo ${NDEBUG} "metadata files... " lam -s "${FETCHDIR}/m/" - -s ".gz" < filelist | xargs ${XARGST} ${PHTTPGET} ${SERVERNAME} \ 2>${QUIETREDIR} while read Y; do if ! [ -f ${Y}.gz ]; then echo "failed." return 1 fi if [ `gunzip -c < ${Y}.gz | ${SHA256} -q` = ${Y} ]; then mv ${Y}.gz files/${Y}.gz else echo "metadata is corrupt." return 1 fi done < filelist echo "done." fi # Sanity-check the metadata files. cut -f 2 -d '|' tINDEX.new > filelist while read X; do fetch_metadata_sanity ${X} || return 1 done < filelist # Remove files which are no longer needed cut -f 2 -d '|' tINDEX.present | sort > oldfiles cut -f 2 -d '|' tINDEX.new | sort | comm -13 - oldfiles | lam -s "files/" - -s ".gz" | xargs rm -f rm patchlist filelist oldfiles rm ${TINDEXHASH} # We're done! 
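# (For reference: each line of the resulting tINDEX.present names one
# metadata file and its hash, e.g. a hypothetical
#     INDEX-NEW|<64-hex sha256>
# which is what fetch_filter_metadata_components looks up below.)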
mv tINDEX.new tINDEX.present mv tag.new tag return 0 } # Extract a subset of a downloaded metadata file containing only the parts # which are listed in COMPONENTS. fetch_filter_metadata_components () { METAHASH=`look "$1|" tINDEX.present | cut -f 2 -d '|'` gunzip -c < files/${METAHASH}.gz > $1.all # Fish out the lines belonging to components we care about. for C in ${COMPONENTS}; do look "`echo ${C} | tr '/' '|'`|" $1.all done > $1 # Remove temporary file. rm $1.all } # Generate a filtered version of the metadata file $1 from the downloaded # file, by fishing out the lines corresponding to components we're trying # to keep updated, and then removing lines corresponding to paths we want # to ignore. fetch_filter_metadata () { # Fish out the lines belonging to components we care about. fetch_filter_metadata_components $1 # Canonicalize directory names by removing any trailing / in # order to avoid listing directories multiple times if they # belong to multiple components. Turning "/" into "" doesn't # matter, since we add a leading "/" when we use paths later. cut -f 3- -d '|' $1 | sed -e 's,/|d|,|d|,' | sed -e 's,/|-|,|-|,' | sort -u > $1.tmp # Figure out which lines to ignore and remove them. for X in ${IGNOREPATHS}; do grep -E "^${X}" $1.tmp done | sort -u | comm -13 - $1.tmp > $1 # Remove temporary files. rm $1.tmp } # Filter the metadata file $1 by adding lines with "/boot/$2" # replaced by ${KERNELDIR} (which is `sysctl -n kern.bootfile` minus the # trailing "/kernel"); and if "/boot/$2" does not exist, remove # the original lines which start with that. # Put another way: Deal with the fact that the FOO kernel is sometimes # installed in /boot/FOO/ and is sometimes installed elsewhere. fetch_filter_kernel_names () { grep ^/boot/$2 $1 | sed -e "s,/boot/$2,${KERNELDIR},g" | sort - $1 > $1.tmp mv $1.tmp $1 if ! [ -d /boot/$2 ]; then grep -v ^/boot/$2 $1 > $1.tmp mv $1.tmp $1 fi } # For all paths appearing in $1 or $3, inspect the system # and generate $2 describing what is currently installed. fetch_inspect_system () { # No errors yet... rm -f .err # Tell the user why his disk is suddenly making lots of noise echo -n "Inspecting system... " # Generate list of files to inspect cat $1 $3 | cut -f 1 -d '|' | sort -u > filelist # Examine each file and output lines of the form # /path/to/file|type|device-inum|user|group|perm|flags|value # sorted by device and inode number. while read F; do # If the symlink/file/directory does not exist, record this. if ! [ -e ${BASEDIR}/${F} ]; then echo "${F}|-||||||" continue fi if ! [ -r ${BASEDIR}/${F} ]; then echo "Cannot read file: ${BASEDIR}/${F}" \ >/dev/stderr touch .err return 1 fi # Otherwise, output an index line. if [ -L ${BASEDIR}/${F} ]; then echo -n "${F}|L|" stat -n -f '%d-%i|%u|%g|%Mp%Lp|%Of|' ${BASEDIR}/${F}; readlink ${BASEDIR}/${F}; elif [ -f ${BASEDIR}/${F} ]; then echo -n "${F}|f|" stat -n -f '%d-%i|%u|%g|%Mp%Lp|%Of|' ${BASEDIR}/${F}; sha256 -q ${BASEDIR}/${F}; elif [ -d ${BASEDIR}/${F} ]; then echo -n "${F}|d|" stat -f '%d-%i|%u|%g|%Mp%Lp|%Of|' ${BASEDIR}/${F}; else echo "Unknown file type: ${BASEDIR}/${F}" \ >/dev/stderr touch .err return 1 fi done < filelist | sort -k 3,3 -t '|' > $2.tmp rm filelist # Check if an error occurred during system inspection if [ -f .err ]; then return 1 fi # Convert to the form # /path/to/file|type|user|group|perm|flags|value|hlink # by resolving identical device and inode numbers into hard links. 
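# For illustration (hypothetical inode and hash values): if the scan
# above produced
#     /bin/link|f|116-42|0|0|0555|0|<hash>
#     /bin/ln|f|116-42|0|0|0555|0|<hash>
# the shared device-inum "116-42" causes the second entry to come out as
#     /bin/ln|f|0|0|0555|0|<hash>|/bin/link
# i.e. a hard link to the first path, with the device-inum field dropped
# from both lines.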
cut -f 1,3 -d '|' $2.tmp | sort -k 1,1 -t '|' | sort -s -u -k 2,2 -t '|' | join -1 2 -2 3 -t '|' - $2.tmp | awk -F \| -v OFS=\| \ '{ if (($2 == $3) || ($4 == "-")) print $3,$4,$5,$6,$7,$8,$9,"" else print $3,$4,$5,$6,$7,$8,$9,$2 }' | sort > $2 rm $2.tmp # We're finished looking around echo "done." } # For any paths matching ${MERGECHANGES}, compare $1 and $2 and find any # files which differ; generate $3 containing these paths and the old hashes. fetch_filter_mergechanges () { # Pull out the paths and hashes of the files matching ${MERGECHANGES}. for F in $1 $2; do for X in ${MERGECHANGES}; do grep -E "^${X}" ${F} done | cut -f 1,2,7 -d '|' | sort > ${F}-values done # Any line in $2-values which doesn't appear in $1-values and is a # file means that we should list the path in $3. comm -13 $1-values $2-values | fgrep '|f|' | cut -f 1 -d '|' > $2-paths # For each path, pull out one (and only one!) entry from $1-values. # Note that we cannot distinguish which "old" version the user made # changes to; but hopefully any changes which occur due to security # updates will exist in both the "new" version and the version which # the user has installed, so the merging will still work. while read X; do look "${X}|" $1-values | head -1 done < $2-paths > $3 # Clean up rm $1-values $2-values $2-paths } # For any paths matching ${UPDATEIFUNMODIFIED}, remove lines from $[123] # which correspond to lines in $2 with hashes not matching $1 or $3, unless # the paths are listed in $4. For entries in $2 marked "not present" # (aka. type -), remove lines from $[123] unless there is a corresponding # entry in $1. fetch_filter_unmodified_notpresent () { # Figure out which lines of $1 and $3 correspond to bits which # should only be updated if they haven't changed, and fish out # the (path, type, value) tuples. # NOTE: We don't consider a file to be "modified" if it matches # the hash from $3. for X in ${UPDATEIFUNMODIFIED}; do grep -E "^${X}" $1 grep -E "^${X}" $3 done | cut -f 1,2,7 -d '|' | sort > $1-values # Do the same for $2. for X in ${UPDATEIFUNMODIFIED}; do grep -E "^${X}" $2 done | cut -f 1,2,7 -d '|' | sort > $2-values # Any entry in $2-values which is not in $1-values corresponds to # a path which we need to remove from $1, $2, and $3, unless it # that path appears in $4. comm -13 $1-values $2-values | sort -t '|' -k 1,1 > mlines.tmp cut -f 1 -d '|' $4 | sort | join -v 2 -t '|' - mlines.tmp | sort > mlines rm $1-values $2-values mlines.tmp # Any lines in $2 which are not in $1 AND are "not present" lines # also belong in mlines. comm -13 $1 $2 | cut -f 1,2,7 -d '|' | fgrep '|-|' >> mlines # Remove lines from $1, $2, and $3 for X in $1 $2 $3; do sort -t '|' -k 1,1 ${X} > ${X}.tmp cut -f 1 -d '|' < mlines | sort | join -v 2 -t '|' - ${X}.tmp | sort > ${X} rm ${X}.tmp done # Store a list of the modified files, for future reference fgrep -v '|-|' mlines | cut -f 1 -d '|' > modifiedfiles rm mlines } # For each entry in $1 of type -, remove any corresponding # entry from $2 if ${ALLOWADD} != "yes". Remove all entries # of type - from $1. fetch_filter_allowadd () { cut -f 1,2 -d '|' < $1 | fgrep '|-' | cut -f 1 -d '|' > filesnotpresent if [ ${ALLOWADD} != "yes" ]; then sort < $2 | join -v 1 -t '|' - filesnotpresent | sort > $2.tmp mv $2.tmp $2 fi sort < $1 | join -v 1 -t '|' - filesnotpresent | sort > $1.tmp mv $1.tmp $1 rm filesnotpresent } # If ${ALLOWDELETE} != "yes", then remove any entries from $1 # which don't correspond to entries in $2. 
fetch_filter_allowdelete () { # Produce a lists ${PATH}|${TYPE} for X in $1 $2; do cut -f 1-2 -d '|' < ${X} | sort -u > ${X}.nodes done # Figure out which lines need to be removed from $1. if [ ${ALLOWDELETE} != "yes" ]; then comm -23 $1.nodes $2.nodes > $1.badnodes else : > $1.badnodes fi # Remove the relevant lines from $1 while read X; do look "${X}|" $1 done < $1.badnodes | comm -13 - $1 > $1.tmp mv $1.tmp $1 rm $1.badnodes $1.nodes $2.nodes } # If ${KEEPMODIFIEDMETADATA} == "yes", then for each entry in $2 # with metadata not matching any entry in $1, replace the corresponding # line of $3 with one having the same metadata as the entry in $2. fetch_filter_modified_metadata () { # Fish out the metadata from $1 and $2 for X in $1 $2; do cut -f 1-6 -d '|' < ${X} > ${X}.metadata done # Find the metadata we need to keep if [ ${KEEPMODIFIEDMETADATA} = "yes" ]; then comm -13 $1.metadata $2.metadata > keepmeta else : > keepmeta fi # Extract the lines which we need to remove from $3, and # construct the lines which we need to add to $3. : > $3.remove : > $3.add while read LINE; do NODE=`echo "${LINE}" | cut -f 1-2 -d '|'` look "${NODE}|" $3 >> $3.remove look "${NODE}|" $3 | cut -f 7- -d '|' | lam -s "${LINE}|" - >> $3.add done < keepmeta # Remove the specified lines and add the new lines. sort $3.remove | comm -13 - $3 | sort -u - $3.add > $3.tmp mv $3.tmp $3 rm keepmeta $1.metadata $2.metadata $3.add $3.remove } # Remove lines from $1 and $2 which are identical; # no need to update a file if it isn't changing. fetch_filter_uptodate () { comm -23 $1 $2 > $1.tmp comm -13 $1 $2 > $2.tmp mv $1.tmp $1 mv $2.tmp $2 } # Fetch any "clean" old versions of files we need for merging changes. fetch_files_premerge () { # We only need to do anything if $1 is non-empty. if [ -s $1 ]; then # Tell the user what we're doing echo -n "Fetching files from ${OLDRELNUM} for merging... " # List of files wanted fgrep '|f|' < $1 | cut -f 3 -d '|' | sort -u > files.wanted # Only fetch the files we don't already have while read Y; do if [ ! -f "files/${Y}.gz" ]; then echo ${Y}; fi done < files.wanted > filelist # Actually fetch them lam -s "${OLDFETCHDIR}/f/" - -s ".gz" < filelist | xargs ${XARGST} ${PHTTPGET} ${SERVERNAME} \ 2>${QUIETREDIR} # Make sure we got them all, and move them into /files/ while read Y; do if ! [ -f ${Y}.gz ]; then echo "failed." return 1 fi if [ `gunzip -c < ${Y}.gz | ${SHA256} -q` = ${Y} ]; then mv ${Y}.gz files/${Y}.gz else echo "${Y} has incorrect hash." return 1 fi done < filelist echo "done." # Clean up rm filelist files.wanted fi } # Prepare to fetch files: Generate a list of the files we need, # copy the unmodified files we have into /files/, and generate # a list of patches to download. fetch_files_prepare () { # Tell the user why his disk is suddenly making lots of noise echo -n "Preparing to download files... " # Reduce indices to ${PATH}|${HASH} pairs for X in $1 $2 $3; do cut -f 1,2,7 -d '|' < ${X} | fgrep '|f|' | cut -f 1,3 -d '|' | sort > ${X}.hashes done # List of files wanted cut -f 2 -d '|' < $3.hashes | sort -u | while read HASH; do if ! [ -f files/${HASH}.gz ]; then echo ${HASH} fi done > files.wanted # Generate a list of unmodified files comm -12 $1.hashes $2.hashes | sort -k 1,1 -t '|' > unmodified.files # Copy all files into /files/. We only need the unmodified files # for use in patching; but we'll want all of them if the user asks # to rollback the updates later. 
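# (Sketch of how the stored copies are consumed later in fetch_files,
# using hypothetical hashes OLD and NEW: a downloaded patch named
# "OLD-NEW" is applied as
#     gunzip -c < files/OLD.gz > OLD
#     bspatch OLD NEW OLD-NEW
# and NEW is kept only if its sha256 matches the expected new hash.)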
while read LINE; do F=`echo "${LINE}" | cut -f 1 -d '|'` HASH=`echo "${LINE}" | cut -f 2 -d '|'` # Skip files we already have. if [ -f files/${HASH}.gz ]; then continue fi # Make sure the file hasn't changed. cp "${BASEDIR}/${F}" tmpfile if [ `sha256 -q tmpfile` != ${HASH} ]; then echo echo "File changed while FreeBSD Update running: ${F}" return 1 fi # Place the file into storage. gzip -c < tmpfile > files/${HASH}.gz rm tmpfile done < $2.hashes # Produce a list of patches to download sort -k 1,1 -t '|' $3.hashes | join -t '|' -o 2.2,1.2 - unmodified.files | fetch_make_patchlist > patchlist # Garbage collect rm unmodified.files $1.hashes $2.hashes $3.hashes # We don't need the list of possible old files any more. rm $1 # We're finished making noise echo "done." } # Fetch files. fetch_files () { # Attempt to fetch patches if [ -s patchlist ]; then echo -n "Fetching `wc -l < patchlist | tr -d ' '` " echo ${NDEBUG} "patches.${DDSTATS}" tr '|' '-' < patchlist | lam -s "${PATCHDIR}/" - | xargs ${XARGST} ${PHTTPGET} ${SERVERNAME} \ 2>${STATSREDIR} | fetch_progress echo "done." # Attempt to apply patches echo -n "Applying patches... " tr '|' ' ' < patchlist | while read X Y; do if [ ! -f "${X}-${Y}" ]; then continue; fi gunzip -c < files/${X}.gz > OLD bspatch OLD NEW ${X}-${Y} if [ `${SHA256} -q NEW` = ${Y} ]; then mv NEW files/${Y} gzip -n files/${Y} fi rm -f diff OLD NEW ${X}-${Y} done 2>${QUIETREDIR} echo "done." fi # Download files which couldn't be generate via patching while read Y; do if [ ! -f "files/${Y}.gz" ]; then echo ${Y}; fi done < files.wanted > filelist if [ -s filelist ]; then echo -n "Fetching `wc -l < filelist | tr -d ' '` " echo ${NDEBUG} "files... " lam -s "${FETCHDIR}/f/" - -s ".gz" < filelist | xargs ${XARGST} ${PHTTPGET} ${SERVERNAME} \ 2>${STATSREDIR} | fetch_progress while read Y; do if ! [ -f ${Y}.gz ]; then echo "failed." return 1 fi if [ `gunzip -c < ${Y}.gz | ${SHA256} -q` = ${Y} ]; then mv ${Y}.gz files/${Y}.gz else echo "${Y} has incorrect hash." return 1 fi done < filelist echo "done." fi # Clean up rm files.wanted filelist patchlist } # Create and populate install manifest directory; and report what updates # are available. fetch_create_manifest () { # If we have an existing install manifest, nuke it. if [ -L "${BDHASH}-install" ]; then rm -r ${BDHASH}-install/ rm ${BDHASH}-install fi # Report to the user if any updates were avoided due to local changes if [ -s modifiedfiles ]; then cat - modifiedfiles <<- EOF | ${PAGER} The following files are affected by updates. No changes have been downloaded, however, because the files have been modified locally: EOF fi rm modifiedfiles # If no files will be updated, tell the user and exit if ! [ -s INDEX-PRESENT ] && ! [ -s INDEX-NEW ]; then rm INDEX-PRESENT INDEX-NEW echo echo -n "No updates needed to update system to " echo "${RELNUM}-p${RELPATCHNUM}." return fi # Divide files into (a) removed files, (b) added files, and # (c) updated files. 
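# (For reference: comm -23 keeps paths appearing only in INDEX-PRESENT,
# i.e. files to be removed; comm -13 keeps paths appearing only in
# INDEX-NEW, i.e. files to be added; and comm -12 keeps paths appearing
# in both, i.e. files to be updated.)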
cut -f 1 -d '|' < INDEX-PRESENT | sort > INDEX-PRESENT.flist cut -f 1 -d '|' < INDEX-NEW | sort > INDEX-NEW.flist comm -23 INDEX-PRESENT.flist INDEX-NEW.flist > files.removed comm -13 INDEX-PRESENT.flist INDEX-NEW.flist > files.added comm -12 INDEX-PRESENT.flist INDEX-NEW.flist > files.updated rm INDEX-PRESENT.flist INDEX-NEW.flist # Report removed files, if any if [ -s files.removed ]; then cat - files.removed <<- EOF | ${PAGER} The following files will be removed as part of updating to ${RELNUM}-p${RELPATCHNUM}: EOF fi rm files.removed # Report added files, if any if [ -s files.added ]; then cat - files.added <<- EOF | ${PAGER} The following files will be added as part of updating to ${RELNUM}-p${RELPATCHNUM}: EOF fi rm files.added # Report updated files, if any if [ -s files.updated ]; then cat - files.updated <<- EOF | ${PAGER} The following files will be updated as part of updating to ${RELNUM}-p${RELPATCHNUM}: EOF fi rm files.updated # Create a directory for the install manifest. MDIR=`mktemp -d install.XXXXXX` || return 1 # Populate it mv INDEX-PRESENT ${MDIR}/INDEX-OLD mv INDEX-NEW ${MDIR}/INDEX-NEW # Link it into place ln -s ${MDIR} ${BDHASH}-install } # Warn about any upcoming EoL fetch_warn_eol () { # What's the current time? NOWTIME=`date "+%s"` # When did we last warn about the EoL date? if [ -f lasteolwarn ]; then LASTWARN=`cat lasteolwarn` else LASTWARN=`expr ${NOWTIME} - 63072000` fi # If the EoL time is past, warn. if [ ${EOLTIME} -lt ${NOWTIME} ]; then echo cat <<-EOF WARNING: `uname -sr` HAS PASSED ITS END-OF-LIFE DATE. Any security issues discovered after `date -r ${EOLTIME}` will not have been corrected. EOF return 1 fi # Figure out how long it has been since we last warned about the # upcoming EoL, and how much longer we have left. SINCEWARN=`expr ${NOWTIME} - ${LASTWARN}` TIMELEFT=`expr ${EOLTIME} - ${NOWTIME}` # Don't warn if the EoL is more than 3 months away if [ ${TIMELEFT} -gt 7884000 ]; then return 0 fi # Don't warn if the time remaining is more than 3 times the time # since the last warning. if [ ${TIMELEFT} -gt `expr ${SINCEWARN} \* 3` ]; then return 0 fi # Figure out what time units to use. if [ ${TIMELEFT} -lt 604800 ]; then UNIT="day" SIZE=86400 elif [ ${TIMELEFT} -lt 2678400 ]; then UNIT="week" SIZE=604800 else UNIT="month" SIZE=2678400 fi # Compute the right number of units NUM=`expr ${TIMELEFT} / ${SIZE}` if [ ${NUM} != 1 ]; then UNIT="${UNIT}s" fi # Print the warning echo cat <<-EOF WARNING: `uname -sr` is approaching its End-of-Life date. It is strongly recommended that you upgrade to a newer release within the next ${NUM} ${UNIT}. EOF # Update the stored time of last warning echo ${NOWTIME} > lasteolwarn } # Do the actual work involved in "fetch" / "cron". fetch_run () { workdir_init || return 1 # Prepare the mirror list. fetch_pick_server_init && fetch_pick_server # Try to fetch the public key until we run out of servers. while ! fetch_key; do fetch_pick_server || return 1 done # Try to fetch the metadata index signature ("tag") until we run # out of available servers; and sanity check the downloaded tag. while ! fetch_tag; do fetch_pick_server || return 1 done fetch_tagsanity || return 1 # Fetch the latest INDEX-NEW and INDEX-OLD files. fetch_metadata INDEX-NEW INDEX-OLD || return 1 # Generate filtered INDEX-NEW and INDEX-OLD files containing only # the lines which (a) belong to components we care about, and (b) # don't correspond to paths we're explicitly ignoring. 
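# (Illustrative freebsd-update.conf lines feeding this step, assuming
# the stock option names; "Components" populates ${COMPONENTS} and
# "IgnorePaths" populates ${IGNOREPATHS}:
#     Components src world kernel
#     IgnorePaths /usr/src
# )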
fetch_filter_metadata INDEX-NEW || return 1 fetch_filter_metadata INDEX-OLD || return 1 # Translate /boot/${KERNCONF} into ${KERNELDIR} fetch_filter_kernel_names INDEX-NEW ${KERNCONF} fetch_filter_kernel_names INDEX-OLD ${KERNCONF} # For all paths appearing in INDEX-OLD or INDEX-NEW, inspect the # system and generate an INDEX-PRESENT file. fetch_inspect_system INDEX-OLD INDEX-PRESENT INDEX-NEW || return 1 # Based on ${UPDATEIFUNMODIFIED}, remove lines from INDEX-* which # correspond to lines in INDEX-PRESENT with hashes not appearing # in INDEX-OLD or INDEX-NEW. Also remove lines where the entry in # INDEX-PRESENT has type - and there isn't a corresponding entry in # INDEX-OLD with type -. fetch_filter_unmodified_notpresent \ INDEX-OLD INDEX-PRESENT INDEX-NEW /dev/null # For each entry in INDEX-PRESENT of type -, remove any corresponding # entry from INDEX-NEW if ${ALLOWADD} != "yes". Remove all entries # of type - from INDEX-PRESENT. fetch_filter_allowadd INDEX-PRESENT INDEX-NEW # If ${ALLOWDELETE} != "yes", then remove any entries from # INDEX-PRESENT which don't correspond to entries in INDEX-NEW. fetch_filter_allowdelete INDEX-PRESENT INDEX-NEW # If ${KEEPMODIFIEDMETADATA} == "yes", then for each entry in # INDEX-PRESENT with metadata not matching any entry in INDEX-OLD, # replace the corresponding line of INDEX-NEW with one having the # same metadata as the entry in INDEX-PRESENT. fetch_filter_modified_metadata INDEX-OLD INDEX-PRESENT INDEX-NEW # Remove lines from INDEX-PRESENT and INDEX-NEW which are identical; # no need to update a file if it isn't changing. fetch_filter_uptodate INDEX-PRESENT INDEX-NEW # Prepare to fetch files: Generate a list of the files we need, # copy the unmodified files we have into /files/, and generate # a list of patches to download. fetch_files_prepare INDEX-OLD INDEX-PRESENT INDEX-NEW || return 1 # Fetch files. fetch_files || return 1 # Create and populate install manifest directory; and report what # updates are available. fetch_create_manifest || return 1 # Warn about any upcoming EoL fetch_warn_eol || return 1 } # If StrictComponents is not "yes", generate a new components list # with only the components which appear to be installed. upgrade_guess_components () { if [ "${STRICTCOMPONENTS}" = "no" ]; then # Generate filtered INDEX-ALL with only the components listed # in COMPONENTS. fetch_filter_metadata_components $1 || return 1 # Tell the user why his disk is suddenly making lots of noise echo -n "Inspecting system... " # Look at the files on disk, and assume that a component is # supposed to be present if it is more than half-present. cut -f 1-3 -d '|' < INDEX-ALL | tr '|' ' ' | while read C S F; do if [ -e ${BASEDIR}/${F} ]; then echo "+ ${C}|${S}" fi echo "= ${C}|${S}" done | sort | uniq -c | sed -E 's,^ +,,' > compfreq grep ' = ' compfreq | cut -f 1,3 -d ' ' | sort -k 2,2 -t ' ' > compfreq.total grep ' + ' compfreq | cut -f 1,3 -d ' ' | sort -k 2,2 -t ' ' > compfreq.present join -t ' ' -1 2 -2 2 compfreq.present compfreq.total | while read S P T; do if [ ${T} -ne 0 -a ${P} -gt `expr ${T} / 2` ]; then echo ${S} fi done > comp.present cut -f 2 -d ' ' < compfreq.total > comp.total rm INDEX-ALL compfreq compfreq.total compfreq.present # We're done making noise. echo "done." # Sometimes the kernel isn't installed where INDEX-ALL # thinks that it should be: In particular, it is often in # /boot/kernel instead of /boot/GENERIC or /boot/SMP. 
To # deal with this, if "kernel|X" is listed in comp.total # (i.e., is a component which would be upgraded if it is # found to be present) we will add it to comp.present. # If "kernel|" is in comp.total but "kernel|X" is # not, we print a warning -- the user is running a kernel # which isn't part of the release. KCOMP=`echo ${KERNCONF} | tr 'A-Z' 'a-z'` grep -E "^kernel\|${KCOMP}\$" comp.total >> comp.present if grep -qE "^kernel\|" comp.total && ! grep -qE "^kernel\|${KCOMP}\$" comp.total; then cat <<-EOF WARNING: This system is running a "${KCOMP}" kernel, which is not a kernel configuration distributed as part of FreeBSD ${RELNUM}. This kernel will not be updated: you MUST update the kernel manually before running "$0 install". EOF fi # Re-sort the list of installed components and generate # the list of non-installed components. sort -u < comp.present > comp.present.tmp mv comp.present.tmp comp.present comm -13 comp.present comp.total > comp.absent # Ask the user to confirm that what we have is correct. To # reduce user confusion, translate "X|Y" back to "X/Y" (as # subcomponents must be listed in the configuration file). echo echo -n "The following components of FreeBSD " echo "seem to be installed:" tr '|' '/' < comp.present | fmt -72 echo echo -n "The following components of FreeBSD " echo "do not seem to be installed:" tr '|' '/' < comp.absent | fmt -72 echo continuep || return 1 echo # Suck the generated list of components into ${COMPONENTS}. # Note that comp.present.tmp is used due to issues with # pipelines and setting variables. COMPONENTS="" tr '|' '/' < comp.present > comp.present.tmp while read C; do COMPONENTS="${COMPONENTS} ${C}" done < comp.present.tmp # Delete temporary files rm comp.present comp.present.tmp comp.absent comp.total fi } # If StrictComponents is not "yes", COMPONENTS contains an entry # corresponding to the currently running kernel, and said kernel # does not exist in the new release, add "kernel/generic" to the # list of components. upgrade_guess_new_kernel () { if [ "${STRICTCOMPONENTS}" = "no" ]; then # Grab the unfiltered metadata file. METAHASH=`look "$1|" tINDEX.present | cut -f 2 -d '|'` gunzip -c < files/${METAHASH}.gz > $1.all # If "kernel/${KCOMP}" is in ${COMPONENTS} and that component # isn't in $1.all, we need to add kernel/generic. for C in ${COMPONENTS}; do if [ ${C} = "kernel/${KCOMP}" ] && ! grep -qE "^kernel\|${KCOMP}\|" $1.all; then COMPONENTS="${COMPONENTS} kernel/generic" NKERNCONF="GENERIC" cat <<-EOF WARNING: This system is running a "${KCOMP}" kernel, which is not a kernel configuration distributed as part of FreeBSD ${RELNUM}. As part of upgrading to FreeBSD ${RELNUM}, this kernel will be replaced with a "generic" kernel. EOF continuep || return 1 fi done # Don't need this any more... rm $1.all fi } # Convert INDEX-OLD (last release) and INDEX-ALL (new release) into # INDEX-OLD and INDEX-NEW files (in the sense of normal upgrades). upgrade_oldall_to_oldnew () { # For each ${F}|... which appears in INDEX-ALL but does not appear # in INDEX-OLD, add ${F}|-|||||| to INDEX-OLD. cut -f 1 -d '|' < $1 | sort -u > $1.paths cut -f 1 -d '|' < $2 | sort -u | comm -13 $1.paths - | lam - -s "|-||||||" | sort - $1 > $1.tmp mv $1.tmp $1 # Remove lines from INDEX-OLD which also appear in INDEX-ALL comm -23 $1 $2 > $1.tmp mv $1.tmp $1 # Remove lines from INDEX-ALL which have a file name not appearing # anywhere in INDEX-OLD (since these must be files which haven't # changed -- if they were new, there would be an entry of type "-"). 
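# (For illustration: a synthetic "not present in the old release" entry
# added in the first step above looks like a hypothetical
#     /usr/bin/newtool|-||||||
# -- type "-" with every other field empty -- which is why genuinely new
# files always keep their INDEX-ALL lines here.)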
cut -f 1 -d '|' < $1 | sort -u > $1.paths sort -k 1,1 -t '|' < $2 | join -t '|' - $1.paths | sort > $2.tmp rm $1.paths mv $2.tmp $2 # Rename INDEX-ALL to INDEX-NEW. mv $2 $3 } # Helper for upgrade_merge: Return zero true iff the two files differ only # in the contents of their RCS tags. samef () { X=`sed -E 's/\\$FreeBSD.*\\$/\$FreeBSD\$/' < $1 | ${SHA256}` Y=`sed -E 's/\\$FreeBSD.*\\$/\$FreeBSD\$/' < $2 | ${SHA256}` if [ $X = $Y ]; then return 0; else return 1; fi } # From the list of "old" files in $1, merge changes in $2 with those in $3, # and update $3 to reflect the hashes of merged files. upgrade_merge () { # We only need to do anything if $1 is non-empty. if [ -s $1 ]; then cut -f 1 -d '|' $1 | sort > $1-paths # Create staging area for merging files rm -rf merge/ while read F; do D=`dirname ${F}` mkdir -p merge/old/${D} mkdir -p merge/${OLDRELNUM}/${D} mkdir -p merge/${RELNUM}/${D} mkdir -p merge/new/${D} done < $1-paths # Copy in files while read F; do # Currently installed file V=`look "${F}|" $2 | cut -f 7 -d '|'` gunzip < files/${V}.gz > merge/old/${F} # Old release if look "${F}|" $1 | fgrep -q "|f|"; then V=`look "${F}|" $1 | cut -f 3 -d '|'` gunzip < files/${V}.gz \ > merge/${OLDRELNUM}/${F} fi # New release if look "${F}|" $3 | cut -f 1,2,7 -d '|' | fgrep -q "|f|"; then V=`look "${F}|" $3 | cut -f 7 -d '|'` gunzip < files/${V}.gz \ > merge/${RELNUM}/${F} fi done < $1-paths # Attempt to automatically merge changes echo -n "Attempting to automatically merge " echo -n "changes in files..." : > failed.merges while read F; do # If the file doesn't exist in the new release, # the result of "merging changes" is having the file # not exist. if ! [ -f merge/${RELNUM}/${F} ]; then continue fi # If the file didn't exist in the old release, we're # going to throw away the existing file and hope that # the version from the new release is what we want. if ! [ -f merge/${OLDRELNUM}/${F} ]; then cp merge/${RELNUM}/${F} merge/new/${F} continue fi # Some files need special treatment. case ${F} in /etc/spwd.db | /etc/pwd.db | /etc/login.conf.db) # Don't merge these -- we're rebuild them # after updates are installed. cp merge/old/${F} merge/new/${F} ;; *) if ! diff3 -E -m -L "current version" \ -L "${OLDRELNUM}" -L "${RELNUM}" \ merge/old/${F} \ merge/${OLDRELNUM}/${F} \ merge/${RELNUM}/${F} \ > merge/new/${F} 2>/dev/null; then echo ${F} >> failed.merges fi ;; esac done < $1-paths echo " done." # Ask the user to handle any files which didn't merge. while read F; do # If the installed file differs from the version in # the old release only due to RCS tag expansion # then just use the version in the new release. if samef merge/old/${F} merge/${OLDRELNUM}/${F}; then cp merge/${RELNUM}/${F} merge/new/${F} continue fi cat <<-EOF The following file could not be merged automatically: ${F} Press Enter to edit this file in ${EDITOR} and resolve the conflicts manually... EOF read dummy files/${V}.gz echo "${F}|${V}" fi done < $1-paths > newhashes # Pull lines out from $3 which need to be updated to # reflect merged files. while read F; do look "${F}|" $3 done < $1-paths > $3-oldlines # Update lines to reflect merged files join -t '|' -o 1.1,1.2,1.3,1.4,1.5,1.6,2.2,1.8 \ $3-oldlines newhashes > $3-newlines # Remove old lines from $3 and add new lines. sort $3-oldlines | comm -13 - $3 | sort - $3-newlines > $3.tmp mv $3.tmp $3 # Clean up rm $1-paths newhashes $3-oldlines $3-newlines rm -rf merge/ fi # We're done with merging files. 
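# (Note on the merge pass above: when diff3 cannot merge a file cleanly,
# the copy opened in ${EDITOR} contains conflict markers of roughly this
# form, built from the -L labels, with "12.1-RELEASE" standing in for
# whatever ${RELNUM} happens to be:
#     <<<<<<< current version
#     ...locally installed text...
#     =======
#     ...text from the new release...
#     >>>>>>> 12.1-RELEASE
# )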
rm $1 } # Do the work involved in fetching upgrades to a new release upgrade_run () { workdir_init || return 1 # Prepare the mirror list. fetch_pick_server_init && fetch_pick_server # Try to fetch the public key until we run out of servers. while ! fetch_key; do fetch_pick_server || return 1 done # Try to fetch the metadata index signature ("tag") until we run # out of available servers; and sanity check the downloaded tag. while ! fetch_tag; do fetch_pick_server || return 1 done fetch_tagsanity || return 1 # Fetch the INDEX-OLD and INDEX-ALL. fetch_metadata INDEX-OLD INDEX-ALL || return 1 # If StrictComponents is not "yes", generate a new components list # with only the components which appear to be installed. upgrade_guess_components INDEX-ALL || return 1 # Generate filtered INDEX-OLD and INDEX-ALL files containing only # the components we want and without anything marked as "Ignore". fetch_filter_metadata INDEX-OLD || return 1 fetch_filter_metadata INDEX-ALL || return 1 # Merge the INDEX-OLD and INDEX-ALL files into INDEX-OLD. sort INDEX-OLD INDEX-ALL > INDEX-OLD.tmp mv INDEX-OLD.tmp INDEX-OLD rm INDEX-ALL # Adjust variables for fetching files from the new release. OLDRELNUM=${RELNUM} RELNUM=${TARGETRELEASE} OLDFETCHDIR=${FETCHDIR} FETCHDIR=${RELNUM}/${ARCH} # Try to fetch the NEW metadata index signature ("tag") until we run # out of available servers; and sanity check the downloaded tag. while ! fetch_tag; do fetch_pick_server || return 1 done # Fetch the new INDEX-ALL. fetch_metadata INDEX-ALL || return 1 # If StrictComponents is not "yes", COMPONENTS contains an entry # corresponding to the currently running kernel, and said kernel # does not exist in the new release, add "kernel/generic" to the # list of components. upgrade_guess_new_kernel INDEX-ALL || return 1 # Filter INDEX-ALL to contain only the components we want and without # anything marked as "Ignore". fetch_filter_metadata INDEX-ALL || return 1 # Convert INDEX-OLD (last release) and INDEX-ALL (new release) into # INDEX-OLD and INDEX-NEW files (in the sense of normal upgrades). upgrade_oldall_to_oldnew INDEX-OLD INDEX-ALL INDEX-NEW # Translate /boot/${KERNCONF} or /boot/${NKERNCONF} into ${KERNELDIR} fetch_filter_kernel_names INDEX-NEW ${NKERNCONF} fetch_filter_kernel_names INDEX-OLD ${KERNCONF} # For all paths appearing in INDEX-OLD or INDEX-NEW, inspect the # system and generate an INDEX-PRESENT file. fetch_inspect_system INDEX-OLD INDEX-PRESENT INDEX-NEW || return 1 # Based on ${MERGECHANGES}, generate a file tomerge-old with the # paths and hashes of old versions of files to merge. fetch_filter_mergechanges INDEX-OLD INDEX-PRESENT tomerge-old # Based on ${UPDATEIFUNMODIFIED}, remove lines from INDEX-* which # correspond to lines in INDEX-PRESENT with hashes not appearing # in INDEX-OLD or INDEX-NEW. Also remove lines where the entry in # INDEX-PRESENT has type - and there isn't a corresponding entry in # INDEX-OLD with type -. fetch_filter_unmodified_notpresent \ INDEX-OLD INDEX-PRESENT INDEX-NEW tomerge-old # For each entry in INDEX-PRESENT of type -, remove any corresponding # entry from INDEX-NEW if ${ALLOWADD} != "yes". Remove all entries # of type - from INDEX-PRESENT. fetch_filter_allowadd INDEX-PRESENT INDEX-NEW # If ${ALLOWDELETE} != "yes", then remove any entries from # INDEX-PRESENT which don't correspond to entries in INDEX-NEW. 
fetch_filter_allowdelete INDEX-PRESENT INDEX-NEW # If ${KEEPMODIFIEDMETADATA} == "yes", then for each entry in # INDEX-PRESENT with metadata not matching any entry in INDEX-OLD, # replace the corresponding line of INDEX-NEW with one having the # same metadata as the entry in INDEX-PRESENT. fetch_filter_modified_metadata INDEX-OLD INDEX-PRESENT INDEX-NEW # Remove lines from INDEX-PRESENT and INDEX-NEW which are identical; # no need to update a file if it isn't changing. fetch_filter_uptodate INDEX-PRESENT INDEX-NEW # Fetch "clean" files from the old release for merging changes. fetch_files_premerge tomerge-old # Prepare to fetch files: Generate a list of the files we need, # copy the unmodified files we have into /files/, and generate # a list of patches to download. fetch_files_prepare INDEX-OLD INDEX-PRESENT INDEX-NEW || return 1 # Fetch patches from to-${RELNUM}/${ARCH}/bp/ PATCHDIR=to-${RELNUM}/${ARCH}/bp fetch_files || return 1 # Merge configuration file changes. upgrade_merge tomerge-old INDEX-PRESENT INDEX-NEW || return 1 # Create and populate install manifest directory; and report what # updates are available. fetch_create_manifest || return 1 # Leave a note behind to tell the "install" command that the kernel # needs to be installed before the world. touch ${BDHASH}-install/kernelfirst # Remind the user that they need to run "freebsd-update install" # to install the downloaded bits, in case they didn't RTFM. echo "To install the downloaded upgrades, run \"$0 install\"." } # Make sure that all the file hashes mentioned in $@ have corresponding # gzipped files stored in /files/. install_verify () { # Generate a list of hashes cat $@ | cut -f 2,7 -d '|' | grep -E '^f' | cut -f 2 -d '|' | sort -u > filelist # Make sure all the hashes exist while read HASH; do if ! [ -f files/${HASH}.gz ]; then echo -n "Update files missing -- " echo "this should never happen." echo "Re-run '$0 fetch'." return 1 fi done < filelist # Clean up rm filelist } # Remove the system immutable flag from files install_unschg () { # Generate file list cat $@ | cut -f 1 -d '|' > filelist # Remove flags while read F; do if ! [ -e ${BASEDIR}/${F} ]; then continue else echo ${BASEDIR}/${F} fi done < filelist | xargs chflags noschg || return 1 # Clean up rm filelist } # Decide which directory name to use for kernel backups. backup_kernel_finddir () { CNT=0 while true ; do # Pathname does not exist, so it is OK use that name # for backup directory. if [ ! -e $BASEDIR/$BACKUPKERNELDIR ]; then return 0 fi # If directory do exist, we only use if it has our # marker file. if [ -d $BASEDIR/$BACKUPKERNELDIR -a \ -e $BASEDIR/$BACKUPKERNELDIR/.freebsd-update ]; then return 0 fi # We could not use current directory name, so add counter to # the end and try again. CNT=$((CNT + 1)) if [ $CNT -gt 9 ]; then echo "Could not find valid backup dir ($BASEDIR/$BACKUPKERNELDIR)" exit 1 fi BACKUPKERNELDIR="`echo $BACKUPKERNELDIR | sed -Ee 's/[0-9]\$//'`" BACKUPKERNELDIR="${BACKUPKERNELDIR}${CNT}" done } # Backup the current kernel using hardlinks, if not disabled by user. # Since we delete all files in the directory used for previous backups # we create a marker file called ".freebsd-update" in the directory so # we can determine on the next run that the directory was created by # freebsd-update and we then do not accidentally remove user files in # the unlikely case that the user has created a directory with a # conflicting name. backup_kernel () { # Only make kernel backup is so configured. 
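# (For reference: with the defaults set by default_params above, this
# hard-links the running kernel directory ${BASEDIR}/${KERNELDIR} into
# ${BASEDIR}/boot/kernel.old, skipping symbol files. The corresponding
# freebsd-update.conf options, assuming the stock names, are:
#     BackupKernel yes
#     BackupKernelDir /boot/kernel.old
#     BackupKernelSymbolFiles no
# )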
if [ $BACKUPKERNEL != yes ]; then return 0 fi # Decide which directory name to use for kernel backups. backup_kernel_finddir # Remove old kernel backup files. If $BACKUPKERNELDIR was # "not ours", backup_kernel_finddir would have exited, so # deleting the directory content is as safe as we can make it. if [ -d $BASEDIR/$BACKUPKERNELDIR ]; then rm -fr $BASEDIR/$BACKUPKERNELDIR fi # Create directories for backup. mkdir -p $BASEDIR/$BACKUPKERNELDIR mtree -cdn -p "${BASEDIR}/${KERNELDIR}" | \ mtree -Ue -p "${BASEDIR}/${BACKUPKERNELDIR}" > /dev/null # Mark the directory as having been created by freebsd-update. touch $BASEDIR/$BACKUPKERNELDIR/.freebsd-update if [ $? -ne 0 ]; then echo "Could not create kernel backup directory" exit 1 fi # Disable pathname expansion to be sure *.symbols is not # expanded. set -f # Use find to ignore symbol files, unless disabled by user. if [ $BACKUPKERNELSYMBOLFILES = yes ]; then FINDFILTER="" else FINDFILTER="-a ! -name *.debug -a ! -name *.symbols" fi # Backup all the kernel files using hardlinks. (cd ${BASEDIR}/${KERNELDIR} && find . -type f $FINDFILTER -exec \ cp -pl '{}' ${BASEDIR}/${BACKUPKERNELDIR}/'{}' \;) # Re-enable patchname expansion. set +f } # Install new files install_from_index () { # First pass: Do everything apart from setting file flags. We # can't set flags yet, because schg inhibits hard linking. sort -k 1,1 -t '|' $1 | tr '|' ' ' | while read FPATH TYPE OWNER GROUP PERM FLAGS HASH LINK; do case ${TYPE} in d) # Create a directory install -d -o ${OWNER} -g ${GROUP} \ -m ${PERM} ${BASEDIR}/${FPATH} ;; f) if [ -z "${LINK}" ]; then # Create a file, without setting flags. gunzip < files/${HASH}.gz > ${HASH} install -S -o ${OWNER} -g ${GROUP} \ -m ${PERM} ${HASH} ${BASEDIR}/${FPATH} rm ${HASH} else # Create a hard link. ln -f ${BASEDIR}/${LINK} ${BASEDIR}/${FPATH} fi ;; L) # Create a symlink ln -sfh ${HASH} ${BASEDIR}/${FPATH} ;; esac done # Perform a second pass, adding file flags. tr '|' ' ' < $1 | while read FPATH TYPE OWNER GROUP PERM FLAGS HASH LINK; do if [ ${TYPE} = "f" ] && ! [ ${FLAGS} = "0" ]; then chflags ${FLAGS} ${BASEDIR}/${FPATH} fi done } # Remove files which we want to delete install_delete () { # Generate list of new files cut -f 1 -d '|' < $2 | sort > newfiles # Generate subindex of old files we want to nuke sort -k 1,1 -t '|' $1 | join -t '|' -v 1 - newfiles | sort -r -k 1,1 -t '|' | cut -f 1,2 -d '|' | tr '|' ' ' > killfiles # Remove the offending bits while read FPATH TYPE; do case ${TYPE} in d) rmdir ${BASEDIR}/${FPATH} ;; f) rm ${BASEDIR}/${FPATH} ;; L) rm ${BASEDIR}/${FPATH} ;; esac done < killfiles # Clean up rm newfiles killfiles } # Install new files, delete old files, and update linker.hints install_files () { # If we haven't already dealt with the kernel, deal with it. if ! [ -f $1/kerneldone ]; then grep -E '^/boot/' $1/INDEX-OLD > INDEX-OLD grep -E '^/boot/' $1/INDEX-NEW > INDEX-NEW # Backup current kernel before installing a new one backup_kernel || return 1 # Install new files install_from_index INDEX-NEW || return 1 # Remove files which need to be deleted install_delete INDEX-OLD INDEX-NEW || return 1 # Update linker.hints if necessary if [ -s INDEX-OLD -o -s INDEX-NEW ]; then kldxref -R ${BASEDIR}/boot/ 2>/dev/null fi # We've finished updating the kernel. touch $1/kerneldone # Do we need to ask for a reboot now? if [ -f $1/kernelfirst ] && [ -s INDEX-OLD -o -s INDEX-NEW ]; then cat <<-EOF Kernel updates have been installed. Please reboot and run "$0 install" again to finish installing updates. 
EOF exit 0 fi fi # If we haven't already dealt with the world, deal with it. if ! [ -f $1/worlddone ]; then # Create any necessary directories first grep -vE '^/boot/' $1/INDEX-NEW | grep -E '^[^|]+\|d\|' > INDEX-NEW install_from_index INDEX-NEW || return 1 # Install new runtime linker grep -vE '^/boot/' $1/INDEX-NEW | grep -vE '^[^|]+\|d\|' | grep -E '^/libexec/ld-elf[^|]*\.so\.[0-9]+\|' > INDEX-NEW install_from_index INDEX-NEW || return 1 # Install new shared libraries next grep -vE '^/boot/' $1/INDEX-NEW | grep -vE '^[^|]+\|d\|' | grep -vE '^/libexec/ld-elf[^|]*\.so\.[0-9]+\|' | grep -E '^[^|]*/lib/[^|]*\.so\.[0-9]+\|' > INDEX-NEW install_from_index INDEX-NEW || return 1 # Deal with everything else grep -vE '^/boot/' $1/INDEX-OLD | grep -vE '^[^|]+\|d\|' | grep -vE '^/libexec/ld-elf[^|]*\.so\.[0-9]+\|' | grep -vE '^[^|]*/lib/[^|]*\.so\.[0-9]+\|' > INDEX-OLD grep -vE '^/boot/' $1/INDEX-NEW | grep -vE '^[^|]+\|d\|' | grep -vE '^/libexec/ld-elf[^|]*\.so\.[0-9]+\|' | grep -vE '^[^|]*/lib/[^|]*\.so\.[0-9]+\|' > INDEX-NEW install_from_index INDEX-NEW || return 1 install_delete INDEX-OLD INDEX-NEW || return 1 # Rebuild generated pwd files. if [ ${BASEDIR}/etc/master.passwd -nt ${BASEDIR}/etc/spwd.db ] || [ ${BASEDIR}/etc/master.passwd -nt ${BASEDIR}/etc/pwd.db ] || [ ${BASEDIR}/etc/master.passwd -nt ${BASEDIR}/etc/passwd ]; then pwd_mkdb -d ${BASEDIR}/etc -p ${BASEDIR}/etc/master.passwd fi # Rebuild /etc/login.conf.db if necessary. if [ ${BASEDIR}/etc/login.conf -nt ${BASEDIR}/etc/login.conf.db ]; then cap_mkdb ${BASEDIR}/etc/login.conf fi # Rebuild man page databases, if necessary. for D in /usr/share/man /usr/share/openssl/man; do if [ ! -d ${BASEDIR}/$D ]; then continue fi if [ -z "$(find ${BASEDIR}/$D -type f -newer ${BASEDIR}/$D/mandoc.db)" ]; then continue; fi makewhatis ${BASEDIR}/$D done # We've finished installing the world and deleting old files # which are not shared libraries. touch $1/worlddone # Do we need to ask the user to portupgrade now? grep -vE '^/boot/' $1/INDEX-NEW | grep -E '^[^|]*/lib/[^|]*\.so\.[0-9]+\|' | cut -f 1 -d '|' | sort > newfiles if grep -vE '^/boot/' $1/INDEX-OLD | grep -E '^[^|]*/lib/[^|]*\.so\.[0-9]+\|' | cut -f 1 -d '|' | sort | join -v 1 - newfiles | grep -q .; then cat <<-EOF Completing this upgrade requires removing old shared object files. Please rebuild all installed 3rd party software (e.g., programs installed from the ports tree) and then run "$0 install" again to finish installing updates. EOF rm newfiles exit 0 fi rm newfiles fi # Remove old shared libraries grep -vE '^/boot/' $1/INDEX-NEW | grep -vE '^[^|]+\|d\|' | grep -E '^[^|]*/lib/[^|]*\.so\.[0-9]+\|' > INDEX-NEW grep -vE '^/boot/' $1/INDEX-OLD | grep -vE '^[^|]+\|d\|' | grep -E '^[^|]*/lib/[^|]*\.so\.[0-9]+\|' > INDEX-OLD install_delete INDEX-OLD INDEX-NEW || return 1 # Remove old directories grep -vE '^/boot/' $1/INDEX-NEW | grep -E '^[^|]+\|d\|' > INDEX-NEW grep -vE '^/boot/' $1/INDEX-OLD | grep -E '^[^|]+\|d\|' > INDEX-OLD install_delete INDEX-OLD INDEX-NEW || return 1 # Remove temporary files rm INDEX-OLD INDEX-NEW } # Rearrange bits to allow the installed updates to be rolled back install_setup_rollback () { # Remove the "reboot after installing kernel", "kernel updated", and # "finished installing the world" flags if present -- they are # irrelevant when rolling back updates. 
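# (For reference: successive "install" runs chain their rollback data,
# hypothetically, as
#     ${BDHASH}-rollback/            the most recently installed updates
#     ${BDHASH}-rollback/rollback/   the set installed before those
# and rollback_setup_rollback later unwinds this chain one level per
# "rollback" run.)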
if [ -f ${BDHASH}-install/kernelfirst ]; then rm ${BDHASH}-install/kernelfirst rm ${BDHASH}-install/kerneldone fi if [ -f ${BDHASH}-install/worlddone ]; then rm ${BDHASH}-install/worlddone fi if [ -L ${BDHASH}-rollback ]; then mv ${BDHASH}-rollback ${BDHASH}-install/rollback fi mv ${BDHASH}-install ${BDHASH}-rollback } # Actually install updates install_run () { echo -n "Installing updates..." # Make sure we have all the files we should have install_verify ${BDHASH}-install/INDEX-OLD \ ${BDHASH}-install/INDEX-NEW || return 1 # Remove system immutable flag from files install_unschg ${BDHASH}-install/INDEX-OLD \ ${BDHASH}-install/INDEX-NEW || return 1 # Install new files, delete old files, and update linker.hints install_files ${BDHASH}-install || return 1 # Rearrange bits to allow the installed updates to be rolled back install_setup_rollback echo " done." } # Rearrange bits to allow the previous set of updates to be rolled back next. rollback_setup_rollback () { if [ -L ${BDHASH}-rollback/rollback ]; then mv ${BDHASH}-rollback/rollback rollback-tmp rm -r ${BDHASH}-rollback/ rm ${BDHASH}-rollback mv rollback-tmp ${BDHASH}-rollback else rm -r ${BDHASH}-rollback/ rm ${BDHASH}-rollback fi } # Install old files, delete new files, and update linker.hints rollback_files () { # Install old shared library files which don't have the same path as # a new shared library file. grep -vE '^/boot/' $1/INDEX-NEW | grep -E '/lib/.*\.so\.[0-9]+\|' | cut -f 1 -d '|' | sort > INDEX-NEW.libs.flist grep -vE '^/boot/' $1/INDEX-OLD | grep -E '/lib/.*\.so\.[0-9]+\|' | sort -k 1,1 -t '|' - | join -t '|' -v 1 - INDEX-NEW.libs.flist > INDEX-OLD install_from_index INDEX-OLD || return 1 # Deal with files which are neither kernel nor shared library grep -vE '^/boot/' $1/INDEX-OLD | grep -vE '/lib/.*\.so\.[0-9]+\|' > INDEX-OLD grep -vE '^/boot/' $1/INDEX-NEW | grep -vE '/lib/.*\.so\.[0-9]+\|' > INDEX-NEW install_from_index INDEX-OLD || return 1 install_delete INDEX-NEW INDEX-OLD || return 1 # Install any old shared library files which we didn't install above. grep -vE '^/boot/' $1/INDEX-OLD | grep -E '/lib/.*\.so\.[0-9]+\|' | sort -k 1,1 -t '|' - | join -t '|' - INDEX-NEW.libs.flist > INDEX-OLD install_from_index INDEX-OLD || return 1 # Delete unneeded shared library files grep -vE '^/boot/' $1/INDEX-OLD | grep -E '/lib/.*\.so\.[0-9]+\|' > INDEX-OLD grep -vE '^/boot/' $1/INDEX-NEW | grep -E '/lib/.*\.so\.[0-9]+\|' > INDEX-NEW install_delete INDEX-NEW INDEX-OLD || return 1 # Deal with kernel files grep -E '^/boot/' $1/INDEX-OLD > INDEX-OLD grep -E '^/boot/' $1/INDEX-NEW > INDEX-NEW install_from_index INDEX-OLD || return 1 install_delete INDEX-NEW INDEX-OLD || return 1 if [ -s INDEX-OLD -o -s INDEX-NEW ]; then kldxref -R /boot/ 2>/dev/null fi # Remove temporary files rm INDEX-OLD INDEX-NEW INDEX-NEW.libs.flist } # Actually rollback updates rollback_run () { echo -n "Uninstalling updates..." # If there are updates waiting to be installed, remove them; we # want the user to re-run 'fetch' after rolling back updates. 
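	# (Illustrative note: in the usual sequence
	#      freebsd-update fetch
	#      freebsd-update install
	#      freebsd-update rollback
	#  this function undoes the most recent install; updates that were
	#  fetched but never installed are simply discarded below.)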
if [ -L ${BDHASH}-install ]; then rm -r ${BDHASH}-install/ rm ${BDHASH}-install fi # Make sure we have all the files we should have install_verify ${BDHASH}-rollback/INDEX-NEW \ ${BDHASH}-rollback/INDEX-OLD || return 1 # Remove system immutable flag from files install_unschg ${BDHASH}-rollback/INDEX-NEW \ ${BDHASH}-rollback/INDEX-OLD || return 1 # Install old files, delete new files, and update linker.hints rollback_files ${BDHASH}-rollback || return 1 # Remove the rollback directory and the symlink pointing to it; and # rearrange bits to allow the previous set of updates to be rolled # back next. rollback_setup_rollback echo " done." } # Compare INDEX-ALL and INDEX-PRESENT and print warnings about differences. IDS_compare () { # Get all the lines which mismatch in something other than file # flags. We ignore file flags because sysinstall doesn't seem to # set them when it installs FreeBSD; warning about these adds a # very large amount of noise. cut -f 1-5,7-8 -d '|' $1 > $1.noflags sort -k 1,1 -t '|' $1.noflags > $1.sorted cut -f 1-5,7-8 -d '|' $2 | comm -13 $1.noflags - | fgrep -v '|-|||||' | sort -k 1,1 -t '|' | join -t '|' $1.sorted - > INDEX-NOTMATCHING # Ignore files which match IDSIGNOREPATHS. for X in ${IDSIGNOREPATHS}; do grep -E "^${X}" INDEX-NOTMATCHING done | sort -u | comm -13 - INDEX-NOTMATCHING > INDEX-NOTMATCHING.tmp mv INDEX-NOTMATCHING.tmp INDEX-NOTMATCHING # Go through the lines and print warnings. local IFS='|' while read FPATH TYPE OWNER GROUP PERM HASH LINK P_TYPE P_OWNER P_GROUP P_PERM P_HASH P_LINK; do # Warn about different object types. if ! [ "${TYPE}" = "${P_TYPE}" ]; then echo -n "${FPATH} is a " case "${P_TYPE}" in f) echo -n "regular file, " ;; d) echo -n "directory, " ;; L) echo -n "symlink, " ;; esac echo -n "but should be a " case "${TYPE}" in f) echo -n "regular file." ;; d) echo -n "directory." ;; L) echo -n "symlink." ;; esac echo # Skip other tests, since they don't make sense if # we're comparing different object types. continue fi # Warn about different owners. if ! [ "${OWNER}" = "${P_OWNER}" ]; then echo -n "${FPATH} is owned by user id ${P_OWNER}, " echo "but should be owned by user id ${OWNER}." fi # Warn about different groups. if ! [ "${GROUP}" = "${P_GROUP}" ]; then echo -n "${FPATH} is owned by group id ${P_GROUP}, " echo "but should be owned by group id ${GROUP}." fi # Warn about different permissions. We do not warn about # different permissions on symlinks, since some archivers # don't extract symlink permissions correctly and they are # ignored anyway. if ! [ "${PERM}" = "${P_PERM}" ] && ! [ "${TYPE}" = "L" ]; then echo -n "${FPATH} has ${P_PERM} permissions, " echo "but should have ${PERM} permissions." fi # Warn about different file hashes / symlink destinations. if ! [ "${HASH}" = "${P_HASH}" ]; then if [ "${TYPE}" = "L" ]; then echo -n "${FPATH} is a symlink to ${P_HASH}, " echo "but should be a symlink to ${HASH}." fi if [ "${TYPE}" = "f" ]; then echo -n "${FPATH} has SHA256 hash ${P_HASH}, " echo "but should have SHA256 hash ${HASH}." fi fi # We don't warn about different hard links, since some # some archivers break hard links, and as long as the # underlying data is correct they really don't matter. done < INDEX-NOTMATCHING # Clean up rm $1 $1.noflags $1.sorted $2 INDEX-NOTMATCHING } # Do the work involved in comparing the system to a "known good" index IDS_run () { workdir_init || return 1 # Prepare the mirror list. fetch_pick_server_init && fetch_pick_server # Try to fetch the public key until we run out of servers. while ! 
fetch_key; do fetch_pick_server || return 1 done # Try to fetch the metadata index signature ("tag") until we run # out of available servers; and sanity check the downloaded tag. while ! fetch_tag; do fetch_pick_server || return 1 done fetch_tagsanity || return 1 # Fetch INDEX-OLD and INDEX-ALL. fetch_metadata INDEX-OLD INDEX-ALL || return 1 # Generate filtered INDEX-OLD and INDEX-ALL files containing only # the components we want and without anything marked as "Ignore". fetch_filter_metadata INDEX-OLD || return 1 fetch_filter_metadata INDEX-ALL || return 1 # Merge the INDEX-OLD and INDEX-ALL files into INDEX-ALL. sort INDEX-OLD INDEX-ALL > INDEX-ALL.tmp mv INDEX-ALL.tmp INDEX-ALL rm INDEX-OLD # Translate /boot/${KERNCONF} to ${KERNELDIR} fetch_filter_kernel_names INDEX-ALL ${KERNCONF} # Inspect the system and generate an INDEX-PRESENT file. fetch_inspect_system INDEX-ALL INDEX-PRESENT /dev/null || return 1 # Compare INDEX-ALL and INDEX-PRESENT and print warnings about any # differences. IDS_compare INDEX-ALL INDEX-PRESENT } #### Main functions -- call parameter-handling and core functions # Using the command line, configuration file, and defaults, # set all the parameters which are needed later. get_params () { init_params parse_cmdline $@ parse_conffile default_params + finalize_components_config ${COMPONENTS} } # Fetch command. Make sure that we're being called # interactively, then run fetch_check_params and fetch_run cmd_fetch () { if [ ! -t 0 -a $NOTTYOK -eq 0 ]; then echo -n "`basename $0` fetch should not " echo "be run non-interactively." echo "Run `basename $0` cron instead." exit 1 fi fetch_check_params fetch_run || exit 1 ISFETCHED=1 } # Cron command. Make sure the parameters are sensible; wait # rand(3600) seconds; then fetch updates. While fetching updates, # send output to a temporary file; only print that file if the # fetching failed. cmd_cron () { fetch_check_params sleep `jot -r 1 0 3600` TMPFILE=`mktemp /tmp/freebsd-update.XXXXXX` || exit 1 if ! fetch_run >> ${TMPFILE} || ! grep -q "No updates needed" ${TMPFILE} || [ ${VERBOSELEVEL} = "debug" ]; then mail -s "`hostname` security updates" ${MAILTO} < ${TMPFILE} fi rm ${TMPFILE} } # Fetch files for upgrading to a new release. cmd_upgrade () { upgrade_check_params upgrade_run || exit 1 } # Install downloaded updates. cmd_install () { install_check_params install_run || exit 1 } # Rollback most recently installed updates. cmd_rollback () { rollback_check_params rollback_run || exit 1 } # Compare system against a "known good" index. cmd_IDS () { IDS_check_params IDS_run || exit 1 } #### Entry point # Make sure we find utilities from the base system export PATH=/sbin:/bin:/usr/sbin:/usr/bin:${PATH} # Set a pager if the user doesn't if [ -z "$PAGER" ]; then PAGER=/usr/bin/less fi # Set LC_ALL in order to avoid problems with character ranges like [A-Z]. export LC_ALL=C get_params $@ for COMMAND in ${COMMANDS}; do cmd_${COMMAND} done Index: projects/clang900-import/usr.sbin/rpc.statd/Makefile =================================================================== --- projects/clang900-import/usr.sbin/rpc.statd/Makefile (revision 352586) +++ projects/clang900-import/usr.sbin/rpc.statd/Makefile (revision 352587) @@ -1,26 +1,28 @@ # $FreeBSD$ PROG= rpc.statd MAN= rpc.statd.8 SRCS= file.c sm_inter_svc.c sm_inter.h statd.c procs.c -CFLAGS+= -I. -WARNS?= 2 - +CFLAGS+= -I${.OBJDIR} LIBADD= rpcsvc +# XXX: mismatch between (xdrproc_t) and xdr_void(). 
+WARNS?= 2 + CLEANFILES= sm_inter_svc.c sm_inter.h RPCSRC= ${SYSROOT:U${DESTDIR}}/usr/include/rpcsvc/sm_inter.x RPCGEN= RPCGEN_CPP=${CPP:Q} rpcgen -L -C sm_inter_svc.c: ${RPCSRC} ${RPCGEN} -m -o ${.TARGET} ${.ALLSRC} sm_inter.h: ${RPCSRC} ${RPCGEN} -h -o ${.TARGET} ${.ALLSRC} -test: test.c - cc -o test test.c -lrpcsvc +test: test.o + ${CC} ${LDFLAGS} -o ${.TARGET} ${.ALLSRC} ${LIBADD:S/^/-l/} +CLEANFILES+= test test.o .include Index: projects/clang900-import/usr.sbin/rpc.statd/file.c =================================================================== --- projects/clang900-import/usr.sbin/rpc.statd/file.c (revision 352586) +++ projects/clang900-import/usr.sbin/rpc.statd/file.c (revision 352587) @@ -1,369 +1,369 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1995 * A.R. Gordon (andrew.gordon@net-tel.co.uk). All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the FreeBSD project * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY ANDREW GORDON AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * */ #include #include #include #include #include #include #include #include #include /* For mmap() */ #include #include #include #include "statd.h" FileLayout *status_info; /* Pointer to the mmap()ed status file */ static int status_fd; /* File descriptor for the open file */ static off_t status_file_len; /* Current on-disc length of file */ /* sync_file --------------------------------------------------------------- */ /* Purpose: Packaged call of msync() to flush changes to mmap()ed file Returns: Nothing. Errors to syslog. */ void sync_file(void) { if (msync((void *)status_info, 0, 0) < 0) { syslog(LOG_ERR, "msync() failed: %s", strerror(errno)); } } /* find_host -------------------------------------------------------------- */ /* Purpose: Find the entry in the status file for a given host Returns: Pointer to that entry in the mmap() region, or NULL. Notes: Also creates entries if requested. Failure to create also returns NULL. 
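		Example (illustrative only): the monitor handler sm_mon_1_svc()
		in procs.c calls
			hp = find_host(arg->mon_id.mon_name, TRUE);
		and, when a slot comes back, links a new MonList entry onto
		hp->monList.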
*/ HostInfo *find_host(char *hostname, int create) { HostInfo *hp; HostInfo *spare_slot = NULL; HostInfo *result = NULL; struct addrinfo *ai1, *ai2; int i; ai2 = NULL; if (getaddrinfo(hostname, NULL, NULL, &ai1) != 0) ai1 = NULL; for (i = 0, hp = status_info->hosts; i < status_info->noOfHosts; i++, hp++) { if (!strncasecmp(hostname, hp->hostname, SM_MAXSTRLEN)) { result = hp; break; } if (hp->hostname[0] != '\0' && getaddrinfo(hp->hostname, NULL, NULL, &ai2) != 0) ai2 = NULL; if (ai1 && ai2) { struct addrinfo *p1, *p2; for (p1 = ai1; !result && p1; p1 = p1->ai_next) { for (p2 = ai2; !result && p2; p2 = p2->ai_next) { if (p1->ai_family == p2->ai_family && p1->ai_addrlen == p2->ai_addrlen && !memcmp(p1->ai_addr, p2->ai_addr, p1->ai_addrlen)) { result = hp; break; } } } if (result) break; } if (ai2) { freeaddrinfo(ai2); ai2 = NULL; } if (!spare_slot && !hp->monList && !hp->notifyReqd) spare_slot = hp; } if (ai1) freeaddrinfo(ai1); /* Return if entry found, or if not asked to create one. */ if (result || !create) return (result); /* Now create an entry, using the spare slot if one was found or */ /* adding to the end of the list otherwise, extending file if reqd */ if (!spare_slot) { off_t desired_size; spare_slot = &status_info->hosts[status_info->noOfHosts]; desired_size = ((char*)spare_slot - (char*)status_info) + sizeof(HostInfo); if (desired_size > status_file_len) { /* Extend file by writing 1 byte of junk at the desired end pos */ if (lseek(status_fd, desired_size - 1, SEEK_SET) == -1 || write(status_fd, "\0", 1) < 0) { syslog(LOG_ERR, "Unable to extend status file"); return (NULL); } status_file_len = desired_size; } status_info->noOfHosts++; } /* Initialise the spare slot that has been found/created */ /* Note that we do not msync(), since the caller is presumed to be */ /* about to modify the entry further */ memset(spare_slot, 0, sizeof(HostInfo)); strncpy(spare_slot->hostname, hostname, SM_MAXSTRLEN); return (spare_slot); } /* init_file -------------------------------------------------------------- */ /* Purpose: Open file, create if necessary, initialise it. Returns: Nothing - exits on error Notes: Called before process becomes daemon, hence logs to stderr rather than syslog. Opens the file, then mmap()s it for ease of access. Also performs initial clean-up of the file, zeroing monitor list pointers, setting the notifyReqd flag in all hosts that had a monitor list, and incrementing the state number to the next even value. */ void init_file(const char *filename) { int new_file = FALSE; char buf[HEADER_LEN]; int i; /* try to open existing file - if not present, create one */ status_fd = open(filename, O_RDWR); if ((status_fd < 0) && (errno == ENOENT)) { status_fd = open(filename, O_RDWR | O_CREAT, 0644); new_file = TRUE; } if (status_fd < 0) errx(1, "unable to open status file %s", filename); /* File now open. mmap() it, with a generous size to allow for */ /* later growth, where we will extend the file but not re-map it. */ status_info = (FileLayout *) mmap(NULL, 0x10000000, PROT_READ | PROT_WRITE, MAP_SHARED, status_fd, 0); if (status_info == (FileLayout *) MAP_FAILED) err(1, "unable to mmap() status file"); status_file_len = lseek(status_fd, 0L, SEEK_END); /* If the file was not newly created, validate the contents, and if */ /* defective, re-create from scratch. 
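     (Illustrative note: "defective" means the size check just below fails,
     i.e. the file is shorter than HEADER_LEN plus one HostInfo record for
     each host counted in status_info->noOfHosts.)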
*/ if (!new_file) { - if ((status_file_len < HEADER_LEN) || (status_file_len - < (HEADER_LEN + sizeof(HostInfo) * status_info->noOfHosts)) ) + if ((status_file_len < (off_t)HEADER_LEN) || (status_file_len + < (off_t)(HEADER_LEN + sizeof(HostInfo) * status_info->noOfHosts)) ) { warnx("status file is corrupt"); new_file = TRUE; } } /* Initialisation of a new, empty file. */ if (new_file) { memset(buf, 0, sizeof(buf)); lseek(status_fd, 0L, SEEK_SET); write(status_fd, buf, HEADER_LEN); status_file_len = HEADER_LEN; } else { /* Clean-up of existing file - monitored hosts will have a pointer */ /* to a list of clients, which refers to memory in the previous */ /* incarnation of the program and so are meaningless now. These */ /* pointers are zeroed and the fact that the host was previously */ /* monitored is recorded by setting the notifyReqd flag, which will */ /* in due course cause a SM_NOTIFY to be sent. */ /* Note that if we crash twice in quick succession, some hosts may */ /* already have notifyReqd set, where we didn't manage to notify */ /* them before the second crash occurred. */ for (i = 0; i < status_info->noOfHosts; i++) { HostInfo *this_host = &status_info->hosts[i]; if (this_host->monList) { this_host->notifyReqd = TRUE; this_host->monList = NULL; } } /* Select the next higher even number for the state counter */ status_info->ourState = (status_info->ourState + 2) & 0xfffffffe; /*???????******/ status_info->ourState++; } } /* notify_one_host --------------------------------------------------------- */ /* Purpose: Perform SM_NOTIFY procedure at specified host Returns: TRUE if success, FALSE if failed. Notes: Only report failure if verbose is non-zero. Caller will only set verbose to non-zero for the first attempt to contact the host. */ static int notify_one_host(char *hostname, int verbose) { struct timeval timeout = { 20, 0 }; /* 20 secs timeout */ CLIENT *cli; char dummy; stat_chge arg; char our_hostname[SM_MAXSTRLEN+1]; gethostname(our_hostname, sizeof(our_hostname)); our_hostname[SM_MAXSTRLEN] = '\0'; arg.mon_name = our_hostname; arg.state = status_info->ourState; if (debug) syslog (LOG_DEBUG, "Sending SM_NOTIFY to host %s from %s", hostname, our_hostname); cli = clnt_create(hostname, SM_PROG, SM_VERS, "udp"); if (!cli) { syslog(LOG_ERR, "Failed to contact host %s%s", hostname, clnt_spcreateerror("")); return (FALSE); } if (clnt_call(cli, SM_NOTIFY, (xdrproc_t)xdr_stat_chge, &arg, (xdrproc_t)xdr_void, &dummy, timeout) != RPC_SUCCESS) { if (verbose) syslog(LOG_ERR, "Failed to contact rpc.statd at host %s", hostname); clnt_destroy(cli); return (FALSE); } clnt_destroy(cli); return (TRUE); } /* notify_hosts ------------------------------------------------------------ */ /* Purpose: Send SM_NOTIFY to all hosts marked as requiring it Returns: Nothing, immediately - forks a process to do the work. Notes: Does nothing if there are no monitored hosts. Called after all the initialisation has been done - logs to syslog. */ void notify_hosts(void) { int i; int attempts; int work_to_do = FALSE; HostInfo *hp; pid_t pid; /* First check if there is in fact any work to do. */ for (i = status_info->noOfHosts, hp = status_info->hosts; i ; i--, hp++) { if (hp->notifyReqd) { work_to_do = TRUE; break; } } if (!work_to_do) return; /* No work found */ pid = fork(); if (pid == -1) { syslog(LOG_ERR, "Unable to fork notify process - %s", strerror(errno)); return; } if (pid) return; /* Here in the child process. 
We continue until all the hosts marked */ /* as requiring notification have been duly notified. */ /* If one of the initial attempts fails, we sleep for a while and */ /* have another go. This is necessary because when we have crashed, */ /* (eg. a power outage) it is quite possible that we won't be able to */ /* contact all monitored hosts immediately on restart, either because */ /* they crashed too and take longer to come up (in which case the */ /* notification isn't really required), or more importantly if some */ /* router etc. needed to reach the monitored host has not come back */ /* up yet. In this case, we will be a bit late in re-establishing */ /* locks (after the grace period) but that is the best we can do. */ /* We try 10 times at 5 sec intervals, 10 more times at 1 minute */ /* intervals, then 24 more times at hourly intervals, finally */ /* giving up altogether if the host hasn't come back to life after */ /* 24 hours. */ for (attempts = 0; attempts < 44; attempts++) { work_to_do = FALSE; /* Unless anything fails */ for (i = status_info->noOfHosts, hp = status_info->hosts; i ; i--, hp++) { if (hp->notifyReqd) { if (notify_one_host(hp->hostname, attempts == 0)) { hp->notifyReqd = FALSE; sync_file(); } else work_to_do = TRUE; } } if (!work_to_do) break; if (attempts < 10) sleep(5); else if (attempts < 20) sleep(60); else sleep(60*60); } exit(0); } Index: projects/clang900-import/usr.sbin/rpc.statd/procs.c =================================================================== --- projects/clang900-import/usr.sbin/rpc.statd/procs.c (revision 352586) +++ projects/clang900-import/usr.sbin/rpc.statd/procs.c (revision 352587) @@ -1,438 +1,439 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1995 * A.R. Gordon (andrew.gordon@net-tel.co.uk). All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the FreeBSD project * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY ANDREW GORDON AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include /* for getaddrinfo() */ #include #include #include #include #include "statd.h" +static const char *from_addr(struct sockaddr *); + static const char * -from_addr(saddr) - struct sockaddr *saddr; +from_addr(struct sockaddr *saddr) { static char inet_buf[INET6_ADDRSTRLEN]; if (getnameinfo(saddr, saddr->sa_len, inet_buf, sizeof(inet_buf), NULL, 0, NI_NUMERICHOST) == 0) return inet_buf; return "???"; } /* sm_check_hostname -------------------------------------------------------- */ /* * Purpose: Check `mon_name' member of sm_name struct to ensure that the array * consists only of printable characters. * * Returns: TRUE if hostname is good. FALSE if hostname contains binary or * otherwise non-printable characters. * * Notes: Will syslog(3) to warn of corrupt hostname. */ int sm_check_hostname(struct svc_req *req, char *arg) { int len, dstlen, ret; struct sockaddr *claddr; char *dst; len = strlen(arg); dstlen = (4 * len) + 1; dst = malloc(dstlen); claddr = (struct sockaddr *) (svc_getrpccaller(req->rq_xprt)->buf) ; ret = 1; if (claddr == NULL || dst == NULL) { ret = 0; } else if (strvis(dst, arg, VIS_WHITE) != len) { syslog(LOG_ERR, "sm_stat: client %s hostname %s contained invalid characters.", from_addr(claddr), dst); ret = 0; } free(dst); return (ret); } /* sm_stat_1 --------------------------------------------------------------- */ /* Purpose: RPC call to enquire if a host can be monitored Returns: TRUE for any hostname that can be looked up to give an address. */ struct sm_stat_res *sm_stat_1_svc(sm_name *arg, struct svc_req *req) { static sm_stat_res res; struct addrinfo *ai; struct sockaddr *claddr; static int err; err = 1; if ((err = sm_check_hostname(req, arg->mon_name)) == 0) { res.res_stat = stat_fail; } if (err != 0) { if (debug) syslog(LOG_DEBUG, "stat called for host %s", arg->mon_name); if (getaddrinfo(arg->mon_name, NULL, NULL, &ai) == 0) { res.res_stat = stat_succ; freeaddrinfo(ai); } else { claddr = (struct sockaddr *) (svc_getrpccaller(req->rq_xprt)->buf) ; syslog(LOG_ERR, "invalid hostname to sm_stat from %s: %s", from_addr(claddr), arg->mon_name); res.res_stat = stat_fail; } } res.state = status_info->ourState; return (&res); } /* sm_mon_1 ---------------------------------------------------------------- */ /* Purpose: RPC procedure to establish a monitor request Returns: Success, unless lack of resources prevents the necessary structures from being set up to record the request, or if the hostname is not valid (as judged by getaddrinfo()) */ struct sm_stat_res *sm_mon_1_svc(mon *arg, struct svc_req *req) { static sm_stat_res res; HostInfo *hp; static int err; MonList *lp; struct addrinfo *ai; if ((err = sm_check_hostname(req, arg->mon_id.mon_name)) == 0) { res.res_stat = stat_fail; } if (err != 0) { if (debug) { syslog(LOG_DEBUG, "monitor request for host %s", arg->mon_id.mon_name); syslog(LOG_DEBUG, "recall host: %s prog: %d ver: %d proc: %d", arg->mon_id.my_id.my_name, arg->mon_id.my_id.my_prog, arg->mon_id.my_id.my_vers, arg->mon_id.my_id.my_proc); } res.res_stat = stat_fail; /* Assume fail until set otherwise */ res.state = status_info->ourState; /* Find existing host entry, or create one if not found */ /* If find_host() fails, it will have logged the error already. 
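		(Illustrative note: the code below first rejects mon_names that
		do not resolve via getaddrinfo(), then records the caller's
		callback -- notify host, program, version, procedure and
		private data -- in a new MonList entry hung off the HostInfo
		returned by find_host().)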
*/ if (getaddrinfo(arg->mon_id.mon_name, NULL, NULL, &ai) != 0) { syslog(LOG_ERR, "Invalid hostname to sm_mon: %s", arg->mon_id.mon_name); return (&res); } freeaddrinfo(ai); if ((hp = find_host(arg->mon_id.mon_name, TRUE))) { lp = (MonList *)malloc(sizeof(MonList)); if (!lp) { syslog(LOG_ERR, "Out of memory"); } else { strncpy(lp->notifyHost, arg->mon_id.my_id.my_name, SM_MAXSTRLEN); lp->notifyProg = arg->mon_id.my_id.my_prog; lp->notifyVers = arg->mon_id.my_id.my_vers; lp->notifyProc = arg->mon_id.my_id.my_proc; memcpy(lp->notifyData, arg->priv, sizeof(lp->notifyData)); lp->next = hp->monList; hp->monList = lp; sync_file(); res.res_stat = stat_succ; /* Report success */ } } } return (&res); } /* do_unmon ---------------------------------------------------------------- */ /* Purpose: Remove a monitor request from a host Returns: TRUE if found, FALSE if not found. Notes: Common code from sm_unmon_1_svc and sm_unmon_all_1_svc In the unlikely event of more than one identical monitor request, all are removed. */ static int do_unmon(HostInfo *hp, my_id *idp) { MonList *lp, *next; MonList *last = NULL; int result = FALSE; lp = hp->monList; while (lp) { if (!strncasecmp(idp->my_name, lp->notifyHost, SM_MAXSTRLEN) && (idp->my_prog == lp->notifyProg) && (idp->my_proc == lp->notifyProc) && (idp->my_vers == lp->notifyVers)) { /* found one. Unhook from chain and free. */ next = lp->next; if (last) last->next = next; else hp->monList = next; free(lp); lp = next; result = TRUE; } else { last = lp; lp = lp->next; } } return (result); } /* sm_unmon_1 -------------------------------------------------------------- */ /* Purpose: RPC procedure to release a monitor request. Returns: Local machine's status number Notes: The supplied mon_id should match the value passed in an earlier call to sm_mon_1 */ struct sm_stat *sm_unmon_1_svc(mon_id *arg, struct svc_req *req __unused) { static sm_stat res; HostInfo *hp; if (debug) { syslog(LOG_DEBUG, "un-monitor request for host %s", arg->mon_name); syslog(LOG_DEBUG, "recall host: %s prog: %d ver: %d proc: %d", arg->mon_name, arg->my_id.my_prog, arg->my_id.my_vers, arg->my_id.my_proc); } if ((hp = find_host(arg->mon_name, FALSE))) { if (do_unmon(hp, &arg->my_id)) sync_file(); else { syslog(LOG_ERR, "unmon request from %s, no matching monitor", arg->my_id.my_name); } } else syslog(LOG_ERR, "unmon request from %s for unknown host %s", arg->my_id.my_name, arg->mon_name); res.state = status_info->ourState; return (&res); } /* sm_unmon_all_1 ---------------------------------------------------------- */ /* Purpose: RPC procedure to release monitor requests. Returns: Local machine's status number Notes: Releases all monitor requests (if any) from the specified host and program number. */ struct sm_stat *sm_unmon_all_1_svc(my_id *arg, struct svc_req *req __unused) { static sm_stat res; HostInfo *hp; int i; if (debug) { syslog(LOG_DEBUG, "unmon_all for host: %s prog: %d ver: %d proc: %d", arg->my_name, arg->my_prog, arg->my_vers, arg->my_proc); } for (i = status_info->noOfHosts, hp = status_info->hosts; i; i--, hp++) { do_unmon(hp, arg); } sync_file(); res.state = status_info->ourState; return (&res); } /* sm_simu_crash_1 --------------------------------------------------------- */ /* Purpose: RPC procedure to simulate a crash Returns: Nothing Notes: Standardised mechanism for debug purposes The specification says that we should drop all of our status information (apart from the list of monitored hosts on disc). 
However, this would confuse the rpc.lockd which would be unaware that all of its monitor requests had been silently junked. Hence we in fact retain all current requests and simply increment the status counter and inform all hosts on the monitor list. */ void *sm_simu_crash_1_svc(void *v __unused, struct svc_req *req __unused) { static char dummy; int work_to_do; HostInfo *hp; int i; work_to_do = FALSE; if (debug) syslog(LOG_DEBUG, "simu_crash called!!"); /* Simulate crash by setting notify-required flag on all monitored */ /* hosts, and incrementing our status number. notify_hosts() is */ /* then called to fork a process to do the notifications. */ for (i = status_info->noOfHosts, hp = status_info->hosts; i ; i--, hp++) { if (hp->monList) { work_to_do = TRUE; hp->notifyReqd = TRUE; } } status_info->ourState += 2; /* always even numbers if not crashed */ if (work_to_do) notify_hosts(); return (&dummy); } /* sm_notify_1 ------------------------------------------------------------- */ /* Purpose: RPC procedure notifying local statd of the crash of another Returns: Nothing Notes: There is danger of deadlock, since it is quite likely that the client procedure that we call will in turn call us to remove or adjust the monitor request. We therefore fork() a process to do the notifications. Note that the main HostInfo structure is in a mmap() region and so will be shared with the child, but the monList pointed to by the HostInfo is in normal memory. Hence if we read the monList before forking, we are protected from the parent servicing other requests that modify the list. */ void *sm_notify_1_svc(stat_chge *arg, struct svc_req *req __unused) { struct timeval timeout = { 20, 0 }; /* 20 secs timeout */ CLIENT *cli; static char dummy; sm_status tx_arg; /* arg sent to callback procedure */ MonList *lp; HostInfo *hp; pid_t pid; if (debug) syslog(LOG_DEBUG, "notify from host %s, new state %d", arg->mon_name, arg->state); hp = find_host(arg->mon_name, FALSE); if (!hp) { /* Never heard of this host - why is it notifying us? */ syslog(LOG_ERR, "Unsolicited notification from host %s", arg->mon_name); return (&dummy); } lp = hp->monList; if (!lp) return (&dummy); /* We know this host, but have no */ /* outstanding requests. */ pid = fork(); if (pid == -1) { syslog(LOG_ERR, "Unable to fork notify process - %s", strerror(errno)); return (NULL); /* no answer, the client will retry */ } if (pid) return (&dummy); /* Parent returns */ while (lp) { tx_arg.mon_name = arg->mon_name; tx_arg.state = arg->state; memcpy(tx_arg.priv, lp->notifyData, sizeof(tx_arg.priv)); cli = clnt_create(lp->notifyHost, lp->notifyProg, lp->notifyVers, "udp"); if (!cli) { syslog(LOG_ERR, "Failed to contact host %s%s", lp->notifyHost, clnt_spcreateerror("")); } else { if (clnt_call(cli, lp->notifyProc, (xdrproc_t)xdr_sm_status, &tx_arg, (xdrproc_t)xdr_void, &dummy, timeout) != RPC_SUCCESS) { syslog(LOG_ERR, "Failed to call rpc.statd client at host %s", lp->notifyHost); } clnt_destroy(cli); } lp = lp->next; } exit (0); /* Child quits */ } Index: projects/clang900-import/usr.sbin/rpc.statd/statd.c =================================================================== --- projects/clang900-import/usr.sbin/rpc.statd/statd.c (revision 352586) +++ projects/clang900-import/usr.sbin/rpc.statd/statd.c (revision 352587) @@ -1,657 +1,658 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1995 * A.R. Gordon (andrew.gordon@net-tel.co.uk). All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the FreeBSD project * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY ANDREW GORDON AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* main() function for status monitor daemon. Some of the code in this */ /* file was generated by running rpcgen /usr/include/rpcsvc/sm_inter.x */ /* The actual program logic is in the file procs.c */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "statd.h" #define GETPORT_MAXTRY 20 /* Max tries to get a port # */ int debug = 0; /* Controls syslog() calls for debug messages */ -char **hosts, *svcport_str = NULL; -int nhosts = 0; -int xcreated = 0; +static char **hosts, *svcport_str = NULL; +static int nhosts = 0; +static int xcreated = 0; static int mallocd_svcport = 0; static int *sock_fd; static int sock_fdcnt; static int sock_fdpos; static int create_service(struct netconfig *nconf); static void complete_service(struct netconfig *nconf, char *port_str); static void clearout_service(void); static void handle_sigchld(int sig); void out_of_mem(void) __dead2; static void usage(void) __dead2; int main(int argc, char **argv) { struct sigaction sa; struct netconfig *nconf; void *nc_handle; in_port_t svcport; int ch, i, s; - char *endptr, **hosts_bak; + char *endptr; + char **hosts_bak; int have_v6 = 1; int maxrec = RPC_MAXDATASIZE; int attempt_cnt, port_len, port_pos, ret; char **port_list; while ((ch = getopt(argc, argv, "dh:p:")) != -1) switch (ch) { case 'd': debug = 1; break; case 'h': ++nhosts; hosts_bak = hosts; hosts_bak = realloc(hosts, nhosts * sizeof(char *)); if (hosts_bak == NULL) { if (hosts != NULL) { for (i = 0; i < nhosts; i++) free(hosts[i]); free(hosts); out_of_mem(); } } hosts = hosts_bak; hosts[nhosts - 1] = strdup(optarg); if (hosts[nhosts - 1] == NULL) { for (i = 0; i < (nhosts - 1); i++) free(hosts[i]); free(hosts); out_of_mem(); } break; case 'p': endptr = NULL; svcport = (in_port_t)strtoul(optarg, &endptr, 10); if (endptr 
== NULL || *endptr != '\0' || svcport == 0 || svcport >= IPPORT_MAX) usage(); svcport_str = strdup(optarg); break; default: usage(); } argc -= optind; argv += optind; (void)rpcb_unset(SM_PROG, SM_VERS, NULL); /* * Check if IPv6 support is present. */ s = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP); if (s < 0) have_v6 = 0; else close(s); rpc_control(RPC_SVC_CONNMAXREC_SET, &maxrec); /* * If no hosts were specified, add a wildcard entry to bind to * INADDR_ANY. Otherwise make sure 127.0.0.1 and ::1 are added to the * list. */ if (nhosts == 0) { hosts = malloc(sizeof(char *)); if (hosts == NULL) out_of_mem(); - hosts[0] = "*"; + hosts[0] = strdup("*"); nhosts = 1; } else { hosts_bak = hosts; if (have_v6) { hosts_bak = realloc(hosts, (nhosts + 2) * sizeof(char *)); if (hosts_bak == NULL) { for (i = 0; i < nhosts; i++) free(hosts[i]); free(hosts); out_of_mem(); } else hosts = hosts_bak; nhosts += 2; - hosts[nhosts - 2] = "::1"; + hosts[nhosts - 2] = strdup("::1"); } else { hosts_bak = realloc(hosts, (nhosts + 1) * sizeof(char *)); if (hosts_bak == NULL) { for (i = 0; i < nhosts; i++) free(hosts[i]); free(hosts); out_of_mem(); } else { nhosts += 1; hosts = hosts_bak; } } - hosts[nhosts - 1] = "127.0.0.1"; + hosts[nhosts - 1] = strdup("127.0.0.1"); } attempt_cnt = 1; sock_fdcnt = 0; sock_fd = NULL; port_list = NULL; port_len = 0; nc_handle = setnetconfig(); while ((nconf = getnetconfig(nc_handle))) { /* We want to listen only on udp6, tcp6, udp, tcp transports */ if (nconf->nc_flag & NC_VISIBLE) { /* Skip if there's no IPv6 support */ if (have_v6 == 0 && strcmp(nconf->nc_protofmly, "inet6") == 0) { /* DO NOTHING */ } else { ret = create_service(nconf); if (ret == 1) /* Ignore this call */ continue; if (ret < 0) { /* * Failed to bind port, so close off * all sockets created and try again * if the port# was dynamically * assigned via bind(2). */ clearout_service(); if (mallocd_svcport != 0 && attempt_cnt < GETPORT_MAXTRY) { free(svcport_str); svcport_str = NULL; mallocd_svcport = 0; } else { errno = EADDRINUSE; syslog(LOG_ERR, "bindresvport_sa: %m"); exit(1); } /* Start over at the first service. */ free(sock_fd); sock_fdcnt = 0; sock_fd = NULL; nc_handle = setnetconfig(); attempt_cnt++; } else if (mallocd_svcport != 0 && attempt_cnt == GETPORT_MAXTRY) { /* * For the last attempt, allow * different port #s for each nconf * by saving the svcport_str and * setting it back to NULL. */ port_list = realloc(port_list, (port_len + 1) * sizeof(char *)); if (port_list == NULL) out_of_mem(); port_list[port_len++] = svcport_str; svcport_str = NULL; mallocd_svcport = 0; } } } } /* * Successfully bound the ports, so call complete_service() to * do the rest of the setup on the service(s). 
*/ sock_fdpos = 0; port_pos = 0; nc_handle = setnetconfig(); while ((nconf = getnetconfig(nc_handle))) { /* We want to listen only on udp6, tcp6, udp, tcp transports */ if (nconf->nc_flag & NC_VISIBLE) { /* Skip if there's no IPv6 support */ if (have_v6 == 0 && strcmp(nconf->nc_protofmly, "inet6") == 0) { /* DO NOTHING */ } else if (port_list != NULL) { if (port_pos >= port_len) { syslog(LOG_ERR, "too many port#s"); exit(1); } complete_service(nconf, port_list[port_pos++]); } else complete_service(nconf, svcport_str); } } endnetconfig(nc_handle); free(sock_fd); if (port_list != NULL) { for (port_pos = 0; port_pos < port_len; port_pos++) free(port_list[port_pos]); free(port_list); } init_file("/var/db/statd.status"); /* Note that it is NOT sensible to run this program from inetd - the */ /* protocol assumes that it will run immediately at boot time. */ daemon(0, 0); openlog("rpc.statd", 0, LOG_DAEMON); if (debug) syslog(LOG_INFO, "Starting - debug enabled"); else syslog(LOG_INFO, "Starting"); /* Install signal handler to collect exit status of child processes */ sa.sa_handler = handle_sigchld; sigemptyset(&sa.sa_mask); sigaddset(&sa.sa_mask, SIGCHLD); sa.sa_flags = SA_RESTART; sigaction(SIGCHLD, &sa, NULL); /* Initialisation now complete - start operating */ notify_hosts(); /* Forks a process (if necessary) to do the */ /* SM_NOTIFY calls, which may be slow. */ svc_run(); /* Should never return */ exit(1); } /* * This routine creates and binds sockets on the appropriate * addresses. It gets called one time for each transport. * It returns 0 upon success, 1 for ingore the call and -1 to indicate * bind failed with EADDRINUSE. * Any file descriptors that have been created are stored in sock_fd and * the total count of them is maintained in sock_fdcnt. */ static int create_service(struct netconfig *nconf) { struct addrinfo hints, *res = NULL; struct sockaddr_in *sin; struct sockaddr_in6 *sin6; struct __rpc_sockinfo si; int aicode; int fd; int nhostsbak; int r; u_int32_t host_addr[4]; /* IPv4 or IPv6 */ int mallocd_res; if ((nconf->nc_semantics != NC_TPI_CLTS) && (nconf->nc_semantics != NC_TPI_COTS) && (nconf->nc_semantics != NC_TPI_COTS_ORD)) return (1); /* not my type */ /* * XXX - using RPC library internal functions. */ if (!__rpc_nconf2sockinfo(nconf, &si)) { syslog(LOG_ERR, "cannot get information for %s", nconf->nc_netid); return (1); } /* Get rpc.statd's address on this transport */ memset(&hints, 0, sizeof hints); hints.ai_family = si.si_af; hints.ai_socktype = si.si_socktype; hints.ai_protocol = si.si_proto; /* * Bind to specific IPs if asked to */ nhostsbak = nhosts; while (nhostsbak > 0) { --nhostsbak; sock_fd = realloc(sock_fd, (sock_fdcnt + 1) * sizeof(int)); if (sock_fd == NULL) out_of_mem(); sock_fd[sock_fdcnt++] = -1; /* Set invalid for now. */ mallocd_res = 0; hints.ai_flags = AI_PASSIVE; /* * XXX - using RPC library internal functions. */ if ((fd = __rpc_nconf2fd(nconf)) < 0) { syslog(LOG_ERR, "cannot create socket for %s", nconf->nc_netid); continue; } switch (hints.ai_family) { case AF_INET: if (inet_pton(AF_INET, hosts[nhostsbak], host_addr) == 1) { hints.ai_flags |= AI_NUMERICHOST; } else { /* * Skip if we have an AF_INET6 address. */ if (inet_pton(AF_INET6, hosts[nhostsbak], host_addr) == 1) { close(fd); continue; } } break; case AF_INET6: if (inet_pton(AF_INET6, hosts[nhostsbak], host_addr) == 1) { hints.ai_flags |= AI_NUMERICHOST; } else { /* * Skip if we have an AF_INET address. 
*/ if (inet_pton(AF_INET, hosts[nhostsbak], host_addr) == 1) { close(fd); continue; } } break; default: break; } /* * If no hosts were specified, just bind to INADDR_ANY */ if (strcmp("*", hosts[nhostsbak]) == 0) { if (svcport_str == NULL) { res = malloc(sizeof(struct addrinfo)); if (res == NULL) out_of_mem(); mallocd_res = 1; res->ai_flags = hints.ai_flags; res->ai_family = hints.ai_family; res->ai_protocol = hints.ai_protocol; switch (res->ai_family) { case AF_INET: sin = malloc(sizeof(struct sockaddr_in)); if (sin == NULL) out_of_mem(); sin->sin_family = AF_INET; sin->sin_port = htons(0); sin->sin_addr.s_addr = htonl(INADDR_ANY); res->ai_addr = (struct sockaddr*) sin; res->ai_addrlen = (socklen_t) sizeof(struct sockaddr_in); break; case AF_INET6: sin6 = malloc(sizeof(struct sockaddr_in6)); if (sin6 == NULL) out_of_mem(); sin6->sin6_family = AF_INET6; sin6->sin6_port = htons(0); sin6->sin6_addr = in6addr_any; res->ai_addr = (struct sockaddr*) sin6; res->ai_addrlen = (socklen_t) sizeof(struct sockaddr_in6); break; default: syslog(LOG_ERR, "bad addr fam %d", res->ai_family); exit(1); } } else { if ((aicode = getaddrinfo(NULL, svcport_str, &hints, &res)) != 0) { syslog(LOG_ERR, "cannot get local address for %s: %s", nconf->nc_netid, gai_strerror(aicode)); close(fd); continue; } } } else { if ((aicode = getaddrinfo(hosts[nhostsbak], svcport_str, &hints, &res)) != 0) { syslog(LOG_ERR, "cannot get local address for %s: %s", nconf->nc_netid, gai_strerror(aicode)); close(fd); continue; } } /* Store the fd. */ sock_fd[sock_fdcnt - 1] = fd; /* Now, attempt the bind. */ r = bindresvport_sa(fd, res->ai_addr); if (r != 0) { if (errno == EADDRINUSE && mallocd_svcport != 0) { if (mallocd_res != 0) { free(res->ai_addr); free(res); } else freeaddrinfo(res); return (-1); } syslog(LOG_ERR, "bindresvport_sa: %m"); exit(1); } if (svcport_str == NULL) { svcport_str = malloc(NI_MAXSERV * sizeof(char)); if (svcport_str == NULL) out_of_mem(); mallocd_svcport = 1; if (getnameinfo(res->ai_addr, res->ai_addr->sa_len, NULL, NI_MAXHOST, svcport_str, NI_MAXSERV * sizeof(char), NI_NUMERICHOST | NI_NUMERICSERV)) errx(1, "Cannot get port number"); } if (mallocd_res != 0) { free(res->ai_addr); free(res); } else freeaddrinfo(res); res = NULL; } return (0); } /* * Called after all the create_service() calls have succeeded, to complete * the setup and registration. */ static void complete_service(struct netconfig *nconf, char *port_str) { struct addrinfo hints, *res = NULL; struct __rpc_sockinfo si; struct netbuf servaddr; SVCXPRT *transp = NULL; int aicode, fd, nhostsbak; int registered = 0; if ((nconf->nc_semantics != NC_TPI_CLTS) && (nconf->nc_semantics != NC_TPI_COTS) && (nconf->nc_semantics != NC_TPI_COTS_ORD)) return; /* not my type */ /* * XXX - using RPC library internal functions. */ if (!__rpc_nconf2sockinfo(nconf, &si)) { syslog(LOG_ERR, "cannot get information for %s", nconf->nc_netid); return; } nhostsbak = nhosts; while (nhostsbak > 0) { --nhostsbak; if (sock_fdpos >= sock_fdcnt) { /* Should never happen. 
*/ syslog(LOG_ERR, "Ran out of socket fd's"); return; } fd = sock_fd[sock_fdpos++]; if (fd < 0) continue; if (nconf->nc_semantics != NC_TPI_CLTS) listen(fd, SOMAXCONN); transp = svc_tli_create(fd, nconf, NULL, RPC_MAXDATASIZE, RPC_MAXDATASIZE); if (transp != (SVCXPRT *) NULL) { if (!svc_register(transp, SM_PROG, SM_VERS, sm_prog_1, 0)) { syslog(LOG_ERR, "can't register on %s", nconf->nc_netid); } else { if (!svc_reg(transp, SM_PROG, SM_VERS, sm_prog_1, NULL)) syslog(LOG_ERR, "can't register %s SM_PROG service", nconf->nc_netid); } } else syslog(LOG_WARNING, "can't create %s services", nconf->nc_netid); if (registered == 0) { registered = 1; memset(&hints, 0, sizeof hints); hints.ai_flags = AI_PASSIVE; hints.ai_family = si.si_af; hints.ai_socktype = si.si_socktype; hints.ai_protocol = si.si_proto; if ((aicode = getaddrinfo(NULL, port_str, &hints, &res)) != 0) { syslog(LOG_ERR, "cannot get local address: %s", gai_strerror(aicode)); exit(1); } servaddr.buf = malloc(res->ai_addrlen); memcpy(servaddr.buf, res->ai_addr, res->ai_addrlen); servaddr.len = res->ai_addrlen; rpcb_set(SM_PROG, SM_VERS, nconf, &servaddr); xcreated++; freeaddrinfo(res); } } /* end while */ } /* * Clear out sockets after a failure to bind one of them, so that the * cycle of socket creation/binding can start anew. */ static void clearout_service(void) { int i; for (i = 0; i < sock_fdcnt; i++) { if (sock_fd[i] >= 0) { shutdown(sock_fd[i], SHUT_RDWR); close(sock_fd[i]); } } } static void usage(void) { fprintf(stderr, "usage: rpc.statd [-d] [-h ] [-p ]\n"); exit(1); } /* handle_sigchld ---------------------------------------------------------- */ /* Purpose: Catch SIGCHLD and collect process status Retruns: Nothing. Notes: No special action required, other than to collect the process status and hence allow the child to die: we only use child processes for asynchronous transmission of SM_NOTIFY to other systems, so it is normal for the children to exit when they have done their work. 
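		(Illustrative note: the handler below reaps one exited child
		per SIGCHLD with wait4(-1, &status, WNOHANG, ...) and merely
		logs anything other than a clean exit.)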
*/ static void handle_sigchld(int sig __unused) { int pid, status; pid = wait4(-1, &status, WNOHANG, (struct rusage*)0); if (!pid) syslog(LOG_ERR, "Phantom SIGCHLD??"); else if (status == 0) { if (debug) syslog(LOG_DEBUG, "Child %d exited OK", pid); } else syslog(LOG_ERR, "Child %d failed with status %d", pid, WEXITSTATUS(status)); } /* * Out of memory, fatal */ void out_of_mem(void) { syslog(LOG_ERR, "out of memory"); exit(2); } Index: projects/clang900-import/usr.sbin/rpc.statd/test.c =================================================================== --- projects/clang900-import/usr.sbin/rpc.statd/test.c (revision 352586) +++ projects/clang900-import/usr.sbin/rpc.statd/test.c (revision 352587) @@ -1,144 +1,144 @@ - #ifndef lint static const char rcsid[] = "$FreeBSD$"; #endif /* not lint */ #include +#include +#include #include #include - /* Default timeout can be changed using clnt_control() */ static struct timeval TIMEOUT = { 25, 0 }; struct sm_stat_res * sm_stat_1(argp, clnt) struct sm_name *argp; CLIENT *clnt; { static struct sm_stat_res res; bzero((char *)&res, sizeof(res)); - if (clnt_call(clnt, SM_STAT, xdr_sm_name, argp, xdr_sm_stat_res, &res, TIMEOUT) != RPC_SUCCESS) { + if (clnt_call(clnt, SM_STAT, (xdrproc_t)xdr_sm_name, argp, + (xdrproc_t)xdr_sm_stat_res, &res, TIMEOUT) != RPC_SUCCESS) { return (NULL); } return (&res); } struct sm_stat_res * sm_mon_1(argp, clnt) struct mon *argp; CLIENT *clnt; { static struct sm_stat_res res; bzero((char *)&res, sizeof(res)); - if (clnt_call(clnt, SM_MON, xdr_mon, argp, xdr_sm_stat_res, &res, TIMEOUT) != RPC_SUCCESS) { + if (clnt_call(clnt, SM_MON, (xdrproc_t)xdr_mon, argp, + (xdrproc_t)xdr_sm_stat_res, &res, TIMEOUT) != RPC_SUCCESS) { return (NULL); } return (&res); } struct sm_stat * sm_unmon_1(argp, clnt) struct mon_id *argp; CLIENT *clnt; { static struct sm_stat res; bzero((char *)&res, sizeof(res)); - if (clnt_call(clnt, SM_UNMON, xdr_mon_id, argp, xdr_sm_stat, &res, TIMEOUT) != RPC_SUCCESS) { + if (clnt_call(clnt, SM_UNMON, (xdrproc_t)xdr_mon_id, argp, + (xdrproc_t)xdr_sm_stat, &res, TIMEOUT) != RPC_SUCCESS) { return (NULL); } return (&res); } struct sm_stat * sm_unmon_all_1(argp, clnt) struct my_id *argp; CLIENT *clnt; { static struct sm_stat res; bzero((char *)&res, sizeof(res)); - if (clnt_call(clnt, SM_UNMON_ALL, xdr_my_id, argp, xdr_sm_stat, &res, TIMEOUT) != RPC_SUCCESS) { + if (clnt_call(clnt, SM_UNMON_ALL, (xdrproc_t)xdr_my_id, argp, + (xdrproc_t)xdr_sm_stat, &res, TIMEOUT) != RPC_SUCCESS) { return (NULL); } return (&res); } void * sm_simu_crash_1(argp, clnt) void *argp; CLIENT *clnt; { static char res; bzero((char *)&res, sizeof(res)); - if (clnt_call(clnt, SM_SIMU_CRASH, xdr_void, argp, xdr_void, &res, TIMEOUT) != RPC_SUCCESS) { + if (clnt_call(clnt, SM_SIMU_CRASH, (xdrproc_t)xdr_void, argp, + (xdrproc_t)xdr_void, &res, TIMEOUT) != RPC_SUCCESS) { return (NULL); } return ((void *)&res); } int main(int argc, char **argv) { CLIENT *cli; char dummy; void *out; struct mon mon; if (argc < 2) { fprintf(stderr, "usage: test | crash\n"); fprintf(stderr, "always talks to statd at localhost\n"); exit(1); } printf("Creating client for localhost\n" ); cli = clnt_create("localhost", SM_PROG, SM_VERS, "udp"); if (!cli) { printf("Failed to create client\n"); exit(1); } mon.mon_id.mon_name = argv[1]; mon.mon_id.my_id.my_name = argv[1]; mon.mon_id.my_id.my_prog = SM_PROG; mon.mon_id.my_id.my_vers = SM_VERS; mon.mon_id.my_id.my_proc = 1; /* have it call sm_stat() !!! 
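      (Illustrative: with a hostname argument this test program registers
       itself as the notify target -- SM_PROG/SM_VERS, procedure 1, i.e.
       sm_stat() -- for that host via sm_mon_1(); with the literal argument
       "crash" it calls sm_simu_crash_1() instead.)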
*/ if (strcmp(argv[1], "crash")) { /* Hostname given */ struct sm_stat_res *res; - if (res = sm_mon_1(&mon, cli)) - { + + res = sm_mon_1(&mon, cli); + if (res) printf("Success!\n"); - } else - { printf("Fail\n"); - } } else { - if (out = sm_simu_crash_1(&dummy, cli)) - { + out = sm_simu_crash_1(&dummy, cli); + if (out) printf("Success!\n"); - } else - { printf("Fail\n"); - } } return 0; } Index: projects/clang900-import =================================================================== --- projects/clang900-import (revision 352586) +++ projects/clang900-import (revision 352587) Property changes on: projects/clang900-import ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r352537-352586