Index: head/lib/libmemstat/memstat.c
===================================================================
--- head/lib/libmemstat/memstat.c	(revision 350658)
+++ head/lib/libmemstat/memstat.c	(revision 350659)
@@ -1,443 +1,450 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2005 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/queue.h>
 #include <sys/sysctl.h>
 
 #include <err.h>
 #include <errno.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
 #include "memstat.h"
 #include "memstat_internal.h"
 
 const char *
 memstat_strerror(int error)
 {
 
 	switch (error) {
 	case MEMSTAT_ERROR_NOMEMORY:
 		return ("Cannot allocate memory");
 	case MEMSTAT_ERROR_VERSION:
 		return ("Version mismatch");
 	case MEMSTAT_ERROR_PERMISSION:
 		return ("Permission denied");
 	case MEMSTAT_ERROR_DATAERROR:
 		return ("Data format error");
 	case MEMSTAT_ERROR_KVM:
 		return ("KVM error");
 	case MEMSTAT_ERROR_KVM_NOSYMBOL:
 		return ("KVM unable to find symbol");
 	case MEMSTAT_ERROR_KVM_SHORTREAD:
 		return ("KVM short read");
 	case MEMSTAT_ERROR_UNDEFINED:
 	default:
 		return ("Unknown error");
 	}
 }
 
 struct memory_type_list *
 memstat_mtl_alloc(void)
 {
 	struct memory_type_list *mtlp;
 
 	mtlp = malloc(sizeof(*mtlp));
 	if (mtlp == NULL)
 		return (NULL);
 
 	LIST_INIT(&mtlp->mtl_list);
 	mtlp->mtl_error = MEMSTAT_ERROR_UNDEFINED;
 	return (mtlp);
 }
 
 struct memory_type *
 memstat_mtl_first(struct memory_type_list *list)
 {
 
 	return (LIST_FIRST(&list->mtl_list));
 }
 
 struct memory_type *
 memstat_mtl_next(struct memory_type *mtp)
 {
 
 	return (LIST_NEXT(mtp, mt_list));
 }
 
 void
 _memstat_mtl_empty(struct memory_type_list *list)
 {
 	struct memory_type *mtp;
 
 	while ((mtp = LIST_FIRST(&list->mtl_list))) {
 		free(mtp->mt_percpu_alloc);
 		free(mtp->mt_percpu_cache);
 		LIST_REMOVE(mtp, mt_list);
 		free(mtp);
 	}
 }
 
 void
 memstat_mtl_free(struct memory_type_list *list)
 {
 
 	_memstat_mtl_empty(list);
 	free(list);
 }
 
 int
 memstat_mtl_geterror(struct memory_type_list *list)
 {
 
 	return (list->mtl_error);
 }
 
 /*
  * Look for an existing memory_type entry in a memory_type list, based on the
  * allocator and name of the type.  If not found, return NULL.  No errno or
  * memstat error.
  */
 struct memory_type *
 memstat_mtl_find(struct memory_type_list *list, int allocator,
     const char *name)
 {
 	struct memory_type *mtp;
 
 	LIST_FOREACH(mtp, &list->mtl_list, mt_list) {
 		if ((mtp->mt_allocator == allocator ||
 		    allocator == ALLOCATOR_ANY) &&
 		    strcmp(mtp->mt_name, name) == 0)
 			return (mtp);
 	}
 	return (NULL);
 }
 
 /*
  * Allocate a new memory_type with the specificed allocator type and name,
  * then insert into the list.  The structure will be zero'd.
  *
  * libmemstat(3) internal function.
  */
 struct memory_type *
 _memstat_mt_allocate(struct memory_type_list *list, int allocator,
     const char *name, int maxcpus)
 {
 	struct memory_type *mtp;
 
 	mtp = malloc(sizeof(*mtp));
 	if (mtp == NULL)
 		return (NULL);
 
 	bzero(mtp, sizeof(*mtp));
 
 	mtp->mt_allocator = allocator;
 	mtp->mt_percpu_alloc = malloc(sizeof(struct mt_percpu_alloc_s) *
 	    maxcpus);
 	mtp->mt_percpu_cache = malloc(sizeof(struct mt_percpu_cache_s) *
 	    maxcpus);
 	strlcpy(mtp->mt_name, name, MEMTYPE_MAXNAME);
 	LIST_INSERT_HEAD(&list->mtl_list, mtp, mt_list);
 	return (mtp);
 }
 
 /*
  * Reset any libmemstat(3)-owned statistics in a memory_type record so that
  * it can be reused without incremental addition problems.  Caller-owned
  * memory is left "as-is", and must be updated by the caller if desired.
  *
  * libmemstat(3) internal function.
  */
 void
 _memstat_mt_reset_stats(struct memory_type *mtp, int maxcpus)
 {
 	int i;
 
 	mtp->mt_countlimit = 0;
 	mtp->mt_byteslimit = 0;
 	mtp->mt_sizemask = 0;
 	mtp->mt_size = 0;
 
 	mtp->mt_memalloced = 0;
 	mtp->mt_memfreed = 0;
 	mtp->mt_numallocs = 0;
 	mtp->mt_numfrees = 0;
 	mtp->mt_bytes = 0;
 	mtp->mt_count = 0;
 	mtp->mt_free = 0;
 	mtp->mt_failures = 0;
 	mtp->mt_sleeps = 0;
 
 	mtp->mt_zonefree = 0;
 	mtp->mt_kegfree = 0;
 
 	for (i = 0; i < maxcpus; i++) {
 		mtp->mt_percpu_alloc[i].mtp_memalloced = 0;
 		mtp->mt_percpu_alloc[i].mtp_memfreed = 0;
 		mtp->mt_percpu_alloc[i].mtp_numallocs = 0;
 		mtp->mt_percpu_alloc[i].mtp_numfrees = 0;
 		mtp->mt_percpu_alloc[i].mtp_sizemask = 0;
 		mtp->mt_percpu_cache[i].mtp_free = 0;
 	}
 }
 
 /*
  * Accessor methods for struct memory_type.  Avoids encoding the structure
  * ABI into the application.
  */
 const char *
 memstat_get_name(const struct memory_type *mtp)
 {
 
 	return (mtp->mt_name);
 }
 
 int
 memstat_get_allocator(const struct memory_type *mtp)
 {
 
 	return (mtp->mt_allocator);
 }
 
 uint64_t
 memstat_get_countlimit(const struct memory_type *mtp)
 {
 
 	return (mtp->mt_countlimit);
 }
 
 uint64_t
 memstat_get_byteslimit(const struct memory_type *mtp)
 {
 
 	return (mtp->mt_byteslimit);
 }
 
 uint64_t
 memstat_get_sizemask(const struct memory_type *mtp)
 {
 
 	return (mtp->mt_sizemask);
 }
 
 uint64_t
 memstat_get_size(const struct memory_type *mtp)
 {
 
 	return (mtp->mt_size);
 }
 
 uint64_t
 memstat_get_rsize(const struct memory_type *mtp)
 {
 
 	return (mtp->mt_rsize);
 }
 
 uint64_t
 memstat_get_memalloced(const struct memory_type *mtp)
 {
 
 	return (mtp->mt_memalloced);
 }
 
 uint64_t
 memstat_get_memfreed(const struct memory_type *mtp)
 {
 
 	return (mtp->mt_memfreed);
 }
 
 uint64_t
 memstat_get_numallocs(const struct memory_type *mtp)
 {
 
 	return (mtp->mt_numallocs);
 }
 
 uint64_t
 memstat_get_numfrees(const struct memory_type *mtp)
 {
 
 	return (mtp->mt_numfrees);
 }
 
 uint64_t
 memstat_get_bytes(const struct memory_type *mtp)
 {
 
 	return (mtp->mt_bytes);
 }
 
 uint64_t
 memstat_get_count(const struct memory_type *mtp)
 {
 
 	return (mtp->mt_count);
 }
 
 uint64_t
 memstat_get_free(const struct memory_type *mtp)
 {
 
 	return (mtp->mt_free);
 }
 
 uint64_t
 memstat_get_failures(const struct memory_type *mtp)
 {
 
 	return (mtp->mt_failures);
 }
 
 uint64_t
 memstat_get_sleeps(const struct memory_type *mtp)
 {
 
 	return (mtp->mt_sleeps);
 }
 
+uint64_t
+memstat_get_xdomain(const struct memory_type *mtp)
+{
+
+	return (mtp->mt_xdomain);
+}
+
 void *
 memstat_get_caller_pointer(const struct memory_type *mtp, int index)
 {
 
 	return (mtp->mt_caller_pointer[index]);
 }
 
 void
 memstat_set_caller_pointer(struct memory_type *mtp, int index, void *value)
 {
 
 	mtp->mt_caller_pointer[index] = value;
 }
 
 uint64_t
 memstat_get_caller_uint64(const struct memory_type *mtp, int index)
 {
 
 	return (mtp->mt_caller_uint64[index]);
 }
 
 void
 memstat_set_caller_uint64(struct memory_type *mtp, int index, uint64_t value)
 {
 
 	mtp->mt_caller_uint64[index] = value;
 }
 
 uint64_t
 memstat_get_zonefree(const struct memory_type *mtp)
 {
 
 	return (mtp->mt_zonefree);
 }
 
 uint64_t
 memstat_get_kegfree(const struct memory_type *mtp)
 {
 
 	return (mtp->mt_kegfree);
 }
 
 uint64_t
 memstat_get_percpu_memalloced(const struct memory_type *mtp, int cpu)
 {
 
 	return (mtp->mt_percpu_alloc[cpu].mtp_memalloced);
 }
 
 uint64_t
 memstat_get_percpu_memfreed(const struct memory_type *mtp, int cpu)
 {
 
 	return (mtp->mt_percpu_alloc[cpu].mtp_memfreed);
 }
 
 uint64_t
 memstat_get_percpu_numallocs(const struct memory_type *mtp, int cpu)
 {
 
 	return (mtp->mt_percpu_alloc[cpu].mtp_numallocs);
 }
 
 uint64_t
 memstat_get_percpu_numfrees(const struct memory_type *mtp, int cpu)
 {
 
 	return (mtp->mt_percpu_alloc[cpu].mtp_numfrees);
 }
 
 uint64_t
 memstat_get_percpu_sizemask(const struct memory_type *mtp, int cpu)
 {
 
 	return (mtp->mt_percpu_alloc[cpu].mtp_sizemask);
 }
 
 void *
 memstat_get_percpu_caller_pointer(const struct memory_type *mtp, int cpu,
     int index)
 {
 
 	return (mtp->mt_percpu_alloc[cpu].mtp_caller_pointer[index]);
 }
 
 void
 memstat_set_percpu_caller_pointer(struct memory_type *mtp, int cpu,
     int index, void *value)
 {
 
 	mtp->mt_percpu_alloc[cpu].mtp_caller_pointer[index] = value;
 }
 
 uint64_t
 memstat_get_percpu_caller_uint64(const struct memory_type *mtp, int cpu,
     int index)
 {
 
 	return (mtp->mt_percpu_alloc[cpu].mtp_caller_uint64[index]);
 }
 
 void
 memstat_set_percpu_caller_uint64(struct memory_type *mtp, int cpu, int index,
     uint64_t value)
 {
 
 	mtp->mt_percpu_alloc[cpu].mtp_caller_uint64[index] = value;
 }
 
 uint64_t
 memstat_get_percpu_free(const struct memory_type *mtp, int cpu)
 {
 
 	return (mtp->mt_percpu_cache[cpu].mtp_free);
 }
Index: head/lib/libmemstat/memstat.h
===================================================================
--- head/lib/libmemstat/memstat.h	(revision 350658)
+++ head/lib/libmemstat/memstat.h	(revision 350659)
@@ -1,171 +1,172 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2005 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _MEMSTAT_H_
 #define	_MEMSTAT_H_
 
 /*
  * Amount of caller data to maintain for each caller data slot.  Applications
  * must not request more than this number of caller save data, or risk
  * corrupting internal libmemstat(3) data structures.  A compile time check
  * in the application is probably appropriate.
  */
 #define	MEMSTAT_MAXCALLER	16
 
 /*
  * libmemstat(3) is able to extract memory data from different allocators;
  * when it does so, it tags which allocator it got the data from so that
  * consumers can determine which fields are usable, as data returned varies
  * some.
  */
 #define	ALLOCATOR_UNKNOWN	0
 #define	ALLOCATOR_MALLOC	1
 #define	ALLOCATOR_UMA		2
 #define	ALLOCATOR_ANY		255
 
 /*
  * Library maximum type name.  Should be max(set of name maximums over
  * various allocators).
  */
 #define	MEMTYPE_MAXNAME		32
 
 /*
  * Library error conditions, mostly from the underlying data sources.  On
  * failure, functions typically return (-1) or (NULL); on success, (0) or a
  * valid data pointer.  The error from the last operation is stored in
  * struct memory_type_list, and accessed via memstat_get_error(list).
  */
 #define	MEMSTAT_ERROR_UNDEFINED		0	/* Initialization value. */
 #define	MEMSTAT_ERROR_NOMEMORY		1	/* Out of memory. */
 #define	MEMSTAT_ERROR_VERSION		2	/* Unsupported version. */
 #define	MEMSTAT_ERROR_PERMISSION	3	/* Permission denied. */
 #define	MEMSTAT_ERROR_DATAERROR		5	/* Error in stat data. */
 #define	MEMSTAT_ERROR_KVM		6	/* See kvm_geterr() for err. */
 #define	MEMSTAT_ERROR_KVM_NOSYMBOL	7	/* Symbol not available. */
 #define	MEMSTAT_ERROR_KVM_SHORTREAD	8	/* Short kvm_read return. */
 
 /*
  * Forward declare struct memory_type, which holds per-type properties and
  * statistics.  This is an opaque type, to be frobbed only from within the
  * library, in order to avoid building ABI assumptions into the application.
  * Accessor methods should be used to get and sometimes set the fields from
  * consumers of the library.
  */
 struct memory_type;
 
 /*
  * struct memory_type_list is the head of a list of memory types and
  * statistics.
  */
 struct memory_type_list;
 
 __BEGIN_DECLS
 /*
  * Functions that operate without memory type or memory type list context.
  */
 const char	*memstat_strerror(int error);
 
 /*
  * Functions for managing memory type and statistics data.
  */
 struct memory_type_list	*memstat_mtl_alloc(void);
 struct memory_type	*memstat_mtl_first(struct memory_type_list *list);
 struct memory_type	*memstat_mtl_next(struct memory_type *mtp);
 struct memory_type	*memstat_mtl_find(struct memory_type_list *list,
 			    int allocator, const char *name);
 void	memstat_mtl_free(struct memory_type_list *list);
 int	memstat_mtl_geterror(struct memory_type_list *list);
 
 /*
  * Functions to retrieve data from a live kernel using sysctl.
  */
 int	memstat_sysctl_all(struct memory_type_list *list, int flags);
 int	memstat_sysctl_malloc(struct memory_type_list *list, int flags);
 int	memstat_sysctl_uma(struct memory_type_list *list, int flags);
 
 /*
  * Functions to retrieve data from a kernel core (or /dev/kmem).
  */
 int	memstat_kvm_all(struct memory_type_list *list, void *kvm_handle);
 int	memstat_kvm_malloc(struct memory_type_list *list, void *kvm_handle);
 int	memstat_kvm_uma(struct memory_type_list *list, void *kvm_handle);
 
 /*
  * Accessor methods for struct memory_type.
  */
 const char	*memstat_get_name(const struct memory_type *mtp);
 int		 memstat_get_allocator(const struct memory_type *mtp);
 uint64_t	 memstat_get_countlimit(const struct memory_type *mtp);
 uint64_t	 memstat_get_byteslimit(const struct memory_type *mtp);
 uint64_t	 memstat_get_sizemask(const struct memory_type *mtp);
 uint64_t	 memstat_get_size(const struct memory_type *mtp);
 uint64_t	 memstat_get_rsize(const struct memory_type *mtp);
 uint64_t	 memstat_get_memalloced(const struct memory_type *mtp);
 uint64_t	 memstat_get_memfreed(const struct memory_type *mtp);
 uint64_t	 memstat_get_numallocs(const struct memory_type *mtp);
 uint64_t	 memstat_get_numfrees(const struct memory_type *mtp);
 uint64_t	 memstat_get_bytes(const struct memory_type *mtp);
 uint64_t	 memstat_get_count(const struct memory_type *mtp);
 uint64_t	 memstat_get_free(const struct memory_type *mtp);
 uint64_t	 memstat_get_failures(const struct memory_type *mtp);
 uint64_t	 memstat_get_sleeps(const struct memory_type *mtp);
+uint64_t	 memstat_get_xdomain(const struct memory_type *mtp);
 void		*memstat_get_caller_pointer(const struct memory_type *mtp,
 		    int index);
 void		 memstat_set_caller_pointer(struct memory_type *mtp,
 		    int index, void *value);
 uint64_t	 memstat_get_caller_uint64(const struct memory_type *mtp,
 		    int index);
 void		 memstat_set_caller_uint64(struct memory_type *mtp, int index,
 		    uint64_t value);
 uint64_t	 memstat_get_zonefree(const struct memory_type *mtp);
 uint64_t	 memstat_get_kegfree(const struct memory_type *mtp);
 uint64_t	 memstat_get_percpu_memalloced(const struct memory_type *mtp,
 		    int cpu);
 uint64_t	 memstat_get_percpu_memfreed(const struct memory_type *mtp,
 		    int cpu);
 uint64_t	 memstat_get_percpu_numallocs(const struct memory_type *mtp,
 		    int cpu);
 uint64_t	 memstat_get_percpu_numfrees(const struct memory_type *mtp,
 		    int cpu);
 uint64_t	 memstat_get_percpu_sizemask(const struct memory_type *mtp,
 		    int cpu);
 void		*memstat_get_percpu_caller_pointer(
 		    const struct memory_type *mtp, int cpu, int index);
 void		 memstat_set_percpu_caller_pointer(struct memory_type *mtp,
 		    int cpu, int index, void *value);
 uint64_t	 memstat_get_percpu_caller_uint64(
 		    const struct memory_type *mtp, int cpu, int index);
 void		 memstat_set_percpu_caller_uint64(struct memory_type *mtp,
 		    int cpu, int index, uint64_t value);
 uint64_t	 memstat_get_percpu_free(const struct memory_type *mtp,
 		    int cpu);
 __END_DECLS
 
 #endif /* !_MEMSTAT_H_ */
Index: head/lib/libmemstat/memstat_internal.h
===================================================================
--- head/lib/libmemstat/memstat_internal.h	(revision 350658)
+++ head/lib/libmemstat/memstat_internal.h	(revision 350659)
@@ -1,129 +1,130 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2005 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _MEMSTAT_INTERNAL_H_
 #define	_MEMSTAT_INTERNAL_H_
 
 /*
  * memstat maintains its own internal notion of statistics on each memory
  * type, common across UMA and kernel malloc.  Some fields are straight from
  * the allocator statistics, others are derived when extracted from the
  * kernel.  A struct memory_type will describe each type supported by an
  * allocator.  memory_type structures can be chained into lists.
  */
 struct memory_type {
 	/*
 	 * Static properties of type.
 	 */
 	int	 mt_allocator;		/* malloc(9), uma(9), etc. */
 	char	 mt_name[MEMTYPE_MAXNAME];	/* name of memory type. */
 
 	/*
 	 * (Relatively) static zone settings, that don't uniquely identify
 	 * the zone, but also don't change much.
 	 */
 	uint64_t	 mt_countlimit;	/* 0, or maximum allocations. */
 	uint64_t	 mt_byteslimit;	/* 0, or maximum bytes. */
 	uint64_t	 mt_sizemask;	/* malloc: allocated size bitmask. */
 	uint64_t	 mt_size;	/* uma: size of objects. */
 	uint64_t	 mt_rsize;	/* uma: real size of objects. */
 
 	/*
 	 * Zone or type information that includes all caches and any central
 	 * zone state.  Depending on the allocator, this may be synthesized
 	 * from several sources, or directly measured.
 	 */
 	uint64_t	 mt_memalloced;	/* Bytes allocated over life time. */
 	uint64_t	 mt_memfreed;	/* Bytes freed over life time. */
 	uint64_t	 mt_numallocs;	/* Allocations over life time. */
 	uint64_t	 mt_numfrees;	/* Frees over life time. */
 	uint64_t	 mt_bytes;	/* Bytes currently allocated. */
 	uint64_t	 mt_count;	/* Number of current allocations. */
 	uint64_t	 mt_free;	/* Number of cached free items. */
 	uint64_t	 mt_failures;	/* Number of allocation failures. */
 	uint64_t	 mt_sleeps;	/* Number of allocation sleeps. */
+	uint64_t	 mt_xdomain;	/* Number of cross domain sleeps. */
 
 	/*
 	 * Caller-owned memory.
 	 */
 	void		*mt_caller_pointer[MEMSTAT_MAXCALLER];	/* Pointers. */
 	uint64_t	 mt_caller_uint64[MEMSTAT_MAXCALLER];	/* Integers. */
 
 	/*
 	 * For allocators making use of per-CPU caches, we also provide raw
 	 * statistics from the central allocator and each per-CPU cache,
 	 * which (combined) sometimes make up the above general statistics.
 	 *
 	 * First, central zone/type state, all numbers excluding any items
 	 * cached in per-CPU caches.
 	 *
 	 * XXXRW: Might be desirable to separately expose allocation stats
 	 * from zone, which should (combined with per-cpu) add up to the
 	 * global stats above.
 	 */
 	uint64_t	 mt_zonefree;	/* Free items in zone. */
 	uint64_t	 mt_kegfree;	/* Free items in keg. */
 
 	/*
 	 * Per-CPU measurements fall into two categories: per-CPU allocation,
 	 * and per-CPU cache state.
 	 */
 	struct mt_percpu_alloc_s {
 		uint64_t	 mtp_memalloced;/* Per-CPU mt_memalloced. */
 		uint64_t	 mtp_memfreed;	/* Per-CPU mt_memfreed. */
 		uint64_t	 mtp_numallocs;	/* Per-CPU mt_numallocs. */
 		uint64_t	 mtp_numfrees;	/* Per-CPU mt_numfrees. */
 		uint64_t	 mtp_sizemask;	/* Per-CPU mt_sizemask. */
 		void		*mtp_caller_pointer[MEMSTAT_MAXCALLER];
 		uint64_t	 mtp_caller_uint64[MEMSTAT_MAXCALLER];
 	}	*mt_percpu_alloc;
 
 	struct mt_percpu_cache_s {
 		uint64_t	 mtp_free;	/* Per-CPU cache free items. */
 	}	*mt_percpu_cache;
 
 	LIST_ENTRY(memory_type)	mt_list;	/* List of types. */
 };
 
 /*
  * Description of struct memory_type_list is in memstat.h.
  */
 struct memory_type_list {
 	LIST_HEAD(, memory_type)	mtl_list;
 	int				mtl_error;
 };
 
 void			 _memstat_mtl_empty(struct memory_type_list *list);
 struct memory_type	*_memstat_mt_allocate(struct memory_type_list *list,
 			    int allocator, const char *name, int maxcpus);
 void			 _memstat_mt_reset_stats(struct memory_type *mtp,
 			    int maxcpus);
 
 #endif /* !_MEMSTAT_INTERNAL_H_ */
Index: head/lib/libmemstat/memstat_uma.c
===================================================================
--- head/lib/libmemstat/memstat_uma.c	(revision 350658)
+++ head/lib/libmemstat/memstat_uma.c	(revision 350659)
@@ -1,494 +1,495 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2005-2006 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/counter.h>
 #include <sys/cpuset.h>
 #include <sys/sysctl.h>
 
 #include <vm/uma.h>
 #include <vm/uma_int.h>
 
 #include <err.h>
 #include <errno.h>
 #include <kvm.h>
 #include <nlist.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 
 #include "memstat.h"
 #include "memstat_internal.h"
 
 static struct nlist namelist[] = {
 #define	X_UMA_KEGS	0
 	{ .n_name = "_uma_kegs" },
 #define	X_MP_MAXID	1
 	{ .n_name = "_mp_maxid" },
 #define	X_ALL_CPUS	2
 	{ .n_name = "_all_cpus" },
 #define	X_VM_NDOMAINS	3
 	{ .n_name = "_vm_ndomains" },
 	{ .n_name = "" },
 };
 
 /*
  * Extract uma(9) statistics from the running kernel, and store all memory
  * type information in the passed list.  For each type, check the list for an
  * existing entry with the right name/allocator -- if present, update that
  * entry.  Otherwise, add a new entry.  On error, the entire list will be
  * cleared, as entries will be in an inconsistent state.
  *
  * To reduce the level of work for a list that starts empty, we keep around a
  * hint as to whether it was empty when we began, so we can avoid searching
  * the list for entries to update.  Updates are O(n^2) due to searching for
  * each entry before adding it.
  */
 int
 memstat_sysctl_uma(struct memory_type_list *list, int flags)
 {
 	struct uma_stream_header *ushp;
 	struct uma_type_header *uthp;
 	struct uma_percpu_stat *upsp;
 	struct memory_type *mtp;
 	int count, hint_dontsearch, i, j, maxcpus, maxid;
 	char *buffer, *p;
 	size_t size;
 
 	hint_dontsearch = LIST_EMPTY(&list->mtl_list);
 
 	/*
 	 * Query the number of CPUs, number of malloc types so that we can
 	 * guess an initial buffer size.  We loop until we succeed or really
 	 * fail.  Note that the value of maxcpus we query using sysctl is not
 	 * the version we use when processing the real data -- that is read
 	 * from the header.
 	 */
 retry:
 	size = sizeof(maxid);
 	if (sysctlbyname("kern.smp.maxid", &maxid, &size, NULL, 0) < 0) {
 		if (errno == EACCES || errno == EPERM)
 			list->mtl_error = MEMSTAT_ERROR_PERMISSION;
 		else
 			list->mtl_error = MEMSTAT_ERROR_DATAERROR;
 		return (-1);
 	}
 	if (size != sizeof(maxid)) {
 		list->mtl_error = MEMSTAT_ERROR_DATAERROR;
 		return (-1);
 	}
 
 	size = sizeof(count);
 	if (sysctlbyname("vm.zone_count", &count, &size, NULL, 0) < 0) {
 		if (errno == EACCES || errno == EPERM)
 			list->mtl_error = MEMSTAT_ERROR_PERMISSION;
 		else
 			list->mtl_error = MEMSTAT_ERROR_VERSION;
 		return (-1);
 	}
 	if (size != sizeof(count)) {
 		list->mtl_error = MEMSTAT_ERROR_DATAERROR;
 		return (-1);
 	}
 
 	size = sizeof(*uthp) + count * (sizeof(*uthp) + sizeof(*upsp) *
 	    (maxid + 1));
 
 	buffer = malloc(size);
 	if (buffer == NULL) {
 		list->mtl_error = MEMSTAT_ERROR_NOMEMORY;
 		return (-1);
 	}
 
 	if (sysctlbyname("vm.zone_stats", buffer, &size, NULL, 0) < 0) {
 		/*
 		 * XXXRW: ENOMEM is an ambiguous return, we should bound the
 		 * number of loops, perhaps.
 		 */
 		if (errno == ENOMEM) {
 			free(buffer);
 			goto retry;
 		}
 		if (errno == EACCES || errno == EPERM)
 			list->mtl_error = MEMSTAT_ERROR_PERMISSION;
 		else
 			list->mtl_error = MEMSTAT_ERROR_VERSION;
 		free(buffer);
 		return (-1);
 	}
 
 	if (size == 0) {
 		free(buffer);
 		return (0);
 	}
 
 	if (size < sizeof(*ushp)) {
 		list->mtl_error = MEMSTAT_ERROR_VERSION;
 		free(buffer);
 		return (-1);
 	}
 	p = buffer;
 	ushp = (struct uma_stream_header *)p;
 	p += sizeof(*ushp);
 
 	if (ushp->ush_version != UMA_STREAM_VERSION) {
 		list->mtl_error = MEMSTAT_ERROR_VERSION;
 		free(buffer);
 		return (-1);
 	}
 
 	/*
 	 * For the remainder of this function, we are quite trusting about
 	 * the layout of structures and sizes, since we've determined we have
 	 * a matching version and acceptable CPU count.
 	 */
 	maxcpus = ushp->ush_maxcpus;
 	count = ushp->ush_count;
 	for (i = 0; i < count; i++) {
 		uthp = (struct uma_type_header *)p;
 		p += sizeof(*uthp);
 
 		if (hint_dontsearch == 0) {
 			mtp = memstat_mtl_find(list, ALLOCATOR_UMA,
 			    uthp->uth_name);
 		} else
 			mtp = NULL;
 		if (mtp == NULL)
 			mtp = _memstat_mt_allocate(list, ALLOCATOR_UMA,
 			    uthp->uth_name, maxid + 1);
 		if (mtp == NULL) {
 			_memstat_mtl_empty(list);
 			free(buffer);
 			list->mtl_error = MEMSTAT_ERROR_NOMEMORY;
 			return (-1);
 		}
 
 		/*
 		 * Reset the statistics on a current node.
 		 */
 		_memstat_mt_reset_stats(mtp, maxid + 1);
 
 		mtp->mt_numallocs = uthp->uth_allocs;
 		mtp->mt_numfrees = uthp->uth_frees;
 		mtp->mt_failures = uthp->uth_fails;
 		mtp->mt_sleeps = uthp->uth_sleeps;
+		mtp->mt_xdomain = uthp->uth_xdomain;
 
 		for (j = 0; j < maxcpus; j++) {
 			upsp = (struct uma_percpu_stat *)p;
 			p += sizeof(*upsp);
 
 			mtp->mt_percpu_cache[j].mtp_free =
 			    upsp->ups_cache_free;
 			mtp->mt_free += upsp->ups_cache_free;
 			mtp->mt_numallocs += upsp->ups_allocs;
 			mtp->mt_numfrees += upsp->ups_frees;
 		}
 
 		/*
 		 * Values for uth_allocs and uth_frees frees are snap.
 		 * It may happen that kernel reports that number of frees
 		 * is greater than number of allocs. See counter(9) for
 		 * details.
 		 */
 		if (mtp->mt_numallocs < mtp->mt_numfrees)
 			mtp->mt_numallocs = mtp->mt_numfrees;
 
 		mtp->mt_size = uthp->uth_size;
 		mtp->mt_rsize = uthp->uth_rsize;
 		mtp->mt_memalloced = mtp->mt_numallocs * uthp->uth_size;
 		mtp->mt_memfreed = mtp->mt_numfrees * uthp->uth_size;
 		mtp->mt_bytes = mtp->mt_memalloced - mtp->mt_memfreed;
 		mtp->mt_countlimit = uthp->uth_limit;
 		mtp->mt_byteslimit = uthp->uth_limit * uthp->uth_size;
 
 		mtp->mt_count = mtp->mt_numallocs - mtp->mt_numfrees;
 		mtp->mt_zonefree = uthp->uth_zone_free;
 
 		/*
 		 * UMA secondary zones share a keg with the primary zone.  To
 		 * avoid double-reporting of free items, report keg free
 		 * items only in the primary zone.
 		 */
 		if (!(uthp->uth_zone_flags & UTH_ZONE_SECONDARY)) {
 			mtp->mt_kegfree = uthp->uth_keg_free;
 			mtp->mt_free += mtp->mt_kegfree;
 		}
 		mtp->mt_free += mtp->mt_zonefree;
 	}
 
 	free(buffer);
 
 	return (0);
 }
 
 static int
 kread(kvm_t *kvm, void *kvm_pointer, void *address, size_t size,
     size_t offset)
 {
 	ssize_t ret;
 
 	ret = kvm_read(kvm, (unsigned long)kvm_pointer + offset, address,
 	    size);
 	if (ret < 0)
 		return (MEMSTAT_ERROR_KVM);
 	if ((size_t)ret != size)
 		return (MEMSTAT_ERROR_KVM_SHORTREAD);
 	return (0);
 }
 
 static int
 kread_string(kvm_t *kvm, const void *kvm_pointer, char *buffer, int buflen)
 {
 	ssize_t ret;
 	int i;
 
 	for (i = 0; i < buflen; i++) {
 		ret = kvm_read(kvm, (unsigned long)kvm_pointer + i,
 		    &(buffer[i]), sizeof(char));
 		if (ret < 0)
 			return (MEMSTAT_ERROR_KVM);
 		if ((size_t)ret != sizeof(char))
 			return (MEMSTAT_ERROR_KVM_SHORTREAD);
 		if (buffer[i] == '\0')
 			return (0);
 	}
 	/* Truncate. */
 	buffer[i-1] = '\0';
 	return (0);
 }
 
 static int
 kread_symbol(kvm_t *kvm, int index, void *address, size_t size,
     size_t offset)
 {
 	ssize_t ret;
 
 	ret = kvm_read(kvm, namelist[index].n_value + offset, address, size);
 	if (ret < 0)
 		return (MEMSTAT_ERROR_KVM);
 	if ((size_t)ret != size)
 		return (MEMSTAT_ERROR_KVM_SHORTREAD);
 	return (0);
 }
 
 /*
  * memstat_kvm_uma() is similar to memstat_sysctl_uma(), only it extracts
  * UMA(9) statistics from a kernel core/memory file.
  */
 int
 memstat_kvm_uma(struct memory_type_list *list, void *kvm_handle)
 {
 	LIST_HEAD(, uma_keg) uma_kegs;
 	struct memory_type *mtp;
 	struct uma_zone_domain uzd;
 	struct uma_bucket *ubp, ub;
 	struct uma_cache *ucp, *ucp_array;
 	struct uma_zone *uzp, uz;
 	struct uma_keg *kzp, kz;
 	int hint_dontsearch, i, mp_maxid, ndomains, ret;
 	char name[MEMTYPE_MAXNAME];
 	cpuset_t all_cpus;
 	long cpusetsize;
 	kvm_t *kvm;
 
 	kvm = (kvm_t *)kvm_handle;
 	hint_dontsearch = LIST_EMPTY(&list->mtl_list);
 	if (kvm_nlist(kvm, namelist) != 0) {
 		list->mtl_error = MEMSTAT_ERROR_KVM;
 		return (-1);
 	}
 	if (namelist[X_UMA_KEGS].n_type == 0 ||
 	    namelist[X_UMA_KEGS].n_value == 0) {
 		list->mtl_error = MEMSTAT_ERROR_KVM_NOSYMBOL;
 		return (-1);
 	}
 	ret = kread_symbol(kvm, X_MP_MAXID, &mp_maxid, sizeof(mp_maxid), 0);
 	if (ret != 0) {
 		list->mtl_error = ret;
 		return (-1);
 	}
 	ret = kread_symbol(kvm, X_VM_NDOMAINS, &ndomains,
 	    sizeof(ndomains), 0);
 	if (ret != 0) {
 		list->mtl_error = ret;
 		return (-1);
 	}
 	ret = kread_symbol(kvm, X_UMA_KEGS, &uma_kegs, sizeof(uma_kegs), 0);
 	if (ret != 0) {
 		list->mtl_error = ret;
 		return (-1);
 	}
 	cpusetsize = sysconf(_SC_CPUSET_SIZE);
 	if (cpusetsize == -1 || (u_long)cpusetsize > sizeof(cpuset_t)) {
 		list->mtl_error = MEMSTAT_ERROR_KVM_NOSYMBOL;
 		return (-1);
 	}
 	CPU_ZERO(&all_cpus);
 	ret = kread_symbol(kvm, X_ALL_CPUS, &all_cpus, cpusetsize, 0);
 	if (ret != 0) {
 		list->mtl_error = ret;
 		return (-1);
 	}
 	ucp_array = malloc(sizeof(struct uma_cache) * (mp_maxid + 1));
 	if (ucp_array == NULL) {
 		list->mtl_error = MEMSTAT_ERROR_NOMEMORY;
 		return (-1);
 	}
 	for (kzp = LIST_FIRST(&uma_kegs); kzp != NULL; kzp =
 	    LIST_NEXT(&kz, uk_link)) {
 		ret = kread(kvm, kzp, &kz, sizeof(kz), 0);
 		if (ret != 0) {
 			free(ucp_array);
 			_memstat_mtl_empty(list);
 			list->mtl_error = ret;
 			return (-1);
 		}
 		for (uzp = LIST_FIRST(&kz.uk_zones); uzp != NULL; uzp =
 		    LIST_NEXT(&uz, uz_link)) {
 			ret = kread(kvm, uzp, &uz, sizeof(uz), 0);
 			if (ret != 0) {
 				free(ucp_array);
 				_memstat_mtl_empty(list);
 				list->mtl_error = ret;
 				return (-1);
 			}
 			ret = kread(kvm, uzp, ucp_array,
 			    sizeof(struct uma_cache) * (mp_maxid + 1),
 			    offsetof(struct uma_zone, uz_cpu[0]));
 			if (ret != 0) {
 				free(ucp_array);
 				_memstat_mtl_empty(list);
 				list->mtl_error = ret;
 				return (-1);
 			}
 			ret = kread_string(kvm, uz.uz_name, name,
 			    MEMTYPE_MAXNAME);
 			if (ret != 0) {
 				free(ucp_array);
 				_memstat_mtl_empty(list);
 				list->mtl_error = ret;
 				return (-1);
 			}
 			if (hint_dontsearch == 0) {
 				mtp = memstat_mtl_find(list, ALLOCATOR_UMA,
 				    name);
 			} else
 				mtp = NULL;
 			if (mtp == NULL)
 				mtp = _memstat_mt_allocate(list, ALLOCATOR_UMA,
 				    name, mp_maxid + 1);
 			if (mtp == NULL) {
 				free(ucp_array);
 				_memstat_mtl_empty(list);
 				list->mtl_error = MEMSTAT_ERROR_NOMEMORY;
 				return (-1);
 			}
 			/*
 			 * Reset the statistics on a current node.
 			 */
 			_memstat_mt_reset_stats(mtp, mp_maxid + 1);
 			mtp->mt_numallocs = kvm_counter_u64_fetch(kvm,
 			    (unsigned long )uz.uz_allocs);
 			mtp->mt_numfrees = kvm_counter_u64_fetch(kvm,
 			    (unsigned long )uz.uz_frees);
 			mtp->mt_failures = kvm_counter_u64_fetch(kvm,
 			    (unsigned long )uz.uz_fails);
 			mtp->mt_sleeps = uz.uz_sleeps;
-
 			/* See comment above in memstat_sysctl_uma(). */
 			if (mtp->mt_numallocs < mtp->mt_numfrees)
 				mtp->mt_numallocs = mtp->mt_numfrees;
 
+			mtp->mt_xdomain = uz.uz_xdomain;
 			if (kz.uk_flags & UMA_ZFLAG_INTERNAL)
 				goto skip_percpu;
 			for (i = 0; i < mp_maxid + 1; i++) {
 				if (!CPU_ISSET(i, &all_cpus))
 					continue;
 				ucp = &ucp_array[i];
 				mtp->mt_numallocs += ucp->uc_allocs;
 				mtp->mt_numfrees += ucp->uc_frees;
 
 				if (ucp->uc_allocbucket != NULL) {
 					ret = kread(kvm, ucp->uc_allocbucket,
 					    &ub, sizeof(ub), 0);
 					if (ret != 0) {
 						free(ucp_array);
 						_memstat_mtl_empty(list);
 						list->mtl_error = ret;
 						return (-1);
 					}
 					mtp->mt_free += ub.ub_cnt;
 				}
 				if (ucp->uc_freebucket != NULL) {
 					ret = kread(kvm, ucp->uc_freebucket,
 					    &ub, sizeof(ub), 0);
 					if (ret != 0) {
 						free(ucp_array);
 						_memstat_mtl_empty(list);
 						list->mtl_error = ret;
 						return (-1);
 					}
 					mtp->mt_free += ub.ub_cnt;
 				}
 			}
 skip_percpu:
 			mtp->mt_size = kz.uk_size;
 			mtp->mt_rsize = kz.uk_rsize;
 			mtp->mt_memalloced = mtp->mt_numallocs * mtp->mt_size;
 			mtp->mt_memfreed = mtp->mt_numfrees * mtp->mt_size;
 			mtp->mt_bytes = mtp->mt_memalloced - mtp->mt_memfreed;
 			mtp->mt_countlimit = uz.uz_max_items;
 			mtp->mt_byteslimit = mtp->mt_countlimit * mtp->mt_size;
 			mtp->mt_count = mtp->mt_numallocs - mtp->mt_numfrees;
 			for (i = 0; i < ndomains; i++) {
 				ret = kread(kvm, &uz.uz_domain[i], &uzd,
 				   sizeof(uzd), 0);
 				for (ubp =
 				    LIST_FIRST(&uzd.uzd_buckets);
 				    ubp != NULL;
 				    ubp = LIST_NEXT(&ub, ub_link)) {
 					ret = kread(kvm, ubp, &ub,
 					   sizeof(ub), 0);
 					mtp->mt_zonefree += ub.ub_cnt;
 				}
 			}
 			if (!((kz.uk_flags & UMA_ZONE_SECONDARY) &&
 			    LIST_FIRST(&kz.uk_zones) != uzp)) {
 				mtp->mt_kegfree = kz.uk_free;
 				mtp->mt_free += mtp->mt_kegfree;
 			}
 			mtp->mt_free += mtp->mt_zonefree;
 		}
 	}
 	free(ucp_array);
 	return (0);
 }
Index: head/sys/conf/options
===================================================================
--- head/sys/conf/options	(revision 350658)
+++ head/sys/conf/options	(revision 350659)
@@ -1,1030 +1,1032 @@
 # $FreeBSD$
 #
 #        On the handling of kernel options
 #
 # All kernel options should be listed in NOTES, with suitable
 # descriptions.  Negative options (options that make some code not
 # compile) should be commented out; LINT (generated from NOTES) should
 # compile as much code as possible.  Try to structure option-using
 # code so that a single option only switch code on, or only switch
 # code off, to make it possible to have a full compile-test.  If
 # necessary, you can check for COMPILING_LINT to get maximum code
 # coverage.
 #
 # All new options shall also be listed in either "conf/options" or
 # "conf/options.<machine>".  Options that affect a single source-file
 # <xxx>.[c|s] should be directed into "opt_<xxx>.h", while options
 # that affect multiple files should either go in "opt_global.h" if
 # this is a kernel-wide option (used just about everywhere), or in
 # "opt_<option-name-in-lower-case>.h" if it affects only some files.
 # Note that the effect of listing only an option without a
 # header-file-name in conf/options (and cousins) is that the last
 # convention is followed.
 #
 # This handling scheme is not yet fully implemented.
 #
 #
 # Format of this file:
 # Option name	filename
 #
 # If filename is missing, the default is
 # opt_<name-of-option-in-lower-case>.h
 
 AAC_DEBUG		opt_aac.h
 AACRAID_DEBUG		opt_aacraid.h
 AHC_ALLOW_MEMIO		opt_aic7xxx.h
 AHC_TMODE_ENABLE	opt_aic7xxx.h
 AHC_DUMP_EEPROM		opt_aic7xxx.h
 AHC_DEBUG		opt_aic7xxx.h
 AHC_DEBUG_OPTS		opt_aic7xxx.h
 AHC_REG_PRETTY_PRINT	opt_aic7xxx.h
 AHD_DEBUG		opt_aic79xx.h
 AHD_DEBUG_OPTS		opt_aic79xx.h
 AHD_TMODE_ENABLE	opt_aic79xx.h	
 AHD_REG_PRETTY_PRINT	opt_aic79xx.h
 
 TWA_DEBUG		opt_twa.h
 
 # Debugging options.
 ALT_BREAK_TO_DEBUGGER	opt_kdb.h
 BREAK_TO_DEBUGGER	opt_kdb.h
 BUF_TRACKING		opt_global.h
 DDB
 DDB_BUFR_SIZE	opt_ddb.h
 DDB_CAPTURE_DEFAULTBUFSIZE	opt_ddb.h
 DDB_CAPTURE_MAXBUFSIZE	opt_ddb.h
 DDB_CTF		opt_ddb.h
 DDB_NUMSYM	opt_ddb.h
 FULL_BUF_TRACKING	opt_global.h
 GDB
 KDB		opt_global.h
 KDB_TRACE	opt_kdb.h
 KDB_UNATTENDED	opt_kdb.h
 KLD_DEBUG	opt_kld.h
 SYSCTL_DEBUG	opt_sysctl.h
 EARLY_PRINTF	opt_global.h
 TEXTDUMP_PREFERRED	opt_ddb.h
 TEXTDUMP_VERBOSE	opt_ddb.h
 NUM_CORE_FILES	opt_global.h
 TSLOG	opt_global.h
 TSLOGSIZE	opt_global.h
 
 # Miscellaneous options.
 ALQ
 ALTERA_SDCARD_FAST_SIM	opt_altera_sdcard.h
 ATSE_CFI_HACK	opt_cfi.h
 AUDIT		opt_global.h
 BOOTHOWTO	opt_global.h
 BOOTVERBOSE	opt_global.h
 CALLOUT_PROFILING
 CAPABILITIES	opt_capsicum.h
 CAPABILITY_MODE	opt_capsicum.h
 COMPAT_43	opt_global.h
 COMPAT_43TTY	opt_global.h
 COMPAT_FREEBSD4	opt_global.h
 COMPAT_FREEBSD5	opt_global.h
 COMPAT_FREEBSD6	opt_global.h
 COMPAT_FREEBSD7	opt_global.h
 COMPAT_FREEBSD9	opt_global.h
 COMPAT_FREEBSD10	opt_global.h
 COMPAT_FREEBSD11	opt_global.h
 COMPAT_FREEBSD12	opt_global.h
 COMPAT_CLOUDABI32	opt_dontuse.h
 COMPAT_CLOUDABI64	opt_dontuse.h
 COMPAT_LINUXKPI	opt_dontuse.h
 _COMPAT_LINUX32	opt_compat.h	# XXX: make sure opt_compat.h exists
 COMPILING_LINT	opt_global.h
 CY_PCI_FASTINTR
 DEADLKRES	opt_watchdog.h
 EXPERIMENTAL	opt_global.h
 EXT_RESOURCES	opt_global.h
 DIRECTIO
 FILEMON		opt_dontuse.h
 FFCLOCK
 FULL_PREEMPTION	opt_sched.h
 GZIO		opt_gzio.h
 IMAGACT_BINMISC		opt_dontuse.h
 IPI_PREEMPTION	opt_sched.h
 GEOM_BDE	opt_geom.h
 GEOM_BSD	opt_geom.h
 GEOM_CACHE	opt_geom.h
 GEOM_CONCAT	opt_geom.h
 GEOM_ELI	opt_geom.h
 GEOM_FOX	opt_geom.h
 GEOM_GATE	opt_geom.h
 GEOM_JOURNAL	opt_geom.h
 GEOM_LABEL	opt_geom.h
 GEOM_LABEL_GPT	opt_geom.h
 GEOM_LINUX_LVM	opt_geom.h
 GEOM_MAP	opt_geom.h
 GEOM_MBR	opt_geom.h
 GEOM_MIRROR	opt_geom.h
 GEOM_MOUNTVER	opt_geom.h
 GEOM_MULTIPATH	opt_geom.h
 GEOM_NOP	opt_geom.h
 GEOM_PART_APM	opt_geom.h
 GEOM_PART_BSD	opt_geom.h
 GEOM_PART_BSD64	opt_geom.h
 GEOM_PART_EBR	opt_geom.h
 GEOM_PART_EBR_COMPAT	opt_geom.h
 GEOM_PART_GPT	opt_geom.h
 GEOM_PART_LDM	opt_geom.h
 GEOM_PART_MBR	opt_geom.h
 GEOM_PART_VTOC8	opt_geom.h
 GEOM_RAID	opt_geom.h
 GEOM_RAID3	opt_geom.h
 GEOM_SHSEC	opt_geom.h
 GEOM_STRIPE	opt_geom.h
 GEOM_SUNLABEL	opt_geom.h
 GEOM_UZIP	opt_geom.h
 GEOM_UZIP_DEBUG	opt_geom.h
 GEOM_VINUM	opt_geom.h
 GEOM_VIRSTOR	opt_geom.h
 GEOM_VOL	opt_geom.h
 GEOM_ZERO	opt_geom.h
 IFLIB		opt_iflib.h
 KDTRACE_HOOKS	opt_global.h
 KDTRACE_FRAME	opt_kdtrace.h
 KN_HASHSIZE	opt_kqueue.h
 KSTACK_MAX_PAGES
 KSTACK_PAGES
 KSTACK_USAGE_PROF
 KTRACE
 KTRACE_REQUEST_POOL	opt_ktrace.h
 LIBICONV
 MAC		opt_global.h
 MAC_BIBA	opt_dontuse.h
 MAC_BSDEXTENDED	opt_dontuse.h
 MAC_IFOFF	opt_dontuse.h
 MAC_LOMAC	opt_dontuse.h
 MAC_MLS		opt_dontuse.h
 MAC_NONE	opt_dontuse.h
 MAC_NTPD	opt_dontuse.h
 MAC_PARTITION	opt_dontuse.h
 MAC_PORTACL	opt_dontuse.h
 MAC_SEEOTHERUIDS	opt_dontuse.h
 MAC_STATIC	opt_mac.h
 MAC_STUB	opt_dontuse.h
 MAC_TEST	opt_dontuse.h
 MAC_VERIEXEC	opt_dontuse.h
 MAC_VERIEXEC_SHA1	opt_dontuse.h
 MAC_VERIEXEC_SHA256	opt_dontuse.h
 MAC_VERIEXEC_SHA384	opt_dontuse.h
 MAC_VERIEXEC_SHA512	opt_dontuse.h
 MD_ROOT		opt_md.h
 MD_ROOT_FSTYPE	opt_md.h
 MD_ROOT_READONLY	opt_md.h
 MD_ROOT_SIZE	opt_md.h
 MD_ROOT_MEM	opt_md.h
 MFI_DEBUG	opt_mfi.h
 MFI_DECODE_LOG	opt_mfi.h
 MPROF_BUFFERS	opt_mprof.h
 MPROF_HASH_SIZE	opt_mprof.h
 NEW_PCIB	opt_global.h
 NO_ADAPTIVE_MUTEXES	opt_adaptive_mutexes.h
 NO_ADAPTIVE_RWLOCKS
 NO_ADAPTIVE_SX
 NO_EVENTTIMERS		opt_timer.h
 NO_OBSOLETE_CODE	opt_global.h
 NO_SYSCTL_DESCR	opt_global.h
 NSWBUF_MIN	opt_param.h
 MBUF_PACKET_ZONE_DISABLE	opt_global.h
 PANIC_REBOOT_WAIT_TIME	opt_panic.h
 PCI_HP		opt_pci.h
 PCI_IOV		opt_global.h
 PPC_DEBUG	opt_ppc.h
 PPC_PROBE_CHIPSET	opt_ppc.h
 PPS_SYNC	opt_ntp.h
 PREEMPTION	opt_sched.h
 QUOTA
 SCHED_4BSD	opt_sched.h
 SCHED_STATS	opt_sched.h
 SCHED_ULE	opt_sched.h
 SLEEPQUEUE_PROFILING
 SLHCI_DEBUG	opt_slhci.h
 STACK		opt_stack.h
 SUIDDIR
 MSGMNB		opt_sysvipc.h
 MSGMNI		opt_sysvipc.h
 MSGSEG		opt_sysvipc.h
 MSGSSZ		opt_sysvipc.h
 MSGTQL		opt_sysvipc.h
 SEMMNI		opt_sysvipc.h
 SEMMNS		opt_sysvipc.h
 SEMMNU		opt_sysvipc.h
 SEMMSL		opt_sysvipc.h
 SEMOPM		opt_sysvipc.h
 SEMUME		opt_sysvipc.h
 SHMALL		opt_sysvipc.h
 SHMMAX		opt_sysvipc.h
 SHMMAXPGS	opt_sysvipc.h
 SHMMIN		opt_sysvipc.h
 SHMMNI		opt_sysvipc.h
 SHMSEG		opt_sysvipc.h
 SYSVMSG		opt_sysvipc.h
 SYSVSEM		opt_sysvipc.h
 SYSVSHM		opt_sysvipc.h
 SW_WATCHDOG	opt_watchdog.h
 TCPHPTS         opt_inet.h
 TURNSTILE_PROFILING
 UMTX_PROFILING
 UMTX_CHAINS	opt_global.h
 VERBOSE_SYSINIT
 ZSTDIO		opt_zstdio.h
 
 # Sanitizers
 COVERAGE	opt_global.h
 KCOV
 KUBSAN		opt_global.h
 
 # POSIX kernel options
 P1003_1B_MQUEUE			opt_posix.h
 P1003_1B_SEMAPHORES		opt_posix.h
 _KPOSIX_PRIORITY_SCHEDULING	opt_posix.h
 
 # Do we want the config file compiled into the kernel?
 INCLUDE_CONFIG_FILE	opt_config.h
 
 # Options for static filesystems.  These should only be used at config
 # time, since the corresponding lkms cannot work if there are any static
 # dependencies.  Unusability is enforced by hiding the defines for the
 # options in a never-included header.
 AUTOFS		opt_dontuse.h
 CD9660		opt_dontuse.h
 EXT2FS		opt_dontuse.h
 FDESCFS		opt_dontuse.h
 FFS		opt_dontuse.h
 FUSEFS		opt_dontuse.h
 MSDOSFS		opt_dontuse.h
 NULLFS		opt_dontuse.h
 PROCFS		opt_dontuse.h
 PSEUDOFS	opt_dontuse.h
 SMBFS		opt_dontuse.h
 TMPFS		opt_dontuse.h
 UDF		opt_dontuse.h
 UNIONFS		opt_dontuse.h
 ZFS		opt_dontuse.h
 
 # Pseudofs debugging
 PSEUDOFS_TRACE	opt_pseudofs.h
 
 # In-kernel GSS-API
 KGSSAPI		opt_kgssapi.h
 KGSSAPI_DEBUG	opt_kgssapi.h
 
 # These static filesystems have one slightly bogus static dependency in
 # sys/i386/i386/autoconf.c.  If any of these filesystems are
 # statically compiled into the kernel, code for mounting them as root
 # filesystems will be enabled - but look below.
 # NFSCL - client
 # NFSD - server
 NFSCL		opt_nfs.h
 NFSD		opt_nfs.h
 
 # filesystems and libiconv bridge
 CD9660_ICONV	opt_dontuse.h
 MSDOSFS_ICONV	opt_dontuse.h
 UDF_ICONV	opt_dontuse.h
 
 # If you are following the conditions in the copyright,
 # you can enable soft-updates which will speed up a lot of thigs
 # and make the system safer from crashes at the same time.
 # otherwise a STUB module will be compiled in.
 SOFTUPDATES	opt_ffs.h
 
 # On small, embedded systems, it can be useful to turn off support for
 # snapshots.  It saves about 30-40k for a feature that would be lightly
 # used, if it is used at all.
 NO_FFS_SNAPSHOT	opt_ffs.h
 
 # Enabling this option turns on support for Access Control Lists in UFS,
 # which can be used to support high security configurations.  Depends on
 # UFS_EXTATTR.
 UFS_ACL		opt_ufs.h
 
 # Enabling this option turns on support for extended attributes in UFS-based
 # filesystems, which can be used to support high security configurations
 # as well as new filesystem features.
 UFS_EXTATTR	opt_ufs.h
 UFS_EXTATTR_AUTOSTART	opt_ufs.h
 
 # Enable fast hash lookups for large directories on UFS-based filesystems.
 UFS_DIRHASH	opt_ufs.h
 
 # Enable gjournal-based UFS journal.
 UFS_GJOURNAL	opt_ufs.h
 
 # The below sentence is not in English, and neither is this one.
 # We plan to remove the static dependences above, with a
 # <filesystem>_ROOT option to control if it usable as root.  This list
 # allows these options to be present in config files already (though
 # they won't make any difference yet).
 NFS_ROOT	opt_nfsroot.h
 
 # SMB/CIFS requester
 NETSMB		opt_netsmb.h
 
 # Enable netdump(4) client support.
 NETDUMP 	opt_global.h
 
 # Options used only in subr_param.c.
 HZ		opt_param.h
 MAXFILES	opt_param.h
 NBUF		opt_param.h
 NSFBUFS		opt_param.h
 VM_BCACHE_SIZE_MAX	opt_param.h
 VM_SWZONE_SIZE_MAX	opt_param.h
 MAXUSERS
 DFLDSIZ		opt_param.h
 MAXDSIZ		opt_param.h
 MAXSSIZ		opt_param.h
 
 # Generic SCSI options.
 CAM_MAX_HIGHPOWER	opt_cam.h
 CAMDEBUG		opt_cam.h
 CAM_DEBUG_COMPILE	opt_cam.h
 CAM_DEBUG_DELAY		opt_cam.h
 CAM_DEBUG_BUS		opt_cam.h
 CAM_DEBUG_TARGET	opt_cam.h
 CAM_DEBUG_LUN		opt_cam.h
 CAM_DEBUG_FLAGS		opt_cam.h
 CAM_BOOT_DELAY		opt_cam.h
 CAM_IOSCHED_DYNAMIC	opt_cam.h
 CAM_TEST_FAILURE	opt_cam.h
 SCSI_DELAY		opt_scsi.h
 SCSI_NO_SENSE_STRINGS	opt_scsi.h
 SCSI_NO_OP_STRINGS	opt_scsi.h
 
 # Options used only in cam/ata/ata_da.c
 ATA_STATIC_ID		opt_ada.h
 
 # Options used only in cam/scsi/scsi_cd.c
 CHANGER_MIN_BUSY_SECONDS	opt_cd.h
 CHANGER_MAX_BUSY_SECONDS	opt_cd.h
 
 # Options used only in cam/scsi/scsi_da.c
 DA_TRACK_REFS		opt_da.h
 
 # Options used only in cam/scsi/scsi_sa.c.
 SA_IO_TIMEOUT		opt_sa.h
 SA_SPACE_TIMEOUT	opt_sa.h
 SA_REWIND_TIMEOUT	opt_sa.h
 SA_ERASE_TIMEOUT	opt_sa.h
 SA_1FM_AT_EOD		opt_sa.h
 
 # Options used only in cam/scsi/scsi_pt.c
 SCSI_PT_DEFAULT_TIMEOUT	opt_pt.h
 
 # Options used only in cam/scsi/scsi_ses.c
 SES_ENABLE_PASSTHROUGH	opt_ses.h
 
 # Options used in dev/sym/ (Symbios SCSI driver).
 SYM_SETUP_SCSI_DIFF	opt_sym.h	#-HVD support for 825a, 875, 885
 					# disabled:0 (default), enabled:1
 SYM_SETUP_PCI_PARITY	opt_sym.h	#-PCI parity checking
 					# disabled:0, enabled:1 (default)
 SYM_SETUP_MAX_LUN	opt_sym.h	#-Number of LUNs supported
 					# default:8, range:[1..64]
 
 # Options used only in dev/isp/*
 ISP_TARGET_MODE		opt_isp.h
 ISP_FW_CRASH_DUMP	opt_isp.h
 ISP_DEFAULT_ROLES	opt_isp.h
 ISP_INTERNAL_TARGET	opt_isp.h
 ISP_FCTAPE_OFF		opt_isp.h
 
 # Options used only in dev/iscsi
 ISCSI_INITIATOR_DEBUG	opt_iscsi_initiator.h
 
 # Net stuff.
 ACCEPT_FILTER_DATA
 ACCEPT_FILTER_DNS
 ACCEPT_FILTER_HTTP
 ALTQ			opt_global.h
 ALTQ_CBQ		opt_altq.h
 ALTQ_CDNR		opt_altq.h
 ALTQ_CODEL		opt_altq.h
 ALTQ_DEBUG		opt_altq.h
 ALTQ_HFSC		opt_altq.h
 ALTQ_FAIRQ		opt_altq.h
 ALTQ_NOPCC		opt_altq.h
 ALTQ_PRIQ		opt_altq.h
 ALTQ_RED		opt_altq.h
 ALTQ_RIO		opt_altq.h
 BOOTP			opt_bootp.h
 BOOTP_BLOCKSIZE		opt_bootp.h
 BOOTP_COMPAT		opt_bootp.h
 BOOTP_NFSROOT		opt_bootp.h
 BOOTP_NFSV3		opt_bootp.h
 BOOTP_WIRED_TO		opt_bootp.h
 DEVICE_POLLING
 DUMMYNET		opt_ipdn.h
 RATELIMIT		opt_ratelimit.h
 RATELIMIT_DEBUG		opt_ratelimit.h
 INET			opt_inet.h
 INET6			opt_inet6.h
 IPDIVERT
 IPFILTER		opt_ipfilter.h
 IPFILTER_DEFAULT_BLOCK	opt_ipfilter.h
 IPFILTER_LOG		opt_ipfilter.h
 IPFILTER_LOOKUP		opt_ipfilter.h
 IPFIREWALL		opt_ipfw.h
 IPFIREWALL_DEFAULT_TO_ACCEPT	opt_ipfw.h
 IPFIREWALL_NAT		opt_ipfw.h
 IPFIREWALL_NAT64	opt_ipfw.h
 IPFIREWALL_NPTV6	opt_ipfw.h
 IPFIREWALL_VERBOSE	opt_ipfw.h
 IPFIREWALL_VERBOSE_LIMIT	opt_ipfw.h
 IPFIREWALL_PMOD		opt_ipfw.h
 IPSEC			opt_ipsec.h
 IPSEC_DEBUG		opt_ipsec.h
 IPSEC_SUPPORT		opt_ipsec.h
 IPSTEALTH
 KRPC
 LIBALIAS
 LIBMCHAIN
 MBUF_PROFILING
 MBUF_STRESS_TEST
 MROUTING		opt_mrouting.h
 NFSLOCKD
 PCBGROUP		opt_pcbgroup.h
 PF_DEFAULT_TO_DROP	opt_pf.h
 RADIX_MPATH		opt_mpath.h
 ROUTETABLES		opt_route.h
 RSS			opt_rss.h
 SLIP_IFF_OPTS		opt_slip.h
 TCPDEBUG
 TCPPCAP		opt_global.h
 SIFTR
 TCP_BLACKBOX		opt_global.h
 TCP_HHOOK		opt_inet.h
 TCP_OFFLOAD		opt_inet.h # Enable code to dispatch TCP offloading
 TCP_RFC7413		opt_inet.h
 TCP_RFC7413_MAX_KEYS	opt_inet.h
 TCP_RFC7413_MAX_PSKS	opt_inet.h
 TCP_SIGNATURE		opt_ipsec.h
 VLAN_ARRAY		opt_vlan.h
 XBONEHACK
 
 #
 # SCTP
 #
 SCTP			opt_sctp.h
 SCTP_DEBUG		opt_sctp.h # Enable debug printfs
 SCTP_LOCK_LOGGING	opt_sctp.h # Log to KTR lock activity
 SCTP_MBUF_LOGGING	opt_sctp.h # Log to KTR general mbuf aloc/free
 SCTP_MBCNT_LOGGING	opt_sctp.h # Log to KTR mbcnt activity
 SCTP_PACKET_LOGGING	opt_sctp.h # Log to a packet buffer last N packets
 SCTP_LTRACE_CHUNKS	opt_sctp.h # Log to KTR chunks processed
 SCTP_LTRACE_ERRORS	opt_sctp.h # Log to KTR error returns.
 SCTP_USE_PERCPU_STAT	opt_sctp.h # Use per cpu stats.
 SCTP_MCORE_INPUT	opt_sctp.h # Have multiple input threads for input mbufs
 SCTP_LOCAL_TRACE_BUF	opt_sctp.h # Use tracebuffer exported via sysctl
 SCTP_DETAILED_STR_STATS	opt_sctp.h # Use per PR-SCTP policy stream stats
 #
 #
 #
 
 # Netgraph(4). Use option NETGRAPH to enable the base netgraph code.
 # Each netgraph node type can be either be compiled into the kernel
 # or loaded dynamically. To get the former, include the corresponding
 # option below. Each type has its own man page, e.g. ng_async(4).
 NETGRAPH
 NETGRAPH_DEBUG		opt_netgraph.h
 NETGRAPH_ASYNC		opt_netgraph.h
 NETGRAPH_ATMLLC		opt_netgraph.h
 NETGRAPH_ATM_ATMPIF	opt_netgraph.h
 NETGRAPH_BLUETOOTH	opt_netgraph.h
 NETGRAPH_BLUETOOTH_BT3C	opt_netgraph.h
 NETGRAPH_BLUETOOTH_H4	opt_netgraph.h
 NETGRAPH_BLUETOOTH_HCI	opt_netgraph.h
 NETGRAPH_BLUETOOTH_L2CAP	opt_netgraph.h
 NETGRAPH_BLUETOOTH_SOCKET	opt_netgraph.h
 NETGRAPH_BLUETOOTH_UBT	opt_netgraph.h
 NETGRAPH_BLUETOOTH_UBTBCMFW	opt_netgraph.h
 NETGRAPH_BPF		opt_netgraph.h
 NETGRAPH_BRIDGE		opt_netgraph.h
 NETGRAPH_CAR		opt_netgraph.h
 NETGRAPH_CHECKSUM	opt_netgraph.h
 NETGRAPH_CISCO		opt_netgraph.h
 NETGRAPH_DEFLATE	opt_netgraph.h
 NETGRAPH_DEVICE		opt_netgraph.h
 NETGRAPH_ECHO		opt_netgraph.h
 NETGRAPH_EIFACE		opt_netgraph.h
 NETGRAPH_ETHER		opt_netgraph.h
 NETGRAPH_ETHER_ECHO	opt_netgraph.h
 NETGRAPH_FEC		opt_netgraph.h
 NETGRAPH_FRAME_RELAY	opt_netgraph.h
 NETGRAPH_GIF		opt_netgraph.h
 NETGRAPH_GIF_DEMUX	opt_netgraph.h
 NETGRAPH_HOLE		opt_netgraph.h
 NETGRAPH_IFACE		opt_netgraph.h
 NETGRAPH_IP_INPUT	opt_netgraph.h
 NETGRAPH_IPFW		opt_netgraph.h
 NETGRAPH_KSOCKET	opt_netgraph.h
 NETGRAPH_L2TP		opt_netgraph.h
 NETGRAPH_LMI		opt_netgraph.h
 NETGRAPH_MPPC_COMPRESSION	opt_netgraph.h
 NETGRAPH_MPPC_ENCRYPTION	opt_netgraph.h
 NETGRAPH_NAT		opt_netgraph.h
 NETGRAPH_NETFLOW	opt_netgraph.h
 NETGRAPH_ONE2MANY	opt_netgraph.h
 NETGRAPH_PATCH		opt_netgraph.h
 NETGRAPH_PIPE		opt_netgraph.h
 NETGRAPH_PPP		opt_netgraph.h
 NETGRAPH_PPPOE		opt_netgraph.h
 NETGRAPH_PPTPGRE	opt_netgraph.h
 NETGRAPH_PRED1		opt_netgraph.h
 NETGRAPH_RFC1490	opt_netgraph.h
 NETGRAPH_SOCKET		opt_netgraph.h
 NETGRAPH_SPLIT		opt_netgraph.h
 NETGRAPH_SPPP		opt_netgraph.h
 NETGRAPH_TAG		opt_netgraph.h
 NETGRAPH_TCPMSS		opt_netgraph.h
 NETGRAPH_TEE		opt_netgraph.h
 NETGRAPH_TTY		opt_netgraph.h
 NETGRAPH_UI		opt_netgraph.h
 NETGRAPH_VJC		opt_netgraph.h
 NETGRAPH_VLAN		opt_netgraph.h
 
 # NgATM options
 NGATM_ATM		opt_netgraph.h
 NGATM_ATMBASE		opt_netgraph.h
 NGATM_SSCOP		opt_netgraph.h
 NGATM_SSCFU		opt_netgraph.h
 NGATM_UNI		opt_netgraph.h
 NGATM_CCATM		opt_netgraph.h
 
 # DRM options
 DRM_DEBUG		opt_drm.h
 
 TI_SF_BUF_JUMBO		opt_ti.h
 TI_JUMBO_HDRSPLIT	opt_ti.h
 
 # Misc debug flags.  Most of these should probably be replaced with
 # 'DEBUG', and then let people recompile just the interesting modules
 # with 'make CC="cc -DDEBUG"'.
 CLUSTERDEBUG		opt_debug_cluster.h
 DEBUG_1284		opt_ppb_1284.h
 VP0_DEBUG		opt_vpo.h
 LPT_DEBUG		opt_lpt.h
 PLIP_DEBUG		opt_plip.h
 LOCKF_DEBUG		opt_debug_lockf.h
 SI_DEBUG		opt_debug_si.h
 IFMEDIA_DEBUG		opt_ifmedia.h
 
 # Fb options
 FB_DEBUG		opt_fb.h
 FB_INSTALL_CDEV		opt_fb.h
 
 # ppbus related options
 PERIPH_1284		opt_ppb_1284.h
 DONTPROBE_1284		opt_ppb_1284.h
 
 # smbus related options
 ENABLE_ALART		opt_intpm.h
 
 # These cause changes all over the kernel
 BLKDEV_IOSIZE		opt_global.h
 BURN_BRIDGES		opt_global.h
 DEBUG			opt_global.h
 DEBUG_LOCKS		opt_global.h
 DEBUG_VFS_LOCKS		opt_global.h
 DFLTPHYS		opt_global.h
 DIAGNOSTIC		opt_global.h
 INVARIANT_SUPPORT	opt_global.h
 INVARIANTS		opt_global.h
 KASSERT_PANIC_OPTIONAL	opt_global.h
 MAXCPU			opt_global.h
 MAXMEMDOM		opt_global.h
 MAXPHYS			opt_global.h
 MCLSHIFT		opt_global.h
 MUTEX_NOINLINE		opt_global.h
 LOCK_PROFILING		opt_global.h
 LOCK_PROFILING_FAST	opt_global.h
 MSIZE			opt_global.h
 REGRESSION		opt_global.h
 RWLOCK_NOINLINE		opt_global.h
 SX_NOINLINE		opt_global.h
 VFS_BIO_DEBUG		opt_global.h
 
 # These are VM related options
 VM_KMEM_SIZE		opt_vm.h
 VM_KMEM_SIZE_SCALE	opt_vm.h
 VM_KMEM_SIZE_MAX	opt_vm.h
 VM_NRESERVLEVEL		opt_vm.h
 VM_LEVEL_0_ORDER	opt_vm.h
 NO_SWAPPING		opt_vm.h
 MALLOC_MAKE_FAILURES	opt_vm.h
 MALLOC_PROFILE		opt_vm.h
 MALLOC_DEBUG_MAXZONES	opt_vm.h
+UMA_XDOMAIN		opt_vm.h
+UMA_FIRSTTOUCH		opt_vm.h
 
 # The MemGuard replacement allocator used for tamper-after-free detection
 DEBUG_MEMGUARD		opt_vm.h
 
 # The RedZone malloc(9) protection
 DEBUG_REDZONE		opt_vm.h
 
 # Standard SMP options
 EARLY_AP_STARTUP	opt_global.h
 SMP			opt_global.h
 NUMA			opt_global.h
 
 # Size of the kernel message buffer
 MSGBUF_SIZE		opt_msgbuf.h
 
 # NFS options
 NFS_MINATTRTIMO		opt_nfs.h
 NFS_MAXATTRTIMO		opt_nfs.h
 NFS_MINDIRATTRTIMO	opt_nfs.h
 NFS_MAXDIRATTRTIMO	opt_nfs.h
 NFS_DEBUG		opt_nfs.h
 
 # TMPFS options
 TMPFS_PAGES_MINRESERVED		opt_tmpfs.h
 
 # For the Bt848/Bt848A/Bt849/Bt878/Bt879 driver
 OVERRIDE_CARD			opt_bktr.h
 OVERRIDE_TUNER			opt_bktr.h
 OVERRIDE_DBX			opt_bktr.h
 OVERRIDE_MSP			opt_bktr.h
 BROOKTREE_SYSTEM_DEFAULT	opt_bktr.h
 BROOKTREE_ALLOC_PAGES		opt_bktr.h
 BKTR_OVERRIDE_CARD		opt_bktr.h
 BKTR_OVERRIDE_TUNER		opt_bktr.h
 BKTR_OVERRIDE_DBX		opt_bktr.h
 BKTR_OVERRIDE_MSP		opt_bktr.h
 BKTR_SYSTEM_DEFAULT		opt_bktr.h
 BKTR_ALLOC_PAGES		opt_bktr.h
 BKTR_USE_PLL			opt_bktr.h	
 BKTR_GPIO_ACCESS		opt_bktr.h
 BKTR_NO_MSP_RESET		opt_bktr.h
 BKTR_430_FX_MODE		opt_bktr.h
 BKTR_SIS_VIA_MODE		opt_bktr.h
 BKTR_USE_FREEBSD_SMBUS		opt_bktr.h
 BKTR_NEW_MSP34XX_DRIVER		opt_bktr.h
 
 # Options for uart(4)
 UART_PPS_ON_CTS		opt_uart.h
 UART_POLL_FREQ		opt_uart.h
 UART_DEV_TOLERANCE_PCT	opt_uart.h
 
 # options for bus/device framework
 BUS_DEBUG		opt_bus.h
 
 # options for USB support
 USB_DEBUG		opt_usb.h
 USB_HOST_ALIGN		opt_usb.h
 USB_REQ_DEBUG		opt_usb.h
 USB_TEMPLATE		opt_usb.h
 USB_VERBOSE		opt_usb.h
 USB_DMA_SINGLE_ALLOC	opt_usb.h
 USB_EHCI_BIG_ENDIAN_DESC	opt_usb.h
 U3G_DEBUG		opt_u3g.h
 UKBD_DFLT_KEYMAP	opt_ukbd.h
 UPLCOM_INTR_INTERVAL	opt_uplcom.h
 UVSCOM_DEFAULT_OPKTSIZE	opt_uvscom.h
 UVSCOM_INTR_INTERVAL	opt_uvscom.h
 
 # options for the Realtek rtwn driver
 RTWN_DEBUG		opt_rtwn.h
 RTWN_WITHOUT_UCODE	opt_rtwn.h
 
 # Embedded system options
 INIT_PATH
 
 ROOTDEVNAME
 
 FDC_DEBUG 		opt_fdc.h
 PCFCLOCK_VERBOSE	opt_pcfclock.h
 PCFCLOCK_MAX_RETRIES	opt_pcfclock.h
 
 KTR			opt_global.h
 KTR_ALQ			opt_ktr.h
 KTR_MASK		opt_ktr.h
 KTR_CPUMASK		opt_ktr.h
 KTR_COMPILE		opt_global.h
 KTR_BOOT_ENTRIES	opt_global.h
 KTR_ENTRIES		opt_global.h
 KTR_VERBOSE		opt_ktr.h
 WITNESS			opt_global.h
 WITNESS_KDB		opt_witness.h
 WITNESS_NO_VNODE	opt_witness.h
 WITNESS_SKIPSPIN	opt_witness.h
 WITNESS_COUNT		opt_witness.h
 OPENSOLARIS_WITNESS	opt_global.h
 
 # options for ACPI support
 ACPI_DEBUG		opt_acpi.h
 ACPI_MAX_TASKS		opt_acpi.h
 ACPI_MAX_THREADS	opt_acpi.h
 ACPI_DMAR		opt_acpi.h
 DEV_ACPI		opt_acpi.h
 
 # ISA support
 DEV_ISA			opt_isa.h
 ISAPNP			opt_isa.h
 
 # various 'device presence' options.
 DEV_BPF			opt_bpf.h
 DEV_CARP		opt_carp.h
 DEV_NETMAP		opt_global.h
 DEV_PCI			opt_pci.h
 DEV_PF			opt_pf.h
 DEV_PFLOG		opt_pf.h
 DEV_PFSYNC		opt_pf.h
 DEV_SPLASH		opt_splash.h
 DEV_VLAN		opt_vlan.h
 
 # ed driver
 ED_HPP			opt_ed.h
 ED_3C503		opt_ed.h
 ED_SIC			opt_ed.h
 
 # bce driver
 BCE_DEBUG		opt_bce.h
 BCE_NVRAM_WRITE_SUPPORT	opt_bce.h
 
 SOCKBUF_DEBUG		opt_global.h
 
 
 # options for ubsec driver
 UBSEC_DEBUG		opt_ubsec.h
 UBSEC_RNDTEST		opt_ubsec.h
 UBSEC_NO_RNG		opt_ubsec.h
 
 # options for hifn driver
 HIFN_DEBUG		opt_hifn.h
 HIFN_RNDTEST		opt_hifn.h
 
 # options for safenet driver
 SAFE_DEBUG		opt_safe.h
 SAFE_NO_RNG		opt_safe.h
 SAFE_RNDTEST		opt_safe.h
 
 # syscons/vt options
 MAXCONS			opt_syscons.h
 SC_ALT_MOUSE_IMAGE	opt_syscons.h
 SC_CUT_SPACES2TABS	opt_syscons.h
 SC_CUT_SEPCHARS		opt_syscons.h
 SC_DEBUG_LEVEL		opt_syscons.h
 SC_DFLT_FONT		opt_syscons.h
 SC_DFLT_TERM		opt_syscons.h
 SC_DISABLE_KDBKEY	opt_syscons.h
 SC_DISABLE_REBOOT	opt_syscons.h
 SC_HISTORY_SIZE		opt_syscons.h
 SC_KERNEL_CONS_ATTR	opt_syscons.h
 SC_KERNEL_CONS_ATTRS	opt_syscons.h
 SC_KERNEL_CONS_REV_ATTR	opt_syscons.h
 SC_MOUSE_CHAR		opt_syscons.h
 SC_NO_CUTPASTE		opt_syscons.h
 SC_NO_FONT_LOADING	opt_syscons.h
 SC_NO_HISTORY		opt_syscons.h
 SC_NO_MODE_CHANGE	opt_syscons.h
 SC_NO_SUSPEND_VTYSWITCH	opt_syscons.h
 SC_NO_SYSMOUSE		opt_syscons.h
 SC_NO_TERM_DUMB		opt_syscons.h
 SC_NO_TERM_SC		opt_syscons.h
 SC_NO_TERM_TEKEN	opt_syscons.h
 SC_NORM_ATTR		opt_syscons.h
 SC_NORM_REV_ATTR	opt_syscons.h
 SC_PIXEL_MODE		opt_syscons.h
 SC_RENDER_DEBUG		opt_syscons.h
 SC_TWOBUTTON_MOUSE	opt_syscons.h
 VT_ALT_TO_ESC_HACK	opt_syscons.h
 VT_FB_DEFAULT_WIDTH	opt_syscons.h
 VT_FB_DEFAULT_HEIGHT	opt_syscons.h
 VT_MAXWINDOWS		opt_syscons.h
 VT_TWOBUTTON_MOUSE	opt_syscons.h
 DEV_SC			opt_syscons.h
 DEV_VT			opt_syscons.h
 
 # teken terminal emulator options
 TEKEN_CONS25		opt_teken.h
 TEKEN_UTF8		opt_teken.h
 TERMINAL_KERN_ATTR	opt_teken.h
 TERMINAL_NORM_ATTR	opt_teken.h
 
 # options for printf
 PRINTF_BUFR_SIZE	opt_printf.h
 BOOT_TAG		opt_printf.h
 BOOT_TAG_SZ		opt_printf.h
 
 # kbd options
 KBD_DISABLE_KEYMAP_LOAD	opt_kbd.h
 KBD_INSTALL_CDEV	opt_kbd.h
 KBD_MAXRETRY		opt_kbd.h
 KBD_MAXWAIT		opt_kbd.h
 KBD_RESETDELAY		opt_kbd.h
 KBDIO_DEBUG		opt_kbd.h
 
 KBDMUX_DFLT_KEYMAP	opt_kbdmux.h
 
 # options for the Atheros driver
 ATH_DEBUG		opt_ath.h
 ATH_TXBUF		opt_ath.h
 ATH_RXBUF		opt_ath.h
 ATH_DIAGAPI		opt_ath.h
 ATH_TX99_DIAG		opt_ath.h
 ATH_ENABLE_11N		opt_ath.h
 ATH_ENABLE_DFS		opt_ath.h
 ATH_EEPROM_FIRMWARE	opt_ath.h
 ATH_ENABLE_RADIOTAP_VENDOR_EXT	opt_ath.h
 ATH_DEBUG_ALQ		opt_ath.h
 ATH_KTR_INTR_DEBUG	opt_ath.h
 
 # options for the Atheros hal
 # XXX For now, this breaks non-AR9130 chipsets, so only use it
 # XXX when actually targeting AR9130.
 AH_SUPPORT_AR9130	opt_ah.h
 
 # This is required for AR933x SoC support
 AH_SUPPORT_AR9330	opt_ah.h
 AH_SUPPORT_AR9340	opt_ah.h
 AH_SUPPORT_QCA9530	opt_ah.h
 AH_SUPPORT_QCA9550	opt_ah.h
 
 AH_DEBUG		opt_ah.h
 AH_ASSERT		opt_ah.h
 AH_DEBUG_ALQ		opt_ah.h
 AH_REGOPS_FUNC		opt_ah.h
 AH_WRITE_REGDOMAIN	opt_ah.h
 AH_DEBUG_COUNTRY	opt_ah.h
 AH_WRITE_EEPROM		opt_ah.h
 AH_PRIVATE_DIAG		opt_ah.h
 AH_NEED_DESC_SWAP	opt_ah.h
 AH_USE_INIPDGAIN	opt_ah.h
 AH_MAXCHAN		opt_ah.h
 AH_RXCFG_SDMAMW_4BYTES	opt_ah.h
 AH_INTERRUPT_DEBUGGING	opt_ah.h
 # AR5416 and later interrupt mitigation
 # XXX do not use this for AR9130
 AH_AR5416_INTERRUPT_MITIGATION	opt_ah.h
 
 # options for the Altera mSGDMA driver (altera_msgdma)
 ALTERA_MSGDMA_DESC_STD		opt_altera_msgdma.h
 ALTERA_MSGDMA_DESC_EXT		opt_altera_msgdma.h
 ALTERA_MSGDMA_DESC_PF_STD	opt_altera_msgdma.h
 ALTERA_MSGDMA_DESC_PF_EXT	opt_altera_msgdma.h
 
 # options for the Broadcom BCM43xx driver (bwi)
 BWI_DEBUG		opt_bwi.h
 BWI_DEBUG_VERBOSE	opt_bwi.h
 
 # options for the Brodacom BCM43xx driver (bwn)
 BWN_DEBUG		opt_bwn.h
 BWN_GPL_PHY		opt_bwn.h
 BWN_USE_SIBA		opt_bwn.h
 
 # Options for the SIBA driver
 SIBA_DEBUG		opt_siba.h
 
 # options for the Marvell 8335 wireless driver
 MALO_DEBUG		opt_malo.h
 MALO_TXBUF		opt_malo.h
 MALO_RXBUF		opt_malo.h
 
 # options for the Marvell wireless driver
 MWL_DEBUG		opt_mwl.h
 MWL_TXBUF		opt_mwl.h
 MWL_RXBUF		opt_mwl.h
 MWL_DIAGAPI		opt_mwl.h
 MWL_AGGR_SIZE		opt_mwl.h
 MWL_TX_NODROP		opt_mwl.h
 
 # Options for the Marvell NETA driver
 MVNETA_MULTIQUEUE	opt_mvneta.h
 MVNETA_KTR		opt_mvneta.h
 
 # Options for the Intel 802.11ac wireless driver
 IWM_DEBUG		opt_iwm.h
 
 # Options for the Intel 802.11n wireless driver
 IWN_DEBUG		opt_iwn.h
 
 # Options for the Intel 3945ABG wireless driver
 WPI_DEBUG		opt_wpi.h
 
 # dcons options 
 DCONS_BUF_SIZE		opt_dcons.h
 DCONS_POLL_HZ		opt_dcons.h
 DCONS_FORCE_CONSOLE	opt_dcons.h
 DCONS_FORCE_GDB		opt_dcons.h
 
 # HWPMC options
 HWPMC_DEBUG		opt_global.h
 HWPMC_HOOKS
 HWPMC_MIPS_BACKTRACE 	opt_hwpmc_hooks.h
 
 # 802.11 support layer
 IEEE80211_DEBUG		opt_wlan.h
 IEEE80211_DEBUG_REFCNT	opt_wlan.h
 IEEE80211_SUPPORT_MESH	opt_wlan.h
 IEEE80211_SUPPORT_SUPERG	opt_wlan.h
 IEEE80211_SUPPORT_TDMA	opt_wlan.h
 IEEE80211_ALQ		opt_wlan.h
 IEEE80211_DFS_DEBUG	opt_wlan.h
 
 # 802.11 TDMA support
 TDMA_SLOTLEN_DEFAULT	opt_tdma.h
 TDMA_SLOTCNT_DEFAULT	opt_tdma.h
 TDMA_BINTVAL_DEFAULT	opt_tdma.h
 TDMA_TXRATE_11B_DEFAULT	opt_tdma.h
 TDMA_TXRATE_11G_DEFAULT	opt_tdma.h
 TDMA_TXRATE_11A_DEFAULT	opt_tdma.h
 TDMA_TXRATE_TURBO_DEFAULT	opt_tdma.h
 TDMA_TXRATE_HALF_DEFAULT	opt_tdma.h
 TDMA_TXRATE_QUARTER_DEFAULT	opt_tdma.h
 TDMA_TXRATE_11NA_DEFAULT	opt_tdma.h
 TDMA_TXRATE_11NG_DEFAULT	opt_tdma.h
 
 # VideoMode
 PICKMODE_DEBUG			opt_videomode.h
 
 # Network stack virtualization options
 VIMAGE			opt_global.h
 VNET_DEBUG		opt_global.h
 
 # Common Flash Interface (CFI) options
 CFI_SUPPORT_STRATAFLASH	opt_cfi.h
 CFI_ARMEDANDDANGEROUS	opt_cfi.h
 CFI_HARDWAREBYTESWAP	opt_cfi.h
 
 # Sound options
 SND_DEBUG		opt_snd.h
 SND_DIAGNOSTIC		opt_snd.h
 SND_FEEDER_MULTIFORMAT	opt_snd.h
 SND_FEEDER_FULL_MULTIFORMAT	opt_snd.h
 SND_FEEDER_RATE_HP	opt_snd.h
 SND_PCM_64		opt_snd.h
 SND_OLDSTEREO		opt_snd.h
 
 X86BIOS
 
 # Flattened device tree options
 FDT		opt_platform.h
 FDT_DTB_STATIC	opt_platform.h
 
 # OFED Infiniband stack
 OFED		opt_ofed.h
 OFED_DEBUG_INIT	opt_ofed.h
 SDP		opt_ofed.h
 SDP_DEBUG	opt_ofed.h
 IPOIB		opt_ofed.h
 IPOIB_DEBUG	opt_ofed.h
 IPOIB_CM	opt_ofed.h
 
 # Resource Accounting
 RACCT		opt_global.h
 RACCT_DEFAULT_TO_DISABLED	opt_global.h
 
 # Resource Limits
 RCTL		opt_global.h
 
 # Random number generator(s)
 # With this, no entropy processor is loaded, but the entropy
 # harvesting infrastructure is present. This means an entropy
 # processor may be loaded as a module.
 RANDOM_LOADABLE	opt_global.h
 # This turns on high-rate and potentially expensive harvesting in
 # the uma slab allocator.
 RANDOM_ENABLE_UMA	opt_global.h
 RANDOM_ENABLE_ETHER	opt_global.h
 
 # This options turns TPM into entropy source.
 TPM_HARVEST	opt_tpm.h
 
 # BHND(4) driver
 BHND_LOGLEVEL	opt_global.h
 
 # GPIO and child devices
 GPIO_SPI_DEBUG	opt_gpio.h
 
 # SPI devices
 SPIGEN_LEGACY_CDEVNAME	opt_spi.h
 
 # etherswitch(4) driver
 RTL8366_SOFT_RESET opt_etherswitch.h
 
 # evdev protocol support
 EVDEV_SUPPORT	opt_evdev.h
 EVDEV_DEBUG	opt_evdev.h
 UINPUT_DEBUG	opt_evdev.h
 
 # Hyper-V network driver
 HN_DEBUG	opt_hn.h
 
 # CAM-based MMC stack
 MMCCAM
 # Encrypted kernel crash dumps
 EKCD		opt_ekcd.h
 
 # NVME options
 NVME_USE_NVD	opt_nvme.h
 
 # amdsbwd options
 AMDSBWD_DEBUG	opt_amdsbwd.h
 
 # gcov support
 GCOV		opt_global.h
 LINDEBUGFS
Index: head/sys/vm/uma.h
===================================================================
--- head/sys/vm/uma.h	(revision 350658)
+++ head/sys/vm/uma.h	(revision 350659)
@@ -1,713 +1,714 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2002, 2003, 2004, 2005 Jeffrey Roberson <jeff@FreeBSD.org>
  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  *
  */
 
 /*
  * uma.h - External definitions for the Universal Memory Allocator
  *
 */
 
 #ifndef _VM_UMA_H_
 #define _VM_UMA_H_
 
 #include <sys/param.h>		/* For NULL */
 #include <sys/malloc.h>		/* For M_* */
 
 /* User visible parameters */
 #define UMA_SMALLEST_UNIT       (PAGE_SIZE / 256) /* Smallest item allocated */
 
 /* Types and type defs */
 
 struct uma_zone;
 /* Opaque type used as a handle to the zone */
 typedef struct uma_zone * uma_zone_t;
 
 void zone_drain(uma_zone_t);
 
 /*
  * Item constructor
  *
  * Arguments:
  *	item  A pointer to the memory which has been allocated.
  *	arg   The arg field passed to uma_zalloc_arg
  *	size  The size of the allocated item
  *	flags See zalloc flags
  *
  * Returns:
  *	0      on success
  *      errno  on failure
  *
  * Discussion:
  *	The constructor is called just before the memory is returned
  *	to the user. It may block if necessary.
  */
 typedef int (*uma_ctor)(void *mem, int size, void *arg, int flags);
 
 /*
  * Item destructor
  *
  * Arguments:
  *	item  A pointer to the memory which has been allocated.
  *	size  The size of the item being destructed.
  *	arg   Argument passed through uma_zfree_arg
  *
  * Returns:
  *	Nothing
  *
  * Discussion:
  *	The destructor may perform operations that differ from those performed
  *	by the initializer, but it must leave the object in the same state.
  *	This IS type stable storage.  This is called after EVERY zfree call.
  */
 typedef void (*uma_dtor)(void *mem, int size, void *arg);
 
 /*
  * Item initializer
  *
  * Arguments:
  *	item  A pointer to the memory which has been allocated.
  *	size  The size of the item being initialized.
  *	flags See zalloc flags
  *
  * Returns:
  *	0      on success
  *      errno  on failure
  *
  * Discussion:
  *	The initializer is called when the memory is cached in the uma zone.
  *	The initializer and the destructor should leave the object in the same
  *	state.
  */
 typedef int (*uma_init)(void *mem, int size, int flags);
 
 /*
  * Item discard function
  *
  * Arguments:
  *	item  A pointer to memory which has been 'freed' but has not left the
  *	      zone's cache.
  *	size  The size of the item being discarded.
  *
  * Returns:
  *	Nothing
  *
  * Discussion:
  *	This routine is called when memory leaves a zone and is returned to the
  *	system for other uses.  It is the counter-part to the init function.
  */
 typedef void (*uma_fini)(void *mem, int size);
 
 /*
  * Import new memory into a cache zone.
  */
 typedef int (*uma_import)(void *arg, void **store, int count, int domain,
     int flags);
 
 /*
  * Free memory from a cache zone.
  */
 typedef void (*uma_release)(void *arg, void **store, int count);
 
 /*
  * What's the difference between initializing and constructing?
  *
  * The item is initialized when it is cached, and this is the state that the
  * object should be in when returned to the allocator. The purpose of this is
  * to remove some code which would otherwise be called on each allocation by
  * utilizing a known, stable state.  This differs from the constructor which
  * will be called on EVERY allocation.
  *
  * For example, in the initializer you may want to initialize embedded locks,
  * NULL list pointers, set up initial states, magic numbers, etc.  This way if
  * the object is held in the allocator and re-used it won't be necessary to
  * re-initialize it.
  *
  * The constructor may be used to lock a data structure, link it on to lists,
  * bump reference counts or total counts of outstanding structures, etc.
  *
  */
 
 
 /* Function proto types */
 
 /*
  * Create a new uma zone
  *
  * Arguments:
  *	name  The text name of the zone for debugging and stats. This memory
  *		should not be freed until the zone has been deallocated.
  *	size  The size of the object that is being created.
  *	ctor  The constructor that is called when the object is allocated.
  *	dtor  The destructor that is called when the object is freed.
  *	init  An initializer that sets up the initial state of the memory.
  *	fini  A discard function that undoes initialization done by init.
  *		ctor/dtor/init/fini may all be null, see notes above.
  *	align A bitmask that corresponds to the requested alignment
  *		eg 4 would be 0x3
  *	flags A set of parameters that control the behavior of the zone.
  *
  * Returns:
  *	A pointer to a structure which is intended to be opaque to users of
  *	the interface.  The value may be null if the wait flag is not set.
  */
 uma_zone_t uma_zcreate(const char *name, size_t size, uma_ctor ctor,
 		    uma_dtor dtor, uma_init uminit, uma_fini fini,
 		    int align, uint32_t flags);
 
 /*
  * Create a secondary uma zone
  *
  * Arguments:
  *	name  The text name of the zone for debugging and stats. This memory
  *		should not be freed until the zone has been deallocated.
  *	ctor  The constructor that is called when the object is allocated.
  *	dtor  The destructor that is called when the object is freed.
  *	zinit  An initializer that sets up the initial state of the memory
  *		as the object passes from the Keg's slab to the Zone's cache.
  *	zfini  A discard function that undoes initialization done by init
  *		as the object passes from the Zone's cache to the Keg's slab.
  *
  *		ctor/dtor/zinit/zfini may all be null, see notes above.
  *		Note that the zinit and zfini specified here are NOT
  *		exactly the same as the init/fini specified to uma_zcreate()
  *		when creating a master zone.  These zinit/zfini are called
  *		on the TRANSITION from keg to zone (and vice-versa). Once
  *		these are set, the primary zone may alter its init/fini
  *		(which are called when the object passes from VM to keg)
  *		using uma_zone_set_init/fini()) as well as its own
  *		zinit/zfini (unset by default for master zone) with
  *		uma_zone_set_zinit/zfini() (note subtle 'z' prefix).
  *
  *	master  A reference to this zone's Master Zone (Primary Zone),
  *		which contains the backing Keg for the Secondary Zone
  *		being added.
  *
  * Returns:
  *	A pointer to a structure which is intended to be opaque to users of
  *	the interface.  The value may be null if the wait flag is not set.
  */
 uma_zone_t uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
 		    uma_init zinit, uma_fini zfini, uma_zone_t master);
 
 /*
  * Create cache-only zones.
  *
  * This allows uma's per-cpu cache facilities to handle arbitrary
  * pointers.  Consumers must specify the import and release functions to
  * fill and destroy caches.  UMA does not allocate any memory for these
  * zones.  The 'arg' parameter is passed to import/release and is caller
  * specific.
  */
 uma_zone_t uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor,
 		    uma_init zinit, uma_fini zfini, uma_import zimport,
 		    uma_release zrelease, void *arg, int flags);
 
 /*
  * Definitions for uma_zcreate flags
  *
  * These flags share space with UMA_ZFLAGs in uma_int.h.  Be careful not to
  * overlap when adding new features.  0xff000000 is in use by uma_int.h.
  */
 #define UMA_ZONE_PAGEABLE	0x0001	/* Return items not fully backed by
 					   physical memory XXX Not yet */
 #define UMA_ZONE_ZINIT		0x0002	/* Initialize with zeros */
 #define UMA_ZONE_STATIC		0x0004	/* Statically sized zone */
 #define UMA_ZONE_OFFPAGE	0x0008	/* Force the slab structure allocation
 					   off of the real memory */
 #define UMA_ZONE_MALLOC		0x0010	/* For use by malloc(9) only! */
 #define UMA_ZONE_NOFREE		0x0020	/* Do not free slabs of this type! */
 #define UMA_ZONE_MTXCLASS	0x0040	/* Create a new lock class */
 #define	UMA_ZONE_VM		0x0080	/*
 					 * Used for internal vm datastructures
 					 * only.
 					 */
 #define	UMA_ZONE_HASH		0x0100	/*
 					 * Use a hash table instead of caching
 					 * information in the vm_page.
 					 */
 #define	UMA_ZONE_SECONDARY	0x0200	/* Zone is a Secondary Zone */
 #define	UMA_ZONE_NOBUCKET	0x0400	/* Do not use buckets. */
 #define	UMA_ZONE_MAXBUCKET	0x0800	/* Use largest buckets. */
 #define	UMA_ZONE_CACHESPREAD	0x1000	/*
 					 * Spread memory start locations across
 					 * all possible cache lines.  May
 					 * require many virtually contiguous
 					 * backend pages and can fail early.
 					 */
 #define	UMA_ZONE_VTOSLAB	0x2000	/* Zone uses vtoslab for lookup. */
 #define	UMA_ZONE_NODUMP		0x4000	/*
 					 * Zone's pages will not be included in
 					 * mini-dumps.
 					 */
 #define	UMA_ZONE_PCPU		0x8000	/*
 					 * Allocates mp_maxid + 1 slabs of PAGE_SIZE
 					 */
 #define	UMA_ZONE_NUMA		0x10000	/*
 					 * NUMA aware Zone.  Implements a best
 					 * effort first-touch policy.
 					 */
 
 /*
  * These flags are shared between the keg and zone.  In zones wishing to add
  * new kegs these flags must be compatible.  Some are determined based on
  * physical parameters of the request and may not be provided by the consumer.
  */
 #define	UMA_ZONE_INHERIT						\
     (UMA_ZONE_OFFPAGE | UMA_ZONE_MALLOC | UMA_ZONE_NOFREE |		\
-    UMA_ZONE_HASH | UMA_ZONE_VTOSLAB | UMA_ZONE_PCPU)
+    UMA_ZONE_HASH | UMA_ZONE_VTOSLAB | UMA_ZONE_PCPU | UMA_ZONE_NUMA)
 
 /* Definitions for align */
 #define UMA_ALIGN_PTR	(sizeof(void *) - 1)	/* Alignment fit for ptr */
 #define UMA_ALIGN_LONG	(sizeof(long) - 1)	/* "" long */
 #define UMA_ALIGN_INT	(sizeof(int) - 1)	/* "" int */
 #define UMA_ALIGN_SHORT	(sizeof(short) - 1)	/* "" short */
 #define UMA_ALIGN_CHAR	(sizeof(char) - 1)	/* "" char */
 #define UMA_ALIGN_CACHE	(0 - 1)			/* Cache line size align */
 #define	UMA_ALIGNOF(type) (_Alignof(type) - 1)	/* Alignment fit for 'type' */
 
 /*
  * Destroys an empty uma zone.  If the zone is not empty uma complains loudly.
  *
  * Arguments:
  *	zone  The zone we want to destroy.
  *
  */
 void uma_zdestroy(uma_zone_t zone);
 
 /*
  * Allocates an item out of a zone
  *
  * Arguments:
  *	zone  The zone we are allocating from
  *	arg   This data is passed to the ctor function
  *	flags See sys/malloc.h for available flags.
  *
  * Returns:
  *	A non-null pointer to an initialized element from the zone is
  *	guaranteed if the wait flag is M_WAITOK.  Otherwise a null pointer
  *	may be returned if the zone is empty or the ctor failed.
  */
 
 void *uma_zalloc_arg(uma_zone_t zone, void *arg, int flags);
 void *uma_zalloc_pcpu_arg(uma_zone_t zone, void *arg, int flags);
 
 /*
  * Allocate an item from a specific NUMA domain.  This uses a slow path in
  * the allocator but is guaranteed to allocate memory from the requested
  * domain if M_WAITOK is set.
  *
  * Arguments:
  *	zone  The zone we are allocating from
  *	arg   This data is passed to the ctor function
  *	domain The domain to allocate from.
  *	flags See sys/malloc.h for available flags.
  */
 void *uma_zalloc_domain(uma_zone_t zone, void *arg, int domain, int flags);
 
 /*
  * Allocates an item out of a zone without supplying an argument
  *
  * This is just a wrapper for uma_zalloc_arg for convenience.
  *
  */
 static __inline void *uma_zalloc(uma_zone_t zone, int flags);
 static __inline void *uma_zalloc_pcpu(uma_zone_t zone, int flags);
 
 static __inline void *
 uma_zalloc(uma_zone_t zone, int flags)
 {
 	return uma_zalloc_arg(zone, NULL, flags);
 }
 
 static __inline void *
 uma_zalloc_pcpu(uma_zone_t zone, int flags)
 {
 	return uma_zalloc_pcpu_arg(zone, NULL, flags);
 }
 
 /*
  * Frees an item back into the specified zone.
  *
  * Arguments:
  *	zone  The zone the item was originally allocated out of.
  *	item  The memory to be freed.
  *	arg   Argument passed to the destructor
  *
  * Returns:
  *	Nothing.
  */
 
 void uma_zfree_arg(uma_zone_t zone, void *item, void *arg);
 void uma_zfree_pcpu_arg(uma_zone_t zone, void *item, void *arg);
 
 /*
  * Frees an item back to the specified zone's domain specific pool.
  *
  * Arguments:
  *	zone  The zone the item was originally allocated out of.
  *	item  The memory to be freed.
  *	arg   Argument passed to the destructor
  */
 void uma_zfree_domain(uma_zone_t zone, void *item, void *arg);
 
 /*
  * Frees an item back to a zone without supplying an argument
  *
  * This is just a wrapper for uma_zfree_arg for convenience.
  *
  */
 static __inline void uma_zfree(uma_zone_t zone, void *item);
 static __inline void uma_zfree_pcpu(uma_zone_t zone, void *item);
 
 static __inline void
 uma_zfree(uma_zone_t zone, void *item)
 {
 	uma_zfree_arg(zone, item, NULL);
 }
 
 static __inline void
 uma_zfree_pcpu(uma_zone_t zone, void *item)
 {
 	uma_zfree_pcpu_arg(zone, item, NULL);
 }
 
 /*
  * Wait until the specified zone can allocate an item.
  */
 void uma_zwait(uma_zone_t zone);
 
 /*
  * Backend page supplier routines
  *
  * Arguments:
  *	zone  The zone that is requesting pages.
  *	size  The number of bytes being requested.
  *	pflag Flags for these memory pages, see below.
  *	domain The NUMA domain that we prefer for this allocation.
  *	wait  Indicates our willingness to block.
  *
  * Returns:
  *	A pointer to the allocated memory or NULL on failure.
  */
 
 typedef void *(*uma_alloc)(uma_zone_t zone, vm_size_t size, int domain,
     uint8_t *pflag, int wait);
 
 /*
  * Backend page free routines
  *
  * Arguments:
  *	item  A pointer to the previously allocated pages.
  *	size  The original size of the allocation.
  *	pflag The flags for the slab.  See UMA_SLAB_* below.
  *
  * Returns:
  *	None
  */
 typedef void (*uma_free)(void *item, vm_size_t size, uint8_t pflag);
 
 /*
  * Reclaims unused memory for all zones
  *
  * Arguments:
  *	None
  * Returns:
  *	None
  *
  * This should only be called by the page out daemon.
  */
 
 void uma_reclaim(void);
 
 /*
  * Sets the alignment mask to be used for all zones requesting cache
  * alignment.  Should be called by MD boot code prior to starting VM/UMA.
  *
  * Arguments:
  *	align The alignment mask
  *
  * Returns:
  *	Nothing
  */
 void uma_set_align(int align);
 
 /*
  * Set a reserved number of items to hold for M_USE_RESERVE allocations.  All
  * other requests must allocate new backing pages.
  */
 void uma_zone_reserve(uma_zone_t zone, int nitems);
 
 /*
  * Reserves the maximum KVA space required by the zone and configures the zone
  * to use a VM_ALLOC_NOOBJ-based backend allocator.
  *
  * Arguments:
  *	zone  The zone to update.
  *	nitems  The upper limit on the number of items that can be allocated.
  *
  * Returns:
  *	0  if KVA space can not be allocated
  *	1  if successful
  *
  * Discussion:
  *	When the machine supports a direct map and the zone's items are smaller
  *	than a page, the zone will use the direct map instead of allocating KVA
  *	space.
  */
 int uma_zone_reserve_kva(uma_zone_t zone, int nitems);
 
 /*
  * Sets a high limit on the number of items allowed in a zone
  *
  * Arguments:
  *	zone  The zone to limit
  *	nitems  The requested upper limit on the number of items allowed
  *
  * Returns:
  *	int  The effective value of nitems after rounding up based on page size
  */
 int uma_zone_set_max(uma_zone_t zone, int nitems);
 
 /*
  * Sets a high limit on the number of items allowed in zone's bucket cache
  *
  * Arguments:
  *      zone  The zone to limit
  *      nitems  The requested upper limit on the number of items allowed
  *
  * Returns:
  *      int  The effective value of nitems set
  */
 int uma_zone_set_maxcache(uma_zone_t zone, int nitems);
 
 /*
  * Obtains the effective limit on the number of items in a zone
  *
  * Arguments:
  *	zone  The zone to obtain the effective limit from
  *
  * Return:
  *	0  No limit
  *	int  The effective limit of the zone
  */
 int uma_zone_get_max(uma_zone_t zone);
 
 /*
  * Sets a warning to be printed when limit is reached
  *
  * Arguments:
  *	zone  The zone we will warn about
  *	warning  Warning content
  *
  * Returns:
  *	Nothing
  */
 void uma_zone_set_warning(uma_zone_t zone, const char *warning);
 
 /*
  * Sets a function to run when limit is reached
  *
  * Arguments:
  *	zone  The zone to which this applies
  *	fx  The function ro run
  *
  * Returns:
  *	Nothing
  */
 typedef void (*uma_maxaction_t)(uma_zone_t, int);
 void uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t);
 
 /*
  * Obtains the approximate current number of items allocated from a zone
  *
  * Arguments:
  *	zone  The zone to obtain the current allocation count from
  *
  * Return:
  *	int  The approximate current number of items allocated from the zone
  */
 int uma_zone_get_cur(uma_zone_t zone);
 
 /*
  * The following two routines (uma_zone_set_init/fini)
  * are used to set the backend init/fini pair which acts on an
  * object as it becomes allocated and is placed in a slab within
  * the specified zone's backing keg.  These should probably not
  * be changed once allocations have already begun, but only be set
  * immediately upon zone creation.
  */
 void uma_zone_set_init(uma_zone_t zone, uma_init uminit);
 void uma_zone_set_fini(uma_zone_t zone, uma_fini fini);
 
 /*
  * The following two routines (uma_zone_set_zinit/zfini) are
  * used to set the zinit/zfini pair which acts on an object as
  * it passes from the backing Keg's slab cache to the
  * specified Zone's bucket cache.  These should probably not
  * be changed once allocations have already begun, but only be set
  * immediately upon zone creation.
  */
 void uma_zone_set_zinit(uma_zone_t zone, uma_init zinit);
 void uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini);
 
 /*
  * Replaces the standard backend allocator for this zone.
  *
  * Arguments:
  *	zone   The zone whose backend allocator is being changed.
  *	allocf A pointer to the allocation function
  *
  * Returns:
  *	Nothing
  *
  * Discussion:
  *	This could be used to implement pageable allocation, or perhaps
  *	even DMA allocators if used in conjunction with the OFFPAGE
  *	zone flag.
  */
 
 void uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf);
 
 /*
  * Used for freeing memory provided by the allocf above
  *
  * Arguments:
  *	zone  The zone that intends to use this free routine.
  *	freef The page freeing routine.
  *
  * Returns:
  *	Nothing
  */
 
 void uma_zone_set_freef(uma_zone_t zone, uma_free freef);
 
 /*
  * These flags are setable in the allocf and visible in the freef.
  */
 #define UMA_SLAB_BOOT	0x01		/* Slab alloced from boot pages */
 #define UMA_SLAB_KERNEL	0x04		/* Slab alloced from kmem */
 #define UMA_SLAB_PRIV	0x08		/* Slab alloced from priv allocator */
 #define UMA_SLAB_OFFP	0x10		/* Slab is managed separately  */
 #define UMA_SLAB_MALLOC	0x20		/* Slab is a large malloc slab */
 /* 0x02, 0x40, and 0x80 are available */
 
 /*
  * Used to pre-fill a zone with some number of items
  *
  * Arguments:
  *	zone    The zone to fill
  *	itemcnt The number of items to reserve
  *
  * Returns:
  *	Nothing
  *
  * NOTE: This is blocking and should only be done at startup
  */
 void uma_prealloc(uma_zone_t zone, int itemcnt);
 
 /*
  * Used to determine if a fixed-size zone is exhausted.
  *
  * Arguments:
  *	zone    The zone to check
  *
  * Returns:
  *	Non-zero if zone is exhausted.
  */
 int uma_zone_exhausted(uma_zone_t zone);
 int uma_zone_exhausted_nolock(uma_zone_t zone);
 
 /*
  * Common UMA_ZONE_PCPU zones.
  */
 extern uma_zone_t pcpu_zone_64;
 extern uma_zone_t pcpu_zone_ptr;
 
 /*
  * Exported statistics structures to be used by user space monitoring tools.
  * Statistics stream consists of a uma_stream_header, followed by a series of
  * alternative uma_type_header and uma_type_stat structures.
  */
 #define	UMA_STREAM_VERSION	0x00000001
 struct uma_stream_header {
 	uint32_t	ush_version;	/* Stream format version. */
 	uint32_t	ush_maxcpus;	/* Value of MAXCPU for stream. */
 	uint32_t	ush_count;	/* Number of records. */
 	uint32_t	_ush_pad;	/* Pad/reserved field. */
 };
 
 #define	UTH_MAX_NAME	32
 #define	UTH_ZONE_SECONDARY	0x00000001
 struct uma_type_header {
 	/*
 	 * Static per-zone data, some extracted from the supporting keg.
 	 */
 	char		uth_name[UTH_MAX_NAME];
 	uint32_t	uth_align;	/* Keg: alignment. */
 	uint32_t	uth_size;	/* Keg: requested size of item. */
 	uint32_t	uth_rsize;	/* Keg: real size of item. */
 	uint32_t	uth_maxpages;	/* Keg: maximum number of pages. */
 	uint32_t	uth_limit;	/* Keg: max items to allocate. */
 
 	/*
 	 * Current dynamic zone/keg-derived statistics.
 	 */
 	uint32_t	uth_pages;	/* Keg: pages allocated. */
 	uint32_t	uth_keg_free;	/* Keg: items free. */
 	uint32_t	uth_zone_free;	/* Zone: items free. */
 	uint32_t	uth_bucketsize;	/* Zone: desired bucket size. */
 	uint32_t	uth_zone_flags;	/* Zone: flags. */
 	uint64_t	uth_allocs;	/* Zone: number of allocations. */
 	uint64_t	uth_frees;	/* Zone: number of frees. */
 	uint64_t	uth_fails;	/* Zone: number of alloc failures. */
 	uint64_t	uth_sleeps;	/* Zone: number of alloc sleeps. */
-	uint64_t	_uth_reserved1[2];	/* Reserved. */
+	uint64_t	uth_xdomain;	/* Zone: Number of cross domain frees. */
+	uint64_t	_uth_reserved1[1];	/* Reserved. */
 };
 
 struct uma_percpu_stat {
 	uint64_t	ups_allocs;	/* Cache: number of allocations. */
 	uint64_t	ups_frees;	/* Cache: number of frees. */
 	uint64_t	ups_cache_free;	/* Cache: free items in cache. */
 	uint64_t	_ups_reserved[5];	/* Reserved. */
 };
 
 void uma_reclaim_wakeup(void);
 void uma_reclaim_worker(void *);
 
 unsigned long uma_limit(void);
 
 /* Return the amount of memory managed by UMA. */
 unsigned long uma_size(void);
 
 /* Return the amount of memory remaining.  May be negative. */
 long uma_avail(void);
 
 #endif	/* _VM_UMA_H_ */
Index: head/sys/vm/uma_core.c
===================================================================
--- head/sys/vm/uma_core.c	(revision 350658)
+++ head/sys/vm/uma_core.c	(revision 350659)
@@ -1,4232 +1,4328 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2002-2005, 2009, 2013 Jeffrey Roberson <jeff@FreeBSD.org>
  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
  * Copyright (c) 2004-2006 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
  * uma_core.c  Implementation of the Universal Memory allocator
  *
  * This allocator is intended to replace the multitude of similar object caches
  * in the standard FreeBSD kernel.  The intent is to be flexible as well as
  * efficient.  A primary design goal is to return unused memory to the rest of
  * the system.  This will make the system as a whole more flexible due to the
  * ability to move memory to subsystems which most need it instead of leaving
  * pools of reserved memory unused.
  *
  * The basic ideas stem from similar slab/zone based allocators whose algorithms
  * are well known.
  *
  */
 
 /*
  * TODO:
  *	- Improve memory usage for large allocations
  *	- Investigate cache size adjustments
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_param.h"
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bitset.h>
 #include <sys/domainset.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/types.h>
 #include <sys/limits.h>
 #include <sys/queue.h>
 #include <sys/malloc.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/sysctl.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/random.h>
 #include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/taskqueue.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_domainset.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_param.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_pagequeue.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 #include <vm/uma_int.h>
 #include <vm/uma_dbg.h>
 
 #include <ddb/ddb.h>
 
 #ifdef DEBUG_MEMGUARD
 #include <vm/memguard.h>
 #endif
 
 /*
  * This is the zone and keg from which all zones are spawned.
  */
 static uma_zone_t kegs;
 static uma_zone_t zones;
 
 /* This is the zone from which all offpage uma_slab_ts are allocated. */
 static uma_zone_t slabzone;
 
 /*
  * The initial hash tables come out of this zone so they can be allocated
  * prior to malloc coming up.
  */
 static uma_zone_t hashzone;
 
 /* The boot-time adjusted value for cache line alignment. */
 int uma_align_cache = 64 - 1;
 
 static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
 
 /*
  * Are we allowed to allocate buckets?
  */
 static int bucketdisable = 1;
 
 /* Linked list of all kegs in the system */
 static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);
 
 /* Linked list of all cache-only zones in the system */
 static LIST_HEAD(,uma_zone) uma_cachezones =
     LIST_HEAD_INITIALIZER(uma_cachezones);
 
 /* This RW lock protects the keg list */
 static struct rwlock_padalign __exclusive_cache_line uma_rwlock;
 
 /*
  * Pointer and counter to pool of pages, that is preallocated at
  * startup to bootstrap UMA.
  */
 static char *bootmem;
 static int boot_pages;
 
 static struct sx uma_drain_lock;
 
 /*
  * kmem soft limit, initialized by uma_set_limit().  Ensure that early
  * allocations don't trigger a wakeup of the reclaim thread.
  */
 static unsigned long uma_kmem_limit = LONG_MAX;
 SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_limit, CTLFLAG_RD, &uma_kmem_limit, 0,
     "UMA kernel memory soft limit");
 static unsigned long uma_kmem_total;
 SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_total, CTLFLAG_RD, &uma_kmem_total, 0,
     "UMA kernel memory usage");
 
 /* Is the VM done starting up? */
 static enum { BOOT_COLD = 0, BOOT_STRAPPED, BOOT_PAGEALLOC, BOOT_BUCKETS,
     BOOT_RUNNING } booted = BOOT_COLD;
 
 /*
  * This is the handle used to schedule events that need to happen
  * outside of the allocation fast path.
  */
 static struct callout uma_callout;
 #define	UMA_TIMEOUT	20		/* Seconds for callout interval. */
 
 /*
  * This structure is passed as the zone ctor arg so that I don't have to create
  * a special allocation function just for zones.
  */
 struct uma_zctor_args {
 	const char *name;
 	size_t size;
 	uma_ctor ctor;
 	uma_dtor dtor;
 	uma_init uminit;
 	uma_fini fini;
 	uma_import import;
 	uma_release release;
 	void *arg;
 	uma_keg_t keg;
 	int align;
 	uint32_t flags;
 };
 
 struct uma_kctor_args {
 	uma_zone_t zone;
 	size_t size;
 	uma_init uminit;
 	uma_fini fini;
 	int align;
 	uint32_t flags;
 };
 
 struct uma_bucket_zone {
 	uma_zone_t	ubz_zone;
 	char		*ubz_name;
 	int		ubz_entries;	/* Number of items it can hold. */
 	int		ubz_maxsize;	/* Maximum allocation size per-item. */
 };
 
 /*
  * Compute the actual number of bucket entries to pack them in power
  * of two sizes for more efficient space utilization.
  */
 #define	BUCKET_SIZE(n)						\
     (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *))
 
 #define	BUCKET_MAX	BUCKET_SIZE(256)
 
 struct uma_bucket_zone bucket_zones[] = {
 	{ NULL, "4 Bucket", BUCKET_SIZE(4), 4096 },
 	{ NULL, "6 Bucket", BUCKET_SIZE(6), 3072 },
 	{ NULL, "8 Bucket", BUCKET_SIZE(8), 2048 },
 	{ NULL, "12 Bucket", BUCKET_SIZE(12), 1536 },
 	{ NULL, "16 Bucket", BUCKET_SIZE(16), 1024 },
 	{ NULL, "32 Bucket", BUCKET_SIZE(32), 512 },
 	{ NULL, "64 Bucket", BUCKET_SIZE(64), 256 },
 	{ NULL, "128 Bucket", BUCKET_SIZE(128), 128 },
 	{ NULL, "256 Bucket", BUCKET_SIZE(256), 64 },
 	{ NULL, NULL, 0}
 };
 
 /*
  * Flags and enumerations to be passed to internal functions.
  */
 enum zfreeskip {
 	SKIP_NONE =	0,
 	SKIP_CNT =	0x00000001,
 	SKIP_DTOR =	0x00010000,
 	SKIP_FINI =	0x00020000,
 };
 
 #define	UMA_ANYDOMAIN	-1	/* Special value for domain search. */
 
 /* Prototypes.. */
 
 int	uma_startup_count(int);
 void	uma_startup(void *, int);
 void	uma_startup1(void);
 void	uma_startup2(void);
 
 static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
 static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
 static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
 static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
 static void page_free(void *, vm_size_t, uint8_t);
 static void pcpu_page_free(void *, vm_size_t, uint8_t);
 static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int, int);
 static void cache_drain(uma_zone_t);
 static void bucket_drain(uma_zone_t, uma_bucket_t);
 static void bucket_cache_drain(uma_zone_t zone);
 static int keg_ctor(void *, int, void *, int);
 static void keg_dtor(void *, int, void *);
 static int zone_ctor(void *, int, void *, int);
 static void zone_dtor(void *, int, void *);
 static int zero_init(void *, int, int);
 static void keg_small_init(uma_keg_t keg);
 static void keg_large_init(uma_keg_t keg);
 static void zone_foreach(void (*zfunc)(uma_zone_t));
 static void zone_timeout(uma_zone_t zone);
 static int hash_alloc(struct uma_hash *, u_int);
 static int hash_expand(struct uma_hash *, struct uma_hash *);
 static void hash_free(struct uma_hash *hash);
 static void uma_timeout(void *);
 static void uma_startup3(void);
 static void *zone_alloc_item(uma_zone_t, void *, int, int);
 static void *zone_alloc_item_locked(uma_zone_t, void *, int, int);
 static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
 static void bucket_enable(void);
 static void bucket_init(void);
 static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int);
 static void bucket_free(uma_zone_t zone, uma_bucket_t, void *);
 static void bucket_zone_drain(void);
 static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int, int);
 static uma_slab_t zone_fetch_slab(uma_zone_t, uma_keg_t, int, int);
 static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab);
 static void slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item);
 static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
     uma_fini fini, int align, uint32_t flags);
 static int zone_import(uma_zone_t, void **, int, int, int);
 static void zone_release(uma_zone_t, void **, int);
 static void uma_zero_item(void *, uma_zone_t);
 
 void uma_print_zone(uma_zone_t);
 void uma_print_stats(void);
 static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
 static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
 
 #ifdef INVARIANTS
 static bool uma_dbg_kskip(uma_keg_t keg, void *mem);
 static bool uma_dbg_zskip(uma_zone_t zone, void *mem);
 static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item);
 static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item);
 
 static SYSCTL_NODE(_vm, OID_AUTO, debug, CTLFLAG_RD, 0,
     "Memory allocation debugging");
 
 static u_int dbg_divisor = 1;
 SYSCTL_UINT(_vm_debug, OID_AUTO, divisor,
     CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &dbg_divisor, 0,
     "Debug & thrash every this item in memory allocator");
 
 static counter_u64_t uma_dbg_cnt = EARLY_COUNTER;
 static counter_u64_t uma_skip_cnt = EARLY_COUNTER;
 SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, trashed, CTLFLAG_RD,
     &uma_dbg_cnt, "memory items debugged");
 SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, skipped, CTLFLAG_RD,
     &uma_skip_cnt, "memory items skipped, not debugged");
 #endif
 
 SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
 
 SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT,
     0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");
 
 SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
     0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
 
 static int zone_warnings = 1;
 SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0,
     "Warn when UMA zones becomes full");
 
 /* Adjust bytes under management by UMA. */
 static inline void
 uma_total_dec(unsigned long size)
 {
 
 	atomic_subtract_long(&uma_kmem_total, size);
 }
 
 static inline void
 uma_total_inc(unsigned long size)
 {
 
 	if (atomic_fetchadd_long(&uma_kmem_total, size) > uma_kmem_limit)
 		uma_reclaim_wakeup();
 }
 
 /*
  * This routine checks to see whether or not it's safe to enable buckets.
  */
 static void
 bucket_enable(void)
 {
 	bucketdisable = vm_page_count_min();
 }
 
 /*
  * Initialize bucket_zones, the array of zones of buckets of various sizes.
  *
  * For each zone, calculate the memory required for each bucket, consisting
  * of the header and an array of pointers.
  */
 static void
 bucket_init(void)
 {
 	struct uma_bucket_zone *ubz;
 	int size;
 
 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) {
 		size = roundup(sizeof(struct uma_bucket), sizeof(void *));
 		size += sizeof(void *) * ubz->ubz_entries;
 		ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
 		    UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET | UMA_ZONE_NUMA);
 	}
 }
 
 /*
  * Given a desired number of entries for a bucket, return the zone from which
  * to allocate the bucket.
  */
 static struct uma_bucket_zone *
 bucket_zone_lookup(int entries)
 {
 	struct uma_bucket_zone *ubz;
 
 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
 		if (ubz->ubz_entries >= entries)
 			return (ubz);
 	ubz--;
 	return (ubz);
 }
 
 static int
 bucket_select(int size)
 {
 	struct uma_bucket_zone *ubz;
 
 	ubz = &bucket_zones[0];
 	if (size > ubz->ubz_maxsize)
 		return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1);
 
 	for (; ubz->ubz_entries != 0; ubz++)
 		if (ubz->ubz_maxsize < size)
 			break;
 	ubz--;
 	return (ubz->ubz_entries);
 }
 
 static uma_bucket_t
 bucket_alloc(uma_zone_t zone, void *udata, int flags)
 {
 	struct uma_bucket_zone *ubz;
 	uma_bucket_t bucket;
 
 	/*
 	 * This is to stop us from allocating per cpu buckets while we're
 	 * running out of vm.boot_pages.  Otherwise, we would exhaust the
 	 * boot pages.  This also prevents us from allocating buckets in
 	 * low memory situations.
 	 */
 	if (bucketdisable)
 		return (NULL);
 	/*
 	 * To limit bucket recursion we store the original zone flags
 	 * in a cookie passed via zalloc_arg/zfree_arg.  This allows the
 	 * NOVM flag to persist even through deep recursions.  We also
 	 * store ZFLAG_BUCKET once we have recursed attempting to allocate
 	 * a bucket for a bucket zone so we do not allow infinite bucket
 	 * recursion.  This cookie will even persist to frees of unused
 	 * buckets via the allocation path or bucket allocations in the
 	 * free path.
 	 */
 	if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
 		udata = (void *)(uintptr_t)zone->uz_flags;
 	else {
 		if ((uintptr_t)udata & UMA_ZFLAG_BUCKET)
 			return (NULL);
 		udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET);
 	}
 	if ((uintptr_t)udata & UMA_ZFLAG_CACHEONLY)
 		flags |= M_NOVM;
 	ubz = bucket_zone_lookup(zone->uz_count);
 	if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0)
 		ubz++;
 	bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags);
 	if (bucket) {
 #ifdef INVARIANTS
 		bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
 #endif
 		bucket->ub_cnt = 0;
 		bucket->ub_entries = ubz->ubz_entries;
 	}
 
 	return (bucket);
 }
 
 static void
 bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata)
 {
 	struct uma_bucket_zone *ubz;
 
 	KASSERT(bucket->ub_cnt == 0,
 	    ("bucket_free: Freeing a non free bucket."));
 	if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
 		udata = (void *)(uintptr_t)zone->uz_flags;
 	ubz = bucket_zone_lookup(bucket->ub_entries);
 	uma_zfree_arg(ubz->ubz_zone, bucket, udata);
 }
 
 static void
 bucket_zone_drain(void)
 {
 	struct uma_bucket_zone *ubz;
 
 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
 		zone_drain(ubz->ubz_zone);
 }
 
 static uma_bucket_t
 zone_try_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom, const bool ws)
 {
 	uma_bucket_t bucket;
 
 	ZONE_LOCK_ASSERT(zone);
 
 	if ((bucket = LIST_FIRST(&zdom->uzd_buckets)) != NULL) {
 		MPASS(zdom->uzd_nitems >= bucket->ub_cnt);
 		LIST_REMOVE(bucket, ub_link);
 		zdom->uzd_nitems -= bucket->ub_cnt;
 		if (ws && zdom->uzd_imin > zdom->uzd_nitems)
 			zdom->uzd_imin = zdom->uzd_nitems;
 		zone->uz_bkt_count -= bucket->ub_cnt;
 	}
 	return (bucket);
 }
 
 static void
 zone_put_bucket(uma_zone_t zone, uma_zone_domain_t zdom, uma_bucket_t bucket,
     const bool ws)
 {
 
 	ZONE_LOCK_ASSERT(zone);
 	KASSERT(zone->uz_bkt_count < zone->uz_bkt_max, ("%s: zone %p overflow",
 	    __func__, zone));
 
 	LIST_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link);
 	zdom->uzd_nitems += bucket->ub_cnt;
 	if (ws && zdom->uzd_imax < zdom->uzd_nitems)
 		zdom->uzd_imax = zdom->uzd_nitems;
 	zone->uz_bkt_count += bucket->ub_cnt;
 }
 
 static void
 zone_log_warning(uma_zone_t zone)
 {
 	static const struct timeval warninterval = { 300, 0 };
 
 	if (!zone_warnings || zone->uz_warning == NULL)
 		return;
 
 	if (ratecheck(&zone->uz_ratecheck, &warninterval))
 		printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning);
 }
 
 static inline void
 zone_maxaction(uma_zone_t zone)
 {
 
 	if (zone->uz_maxaction.ta_func != NULL)
 		taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction);
 }
 
 /*
  * Routine called by timeout which is used to fire off some time interval
  * based calculations.  (stats, hash size, etc.)
  *
  * Arguments:
  *	arg   Unused
  *
  * Returns:
  *	Nothing
  */
 static void
 uma_timeout(void *unused)
 {
 	bucket_enable();
 	zone_foreach(zone_timeout);
 
 	/* Reschedule this event */
 	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
 }
 
 /*
  * Update the working set size estimate for the zone's bucket cache.
  * The constants chosen here are somewhat arbitrary.  With an update period of
  * 20s (UMA_TIMEOUT), this estimate is dominated by zone activity over the
  * last 100s.
  */
 static void
 zone_domain_update_wss(uma_zone_domain_t zdom)
 {
 	long wss;
 
 	MPASS(zdom->uzd_imax >= zdom->uzd_imin);
 	wss = zdom->uzd_imax - zdom->uzd_imin;
 	zdom->uzd_imax = zdom->uzd_imin = zdom->uzd_nitems;
 	zdom->uzd_wss = (3 * wss + 2 * zdom->uzd_wss) / 5;
 }
 
 /*
  * Routine to perform timeout driven calculations.  This expands the
  * hashes and does per cpu statistics aggregation.
  *
  *  Returns nothing.
  */
 static void
 zone_timeout(uma_zone_t zone)
 {
 	uma_keg_t keg = zone->uz_keg;
 	u_int slabs;
 
 	KEG_LOCK(keg);
 	/*
 	 * Expand the keg hash table.
 	 *
 	 * This is done if the number of slabs is larger than the hash size.
 	 * What I'm trying to do here is completely reduce collisions.  This
 	 * may be a little aggressive.  Should I allow for two collisions max?
 	 */
 	if (keg->uk_flags & UMA_ZONE_HASH &&
 	    (slabs = keg->uk_pages / keg->uk_ppera) >
 	     keg->uk_hash.uh_hashsize) {
 		struct uma_hash newhash;
 		struct uma_hash oldhash;
 		int ret;
 
 		/*
 		 * This is so involved because allocating and freeing
 		 * while the keg lock is held will lead to deadlock.
 		 * I have to do everything in stages and check for
 		 * races.
 		 */
 		KEG_UNLOCK(keg);
 		ret = hash_alloc(&newhash, 1 << fls(slabs));
 		KEG_LOCK(keg);
 		if (ret) {
 			if (hash_expand(&keg->uk_hash, &newhash)) {
 				oldhash = keg->uk_hash;
 				keg->uk_hash = newhash;
 			} else
 				oldhash = newhash;
 
 			KEG_UNLOCK(keg);
 			hash_free(&oldhash);
 			return;
 		}
 	}
 
 	for (int i = 0; i < vm_ndomains; i++)
 		zone_domain_update_wss(&zone->uz_domain[i]);
 
 	KEG_UNLOCK(keg);
 }
 
 /*
  * Allocate and zero fill the next sized hash table from the appropriate
  * backing store.
  *
  * Arguments:
  *	hash  A new hash structure with the old hash size in uh_hashsize
  *
  * Returns:
  *	1 on success and 0 on failure.
  */
 static int
 hash_alloc(struct uma_hash *hash, u_int size)
 {
 	size_t alloc;
 
 	KASSERT(powerof2(size), ("hash size must be power of 2"));
 	if (size > UMA_HASH_SIZE_INIT)  {
 		hash->uh_hashsize = size;
 		alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
 		hash->uh_slab_hash = (struct slabhead *)malloc(alloc,
 		    M_UMAHASH, M_NOWAIT);
 	} else {
 		alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
 		hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
 		    UMA_ANYDOMAIN, M_WAITOK);
 		hash->uh_hashsize = UMA_HASH_SIZE_INIT;
 	}
 	if (hash->uh_slab_hash) {
 		bzero(hash->uh_slab_hash, alloc);
 		hash->uh_hashmask = hash->uh_hashsize - 1;
 		return (1);
 	}
 
 	return (0);
 }
 
 /*
  * Expands the hash table for HASH zones.  This is done from zone_timeout
  * to reduce collisions.  This must not be done in the regular allocation
  * path, otherwise, we can recurse on the vm while allocating pages.
  *
  * Arguments:
  *	oldhash  The hash you want to expand
  *	newhash  The hash structure for the new table
  *
  * Returns:
  *	Nothing
  *
  * Discussion:
  */
 static int
 hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
 {
 	uma_slab_t slab;
 	u_int hval;
 	u_int idx;
 
 	if (!newhash->uh_slab_hash)
 		return (0);
 
 	if (oldhash->uh_hashsize >= newhash->uh_hashsize)
 		return (0);
 
 	/*
 	 * I need to investigate hash algorithms for resizing without a
 	 * full rehash.
 	 */
 
 	for (idx = 0; idx < oldhash->uh_hashsize; idx++)
 		while (!SLIST_EMPTY(&oldhash->uh_slab_hash[idx])) {
 			slab = SLIST_FIRST(&oldhash->uh_slab_hash[idx]);
 			SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[idx], us_hlink);
 			hval = UMA_HASH(newhash, slab->us_data);
 			SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
 			    slab, us_hlink);
 		}
 
 	return (1);
 }
 
 /*
  * Free the hash bucket to the appropriate backing store.
  *
  * Arguments:
  *	slab_hash  The hash bucket we're freeing
  *	hashsize   The number of entries in that hash bucket
  *
  * Returns:
  *	Nothing
  */
 static void
 hash_free(struct uma_hash *hash)
 {
 	if (hash->uh_slab_hash == NULL)
 		return;
 	if (hash->uh_hashsize == UMA_HASH_SIZE_INIT)
 		zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE);
 	else
 		free(hash->uh_slab_hash, M_UMAHASH);
 }
 
 /*
  * Frees all outstanding items in a bucket
  *
  * Arguments:
  *	zone   The zone to free to, must be unlocked.
  *	bucket The free/alloc bucket with items, cpu queue must be locked.
  *
  * Returns:
  *	Nothing
  */
 
 static void
 bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
 {
 	int i;
 
 	if (bucket == NULL)
 		return;
 
 	if (zone->uz_fini)
 		for (i = 0; i < bucket->ub_cnt; i++) 
 			zone->uz_fini(bucket->ub_bucket[i], zone->uz_size);
 	zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt);
 	if (zone->uz_max_items > 0) {
 		ZONE_LOCK(zone);
 		zone->uz_items -= bucket->ub_cnt;
 		if (zone->uz_sleepers && zone->uz_items < zone->uz_max_items)
 			wakeup_one(zone);
 		ZONE_UNLOCK(zone);
 	}
 	bucket->ub_cnt = 0;
 }
 
 /*
  * Drains the per cpu caches for a zone.
  *
  * NOTE: This may only be called while the zone is being turn down, and not
  * during normal operation.  This is necessary in order that we do not have
  * to migrate CPUs to drain the per-CPU caches.
  *
  * Arguments:
  *	zone     The zone to drain, must be unlocked.
  *
  * Returns:
  *	Nothing
  */
 static void
 cache_drain(uma_zone_t zone)
 {
 	uma_cache_t cache;
 	int cpu;
 
 	/*
 	 * XXX: It is safe to not lock the per-CPU caches, because we're
 	 * tearing down the zone anyway.  I.e., there will be no further use
 	 * of the caches at this point.
 	 *
 	 * XXX: It would good to be able to assert that the zone is being
 	 * torn down to prevent improper use of cache_drain().
 	 *
 	 * XXX: We lock the zone before passing into bucket_cache_drain() as
 	 * it is used elsewhere.  Should the tear-down path be made special
 	 * there in some form?
 	 */
 	CPU_FOREACH(cpu) {
 		cache = &zone->uz_cpu[cpu];
 		bucket_drain(zone, cache->uc_allocbucket);
-		bucket_drain(zone, cache->uc_freebucket);
 		if (cache->uc_allocbucket != NULL)
 			bucket_free(zone, cache->uc_allocbucket, NULL);
+		cache->uc_allocbucket = NULL;
+		bucket_drain(zone, cache->uc_freebucket);
 		if (cache->uc_freebucket != NULL)
 			bucket_free(zone, cache->uc_freebucket, NULL);
-		cache->uc_allocbucket = cache->uc_freebucket = NULL;
+		cache->uc_freebucket = NULL;
+		bucket_drain(zone, cache->uc_crossbucket);
+		if (cache->uc_crossbucket != NULL)
+			bucket_free(zone, cache->uc_crossbucket, NULL);
+		cache->uc_crossbucket = NULL;
 	}
 	ZONE_LOCK(zone);
 	bucket_cache_drain(zone);
 	ZONE_UNLOCK(zone);
 }
 
 static void
 cache_shrink(uma_zone_t zone)
 {
 
 	if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
 		return;
 
 	ZONE_LOCK(zone);
 	zone->uz_count = (zone->uz_count_min + zone->uz_count) / 2;
 	ZONE_UNLOCK(zone);
 }
 
 static void
 cache_drain_safe_cpu(uma_zone_t zone)
 {
 	uma_cache_t cache;
-	uma_bucket_t b1, b2;
+	uma_bucket_t b1, b2, b3;
 	int domain;
 
 	if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
 		return;
 
-	b1 = b2 = NULL;
+	b1 = b2 = b3 = NULL;
 	ZONE_LOCK(zone);
 	critical_enter();
 	if (zone->uz_flags & UMA_ZONE_NUMA)
 		domain = PCPU_GET(domain);
 	else
 		domain = 0;
 	cache = &zone->uz_cpu[curcpu];
 	if (cache->uc_allocbucket) {
 		if (cache->uc_allocbucket->ub_cnt != 0)
 			zone_put_bucket(zone, &zone->uz_domain[domain],
 			    cache->uc_allocbucket, false);
 		else
 			b1 = cache->uc_allocbucket;
 		cache->uc_allocbucket = NULL;
 	}
 	if (cache->uc_freebucket) {
 		if (cache->uc_freebucket->ub_cnt != 0)
 			zone_put_bucket(zone, &zone->uz_domain[domain],
 			    cache->uc_freebucket, false);
 		else
 			b2 = cache->uc_freebucket;
 		cache->uc_freebucket = NULL;
 	}
+	b3 = cache->uc_crossbucket;
+	cache->uc_crossbucket = NULL;
 	critical_exit();
 	ZONE_UNLOCK(zone);
 	if (b1)
 		bucket_free(zone, b1, NULL);
 	if (b2)
 		bucket_free(zone, b2, NULL);
+	if (b3) {
+		bucket_drain(zone, b3);
+		bucket_free(zone, b3, NULL);
+	}
 }
 
 /*
  * Safely drain per-CPU caches of a zone(s) to alloc bucket.
  * This is an expensive call because it needs to bind to all CPUs
  * one by one and enter a critical section on each of them in order
  * to safely access their cache buckets.
  * Zone lock must not be held on call this function.
  */
 static void
 cache_drain_safe(uma_zone_t zone)
 {
 	int cpu;
 
 	/*
 	 * Polite bucket sizes shrinking was not enouth, shrink aggressively.
 	 */
 	if (zone)
 		cache_shrink(zone);
 	else
 		zone_foreach(cache_shrink);
 
 	CPU_FOREACH(cpu) {
 		thread_lock(curthread);
 		sched_bind(curthread, cpu);
 		thread_unlock(curthread);
 
 		if (zone)
 			cache_drain_safe_cpu(zone);
 		else
 			zone_foreach(cache_drain_safe_cpu);
 	}
 	thread_lock(curthread);
 	sched_unbind(curthread);
 	thread_unlock(curthread);
 }
 
 /*
  * Drain the cached buckets from a zone.  Expects a locked zone on entry.
  */
 static void
 bucket_cache_drain(uma_zone_t zone)
 {
 	uma_zone_domain_t zdom;
 	uma_bucket_t bucket;
 	int i;
 
 	/*
 	 * Drain the bucket queues and free the buckets.
 	 */
 	for (i = 0; i < vm_ndomains; i++) {
 		zdom = &zone->uz_domain[i];
 		while ((bucket = zone_try_fetch_bucket(zone, zdom, false)) !=
 		    NULL) {
 			ZONE_UNLOCK(zone);
 			bucket_drain(zone, bucket);
 			bucket_free(zone, bucket, NULL);
 			ZONE_LOCK(zone);
 		}
 	}
 
 	/*
 	 * Shrink further bucket sizes.  Price of single zone lock collision
 	 * is probably lower then price of global cache drain.
 	 */
 	if (zone->uz_count > zone->uz_count_min)
 		zone->uz_count--;
 }
 
 static void
 keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start)
 {
 	uint8_t *mem;
 	int i;
 	uint8_t flags;
 
 	CTR4(KTR_UMA, "keg_free_slab keg %s(%p) slab %p, returning %d bytes",
 	    keg->uk_name, keg, slab, PAGE_SIZE * keg->uk_ppera);
 
 	mem = slab->us_data;
 	flags = slab->us_flags;
 	i = start;
 	if (keg->uk_fini != NULL) {
 		for (i--; i > -1; i--)
 #ifdef INVARIANTS
 		/*
 		 * trash_fini implies that dtor was trash_dtor. trash_fini
 		 * would check that memory hasn't been modified since free,
 		 * which executed trash_dtor.
 		 * That's why we need to run uma_dbg_kskip() check here,
 		 * albeit we don't make skip check for other init/fini
 		 * invocations.
 		 */
 		if (!uma_dbg_kskip(keg, slab->us_data + (keg->uk_rsize * i)) ||
 		    keg->uk_fini != trash_fini)
 #endif
 			keg->uk_fini(slab->us_data + (keg->uk_rsize * i),
 			    keg->uk_size);
 	}
 	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
 		zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
 	keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera, flags);
 	uma_total_dec(PAGE_SIZE * keg->uk_ppera);
 }
 
 /*
  * Frees pages from a keg back to the system.  This is done on demand from
  * the pageout daemon.
  *
  * Returns nothing.
  */
 static void
 keg_drain(uma_keg_t keg)
 {
 	struct slabhead freeslabs = { 0 };
 	uma_domain_t dom;
 	uma_slab_t slab, tmp;
 	int i;
 
 	/*
 	 * We don't want to take pages from statically allocated kegs at this
 	 * time
 	 */
 	if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL)
 		return;
 
 	CTR3(KTR_UMA, "keg_drain %s(%p) free items: %u",
 	    keg->uk_name, keg, keg->uk_free);
 	KEG_LOCK(keg);
 	if (keg->uk_free == 0)
 		goto finished;
 
 	for (i = 0; i < vm_ndomains; i++) {
 		dom = &keg->uk_domain[i];
 		LIST_FOREACH_SAFE(slab, &dom->ud_free_slab, us_link, tmp) {
 			/* We have nowhere to free these to. */
 			if (slab->us_flags & UMA_SLAB_BOOT)
 				continue;
 
 			LIST_REMOVE(slab, us_link);
 			keg->uk_pages -= keg->uk_ppera;
 			keg->uk_free -= keg->uk_ipers;
 
 			if (keg->uk_flags & UMA_ZONE_HASH)
 				UMA_HASH_REMOVE(&keg->uk_hash, slab,
 				    slab->us_data);
 
 			SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
 		}
 	}
 
 finished:
 	KEG_UNLOCK(keg);
 
 	while ((slab = SLIST_FIRST(&freeslabs)) != NULL) {
 		SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink);
 		keg_free_slab(keg, slab, keg->uk_ipers);
 	}
 }
 
 static void
 zone_drain_wait(uma_zone_t zone, int waitok)
 {
 
 	/*
 	 * Set draining to interlock with zone_dtor() so we can release our
 	 * locks as we go.  Only dtor() should do a WAITOK call since it
 	 * is the only call that knows the structure will still be available
 	 * when it wakes up.
 	 */
 	ZONE_LOCK(zone);
 	while (zone->uz_flags & UMA_ZFLAG_DRAINING) {
 		if (waitok == M_NOWAIT)
 			goto out;
 		msleep(zone, zone->uz_lockptr, PVM, "zonedrain", 1);
 	}
 	zone->uz_flags |= UMA_ZFLAG_DRAINING;
 	bucket_cache_drain(zone);
 	ZONE_UNLOCK(zone);
 	/*
 	 * The DRAINING flag protects us from being freed while
 	 * we're running.  Normally the uma_rwlock would protect us but we
 	 * must be able to release and acquire the right lock for each keg.
 	 */
 	keg_drain(zone->uz_keg);
 	ZONE_LOCK(zone);
 	zone->uz_flags &= ~UMA_ZFLAG_DRAINING;
 	wakeup(zone);
 out:
 	ZONE_UNLOCK(zone);
 }
 
 void
 zone_drain(uma_zone_t zone)
 {
 
 	zone_drain_wait(zone, M_NOWAIT);
 }
 
 /*
  * Allocate a new slab for a keg.  This does not insert the slab onto a list.
  * If the allocation was successful, the keg lock will be held upon return,
  * otherwise the keg will be left unlocked.
  *
  * Arguments:
  *	flags   Wait flags for the item initialization routine
  *	aflags  Wait flags for the slab allocation
  *
  * Returns:
  *	The slab that was allocated or NULL if there is no memory and the
  *	caller specified M_NOWAIT.
  */
 static uma_slab_t
 keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int flags,
     int aflags)
 {
 	uma_alloc allocf;
 	uma_slab_t slab;
 	unsigned long size;
 	uint8_t *mem;
 	uint8_t sflags;
 	int i;
 
 	KASSERT(domain >= 0 && domain < vm_ndomains,
 	    ("keg_alloc_slab: domain %d out of range", domain));
 	KEG_LOCK_ASSERT(keg);
 	MPASS(zone->uz_lockptr == &keg->uk_lock);
 
 	allocf = keg->uk_allocf;
 	KEG_UNLOCK(keg);
 
 	slab = NULL;
 	mem = NULL;
 	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
 		slab = zone_alloc_item(keg->uk_slabzone, NULL, domain, aflags);
 		if (slab == NULL)
 			goto out;
 	}
 
 	/*
 	 * This reproduces the old vm_zone behavior of zero filling pages the
 	 * first time they are added to a zone.
 	 *
 	 * Malloced items are zeroed in uma_zalloc.
 	 */
 
 	if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
 		aflags |= M_ZERO;
 	else
 		aflags &= ~M_ZERO;
 
 	if (keg->uk_flags & UMA_ZONE_NODUMP)
 		aflags |= M_NODUMP;
 
 	/* zone is passed for legacy reasons. */
 	size = keg->uk_ppera * PAGE_SIZE;
 	mem = allocf(zone, size, domain, &sflags, aflags);
 	if (mem == NULL) {
 		if (keg->uk_flags & UMA_ZONE_OFFPAGE)
 			zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
 		slab = NULL;
 		goto out;
 	}
 	uma_total_inc(size);
 
 	/* Point the slab into the allocated memory */
 	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE))
 		slab = (uma_slab_t )(mem + keg->uk_pgoff);
 
 	if (keg->uk_flags & UMA_ZONE_VTOSLAB)
 		for (i = 0; i < keg->uk_ppera; i++)
 			vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab);
 
 	slab->us_keg = keg;
 	slab->us_data = mem;
 	slab->us_freecount = keg->uk_ipers;
 	slab->us_flags = sflags;
 	slab->us_domain = domain;
 	BIT_FILL(SLAB_SETSIZE, &slab->us_free);
 #ifdef INVARIANTS
 	BIT_ZERO(SLAB_SETSIZE, &slab->us_debugfree);
 #endif
 
 	if (keg->uk_init != NULL) {
 		for (i = 0; i < keg->uk_ipers; i++)
 			if (keg->uk_init(slab->us_data + (keg->uk_rsize * i),
 			    keg->uk_size, flags) != 0)
 				break;
 		if (i != keg->uk_ipers) {
 			keg_free_slab(keg, slab, i);
 			slab = NULL;
 			goto out;
 		}
 	}
 	KEG_LOCK(keg);
 
 	CTR3(KTR_UMA, "keg_alloc_slab: allocated slab %p for %s(%p)",
 	    slab, keg->uk_name, keg);
 
 	if (keg->uk_flags & UMA_ZONE_HASH)
 		UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
 
 	keg->uk_pages += keg->uk_ppera;
 	keg->uk_free += keg->uk_ipers;
 
 out:
 	return (slab);
 }
 
 /*
  * This function is intended to be used early on in place of page_alloc() so
  * that we may use the boot time page cache to satisfy allocations before
  * the VM is ready.
  */
 static void *
 startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
     int wait)
 {
 	uma_keg_t keg;
 	void *mem;
 	int pages;
 
 	keg = zone->uz_keg;
 	/*
 	 * If we are in BOOT_BUCKETS or higher, than switch to real
 	 * allocator.  Zones with page sized slabs switch at BOOT_PAGEALLOC.
 	 */
 	switch (booted) {
 		case BOOT_COLD:
 		case BOOT_STRAPPED:
 			break;
 		case BOOT_PAGEALLOC:
 			if (keg->uk_ppera > 1)
 				break;
 		case BOOT_BUCKETS:
 		case BOOT_RUNNING:
 #ifdef UMA_MD_SMALL_ALLOC
 			keg->uk_allocf = (keg->uk_ppera > 1) ?
 			    page_alloc : uma_small_alloc;
 #else
 			keg->uk_allocf = page_alloc;
 #endif
 			return keg->uk_allocf(zone, bytes, domain, pflag, wait);
 	}
 
 	/*
 	 * Check our small startup cache to see if it has pages remaining.
 	 */
 	pages = howmany(bytes, PAGE_SIZE);
 	KASSERT(pages > 0, ("%s can't reserve 0 pages", __func__));
 	if (pages > boot_pages)
 		panic("UMA zone \"%s\": Increase vm.boot_pages", zone->uz_name);
 #ifdef DIAGNOSTIC
 	printf("%s from \"%s\", %d boot pages left\n", __func__, zone->uz_name,
 	    boot_pages);
 #endif
 	mem = bootmem;
 	boot_pages -= pages;
 	bootmem += pages * PAGE_SIZE;
 	*pflag = UMA_SLAB_BOOT;
 
 	return (mem);
 }
 
 /*
  * Allocates a number of pages from the system
  *
  * Arguments:
  *	bytes  The number of bytes requested
  *	wait  Shall we wait?
  *
  * Returns:
  *	A pointer to the alloced memory or possibly
  *	NULL if M_NOWAIT is set.
  */
 static void *
 page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
     int wait)
 {
 	void *p;	/* Returned page */
 
 	*pflag = UMA_SLAB_KERNEL;
 	p = (void *)kmem_malloc_domainset(DOMAINSET_FIXED(domain), bytes, wait);
 
 	return (p);
 }
 
 static void *
 pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
     int wait)
 {
 	struct pglist alloctail;
 	vm_offset_t addr, zkva;
 	int cpu, flags;
 	vm_page_t p, p_next;
 #ifdef NUMA
 	struct pcpu *pc;
 #endif
 
 	MPASS(bytes == (mp_maxid + 1) * PAGE_SIZE);
 
 	TAILQ_INIT(&alloctail);
 	flags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
 	    malloc2vm_flags(wait);
 	*pflag = UMA_SLAB_KERNEL;
 	for (cpu = 0; cpu <= mp_maxid; cpu++) {
 		if (CPU_ABSENT(cpu)) {
 			p = vm_page_alloc(NULL, 0, flags);
 		} else {
 #ifndef NUMA
 			p = vm_page_alloc(NULL, 0, flags);
 #else
 			pc = pcpu_find(cpu);
 			p = vm_page_alloc_domain(NULL, 0, pc->pc_domain, flags);
 			if (__predict_false(p == NULL))
 				p = vm_page_alloc(NULL, 0, flags);
 #endif
 		}
 		if (__predict_false(p == NULL))
 			goto fail;
 		TAILQ_INSERT_TAIL(&alloctail, p, listq);
 	}
 	if ((addr = kva_alloc(bytes)) == 0)
 		goto fail;
 	zkva = addr;
 	TAILQ_FOREACH(p, &alloctail, listq) {
 		pmap_qenter(zkva, &p, 1);
 		zkva += PAGE_SIZE;
 	}
 	return ((void*)addr);
 fail:
 	TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
 		vm_page_unwire_noq(p);
 		vm_page_free(p);
 	}
 	return (NULL);
 }
 
 /*
  * Allocates a number of pages from within an object
  *
  * Arguments:
  *	bytes  The number of bytes requested
  *	wait   Shall we wait?
  *
  * Returns:
  *	A pointer to the alloced memory or possibly
  *	NULL if M_NOWAIT is set.
  */
 static void *
 noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
     int wait)
 {
 	TAILQ_HEAD(, vm_page) alloctail;
 	u_long npages;
 	vm_offset_t retkva, zkva;
 	vm_page_t p, p_next;
 	uma_keg_t keg;
 
 	TAILQ_INIT(&alloctail);
 	keg = zone->uz_keg;
 
 	npages = howmany(bytes, PAGE_SIZE);
 	while (npages > 0) {
 		p = vm_page_alloc_domain(NULL, 0, domain, VM_ALLOC_INTERRUPT |
 		    VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
 		    ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK :
 		    VM_ALLOC_NOWAIT));
 		if (p != NULL) {
 			/*
 			 * Since the page does not belong to an object, its
 			 * listq is unused.
 			 */
 			TAILQ_INSERT_TAIL(&alloctail, p, listq);
 			npages--;
 			continue;
 		}
 		/*
 		 * Page allocation failed, free intermediate pages and
 		 * exit.
 		 */
 		TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
 			vm_page_unwire_noq(p);
 			vm_page_free(p); 
 		}
 		return (NULL);
 	}
 	*flags = UMA_SLAB_PRIV;
 	zkva = keg->uk_kva +
 	    atomic_fetchadd_long(&keg->uk_offset, round_page(bytes));
 	retkva = zkva;
 	TAILQ_FOREACH(p, &alloctail, listq) {
 		pmap_qenter(zkva, &p, 1);
 		zkva += PAGE_SIZE;
 	}
 
 	return ((void *)retkva);
 }
 
 /*
  * Frees a number of pages to the system
  *
  * Arguments:
  *	mem   A pointer to the memory to be freed
  *	size  The size of the memory being freed
  *	flags The original p->us_flags field
  *
  * Returns:
  *	Nothing
  */
 static void
 page_free(void *mem, vm_size_t size, uint8_t flags)
 {
 
 	if ((flags & UMA_SLAB_KERNEL) == 0)
 		panic("UMA: page_free used with invalid flags %x", flags);
 
 	kmem_free((vm_offset_t)mem, size);
 }
 
 /*
  * Frees pcpu zone allocations
  *
  * Arguments:
  *	mem   A pointer to the memory to be freed
  *	size  The size of the memory being freed
  *	flags The original p->us_flags field
  *
  * Returns:
  *	Nothing
  */
 static void
 pcpu_page_free(void *mem, vm_size_t size, uint8_t flags)
 {
 	vm_offset_t sva, curva;
 	vm_paddr_t paddr;
 	vm_page_t m;
 
 	MPASS(size == (mp_maxid+1)*PAGE_SIZE);
 	sva = (vm_offset_t)mem;
 	for (curva = sva; curva < sva + size; curva += PAGE_SIZE) {
 		paddr = pmap_kextract(curva);
 		m = PHYS_TO_VM_PAGE(paddr);
 		vm_page_unwire_noq(m);
 		vm_page_free(m);
 	}
 	pmap_qremove(sva, size >> PAGE_SHIFT);
 	kva_free(sva, size);
 }
 
 
 /*
  * Zero fill initializer
  *
  * Arguments/Returns follow uma_init specifications
  */
 static int
 zero_init(void *mem, int size, int flags)
 {
 	bzero(mem, size);
 	return (0);
 }
 
 /*
  * Finish creating a small uma keg.  This calculates ipers, and the keg size.
  *
  * Arguments
  *	keg  The zone we should initialize
  *
  * Returns
  *	Nothing
  */
 static void
 keg_small_init(uma_keg_t keg)
 {
 	u_int rsize;
 	u_int memused;
 	u_int wastedspace;
 	u_int shsize;
 	u_int slabsize;
 
 	if (keg->uk_flags & UMA_ZONE_PCPU) {
 		u_int ncpus = (mp_maxid + 1) ? (mp_maxid + 1) : MAXCPU;
 
 		slabsize = UMA_PCPU_ALLOC_SIZE;
 		keg->uk_ppera = ncpus;
 	} else {
 		slabsize = UMA_SLAB_SIZE;
 		keg->uk_ppera = 1;
 	}
 
 	/*
 	 * Calculate the size of each allocation (rsize) according to
 	 * alignment.  If the requested size is smaller than we have
 	 * allocation bits for we round it up.
 	 */
 	rsize = keg->uk_size;
 	if (rsize < slabsize / SLAB_SETSIZE)
 		rsize = slabsize / SLAB_SETSIZE;
 	if (rsize & keg->uk_align)
 		rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1);
 	keg->uk_rsize = rsize;
 
 	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 ||
 	    keg->uk_rsize < UMA_PCPU_ALLOC_SIZE,
 	    ("%s: size %u too large", __func__, keg->uk_rsize));
 
 	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
 		shsize = 0;
 	else 
 		shsize = SIZEOF_UMA_SLAB;
 
 	if (rsize <= slabsize - shsize)
 		keg->uk_ipers = (slabsize - shsize) / rsize;
 	else {
 		/* Handle special case when we have 1 item per slab, so
 		 * alignment requirement can be relaxed. */
 		KASSERT(keg->uk_size <= slabsize - shsize,
 		    ("%s: size %u greater than slab", __func__, keg->uk_size));
 		keg->uk_ipers = 1;
 	}
 	KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
 	    ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
 
 	memused = keg->uk_ipers * rsize + shsize;
 	wastedspace = slabsize - memused;
 
 	/*
 	 * We can't do OFFPAGE if we're internal or if we've been
 	 * asked to not go to the VM for buckets.  If we do this we
 	 * may end up going to the VM  for slabs which we do not
 	 * want to do if we're UMA_ZFLAG_CACHEONLY as a result
 	 * of UMA_ZONE_VM, which clearly forbids it.
 	 */
 	if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) ||
 	    (keg->uk_flags & UMA_ZFLAG_CACHEONLY))
 		return;
 
 	/*
 	 * See if using an OFFPAGE slab will limit our waste.  Only do
 	 * this if it permits more items per-slab.
 	 *
 	 * XXX We could try growing slabsize to limit max waste as well.
 	 * Historically this was not done because the VM could not
 	 * efficiently handle contiguous allocations.
 	 */
 	if ((wastedspace >= slabsize / UMA_MAX_WASTE) &&
 	    (keg->uk_ipers < (slabsize / keg->uk_rsize))) {
 		keg->uk_ipers = slabsize / keg->uk_rsize;
 		KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
 		    ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
 		CTR6(KTR_UMA, "UMA decided we need offpage slab headers for "
 		    "keg: %s(%p), calculated wastedspace = %d, "
 		    "maximum wasted space allowed = %d, "
 		    "calculated ipers = %d, "
 		    "new wasted space = %d\n", keg->uk_name, keg, wastedspace,
 		    slabsize / UMA_MAX_WASTE, keg->uk_ipers,
 		    slabsize - keg->uk_ipers * keg->uk_rsize);
 		keg->uk_flags |= UMA_ZONE_OFFPAGE;
 	}
 
 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
 	    (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
 		keg->uk_flags |= UMA_ZONE_HASH;
 }
 
 /*
  * Finish creating a large (> UMA_SLAB_SIZE) uma kegs.  Just give in and do
  * OFFPAGE for now.  When I can allow for more dynamic slab sizes this will be
  * more complicated.
  *
  * Arguments
  *	keg  The keg we should initialize
  *
  * Returns
  *	Nothing
  */
 static void
 keg_large_init(uma_keg_t keg)
 {
 
 	KASSERT(keg != NULL, ("Keg is null in keg_large_init"));
 	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
 	    ("%s: Cannot large-init a UMA_ZONE_PCPU keg", __func__));
 
 	keg->uk_ppera = howmany(keg->uk_size, PAGE_SIZE);
 	keg->uk_ipers = 1;
 	keg->uk_rsize = keg->uk_size;
 
 	/* Check whether we have enough space to not do OFFPAGE. */
 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0 &&
 	    PAGE_SIZE * keg->uk_ppera - keg->uk_rsize < SIZEOF_UMA_SLAB) {
 		/*
 		 * We can't do OFFPAGE if we're internal, in which case
 		 * we need an extra page per allocation to contain the
 		 * slab header.
 		 */
 		if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) == 0)
 			keg->uk_flags |= UMA_ZONE_OFFPAGE;
 		else
 			keg->uk_ppera++;
 	}
 
 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
 	    (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
 		keg->uk_flags |= UMA_ZONE_HASH;
 }
 
 static void
 keg_cachespread_init(uma_keg_t keg)
 {
 	int alignsize;
 	int trailer;
 	int pages;
 	int rsize;
 
 	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
 	    ("%s: Cannot cachespread-init a UMA_ZONE_PCPU keg", __func__));
 
 	alignsize = keg->uk_align + 1;
 	rsize = keg->uk_size;
 	/*
 	 * We want one item to start on every align boundary in a page.  To
 	 * do this we will span pages.  We will also extend the item by the
 	 * size of align if it is an even multiple of align.  Otherwise, it
 	 * would fall on the same boundary every time.
 	 */
 	if (rsize & keg->uk_align)
 		rsize = (rsize & ~keg->uk_align) + alignsize;
 	if ((rsize & alignsize) == 0)
 		rsize += alignsize;
 	trailer = rsize - keg->uk_size;
 	pages = (rsize * (PAGE_SIZE / alignsize)) / PAGE_SIZE;
 	pages = MIN(pages, (128 * 1024) / PAGE_SIZE);
 	keg->uk_rsize = rsize;
 	keg->uk_ppera = pages;
 	keg->uk_ipers = ((pages * PAGE_SIZE) + trailer) / rsize;
 	keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB;
 	KASSERT(keg->uk_ipers <= SLAB_SETSIZE,
 	    ("%s: keg->uk_ipers too high(%d) increase max_ipers", __func__,
 	    keg->uk_ipers));
 }
 
 /*
  * Keg header ctor.  This initializes all fields, locks, etc.  And inserts
  * the keg onto the global keg list.
  *
  * Arguments/Returns follow uma_ctor specifications
  *	udata  Actually uma_kctor_args
  */
 static int
 keg_ctor(void *mem, int size, void *udata, int flags)
 {
 	struct uma_kctor_args *arg = udata;
 	uma_keg_t keg = mem;
 	uma_zone_t zone;
 
 	bzero(keg, size);
 	keg->uk_size = arg->size;
 	keg->uk_init = arg->uminit;
 	keg->uk_fini = arg->fini;
 	keg->uk_align = arg->align;
 	keg->uk_free = 0;
 	keg->uk_reserve = 0;
 	keg->uk_pages = 0;
 	keg->uk_flags = arg->flags;
 	keg->uk_slabzone = NULL;
 
 	/*
 	 * We use a global round-robin policy by default.  Zones with
 	 * UMA_ZONE_NUMA set will use first-touch instead, in which case the
 	 * iterator is never run.
 	 */
 	keg->uk_dr.dr_policy = DOMAINSET_RR();
 	keg->uk_dr.dr_iter = 0;
 
 	/*
 	 * The master zone is passed to us at keg-creation time.
 	 */
 	zone = arg->zone;
 	keg->uk_name = zone->uz_name;
 
 	if (arg->flags & UMA_ZONE_VM)
 		keg->uk_flags |= UMA_ZFLAG_CACHEONLY;
 
 	if (arg->flags & UMA_ZONE_ZINIT)
 		keg->uk_init = zero_init;
 
 	if (arg->flags & UMA_ZONE_MALLOC)
 		keg->uk_flags |= UMA_ZONE_VTOSLAB;
 
 	if (arg->flags & UMA_ZONE_PCPU)
 #ifdef SMP
 		keg->uk_flags |= UMA_ZONE_OFFPAGE;
 #else
 		keg->uk_flags &= ~UMA_ZONE_PCPU;
 #endif
 
 	if (keg->uk_flags & UMA_ZONE_CACHESPREAD) {
 		keg_cachespread_init(keg);
 	} else {
 		if (keg->uk_size > UMA_SLAB_SPACE)
 			keg_large_init(keg);
 		else
 			keg_small_init(keg);
 	}
 
 	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
 		keg->uk_slabzone = slabzone;
 
 	/*
 	 * If we haven't booted yet we need allocations to go through the
 	 * startup cache until the vm is ready.
 	 */
 	if (booted < BOOT_PAGEALLOC)
 		keg->uk_allocf = startup_alloc;
 #ifdef UMA_MD_SMALL_ALLOC
 	else if (keg->uk_ppera == 1)
 		keg->uk_allocf = uma_small_alloc;
 #endif
 	else if (keg->uk_flags & UMA_ZONE_PCPU)
 		keg->uk_allocf = pcpu_page_alloc;
 	else
 		keg->uk_allocf = page_alloc;
 #ifdef UMA_MD_SMALL_ALLOC
 	if (keg->uk_ppera == 1)
 		keg->uk_freef = uma_small_free;
 	else
 #endif
 	if (keg->uk_flags & UMA_ZONE_PCPU)
 		keg->uk_freef = pcpu_page_free;
 	else
 		keg->uk_freef = page_free;
 
 	/*
 	 * Initialize keg's lock
 	 */
 	KEG_LOCK_INIT(keg, (arg->flags & UMA_ZONE_MTXCLASS));
 
 	/*
 	 * If we're putting the slab header in the actual page we need to
 	 * figure out where in each page it goes.  See SIZEOF_UMA_SLAB
 	 * macro definition.
 	 */
 	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) {
 		keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - SIZEOF_UMA_SLAB;
 		/*
 		 * The only way the following is possible is if with our
 		 * UMA_ALIGN_PTR adjustments we are now bigger than
 		 * UMA_SLAB_SIZE.  I haven't checked whether this is
 		 * mathematically possible for all cases, so we make
 		 * sure here anyway.
 		 */
 		KASSERT(keg->uk_pgoff + sizeof(struct uma_slab) <=
 		    PAGE_SIZE * keg->uk_ppera,
 		    ("zone %s ipers %d rsize %d size %d slab won't fit",
 		    zone->uz_name, keg->uk_ipers, keg->uk_rsize, keg->uk_size));
 	}
 
 	if (keg->uk_flags & UMA_ZONE_HASH)
 		hash_alloc(&keg->uk_hash, 0);
 
 	CTR5(KTR_UMA, "keg_ctor %p zone %s(%p) out %d free %d\n",
 	    keg, zone->uz_name, zone,
 	    (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free,
 	    keg->uk_free);
 
 	LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
 
 	rw_wlock(&uma_rwlock);
 	LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
 	rw_wunlock(&uma_rwlock);
 	return (0);
 }
 
 static void
 zone_alloc_counters(uma_zone_t zone)
 {
 
 	zone->uz_allocs = counter_u64_alloc(M_WAITOK);
 	zone->uz_frees = counter_u64_alloc(M_WAITOK);
 	zone->uz_fails = counter_u64_alloc(M_WAITOK);
 }
 
 /*
  * Zone header ctor.  This initializes all fields, locks, etc.
  *
  * Arguments/Returns follow uma_ctor specifications
  *	udata  Actually uma_zctor_args
  */
 static int
 zone_ctor(void *mem, int size, void *udata, int flags)
 {
 	struct uma_zctor_args *arg = udata;
 	uma_zone_t zone = mem;
 	uma_zone_t z;
 	uma_keg_t keg;
 
 	bzero(zone, size);
 	zone->uz_name = arg->name;
 	zone->uz_ctor = arg->ctor;
 	zone->uz_dtor = arg->dtor;
 	zone->uz_init = NULL;
 	zone->uz_fini = NULL;
 	zone->uz_sleeps = 0;
+	zone->uz_xdomain = 0;
 	zone->uz_count = 0;
 	zone->uz_count_min = 0;
 	zone->uz_count_max = BUCKET_MAX;
 	zone->uz_flags = 0;
 	zone->uz_warning = NULL;
 	/* The domain structures follow the cpu structures. */
 	zone->uz_domain = (struct uma_zone_domain *)&zone->uz_cpu[mp_ncpus];
 	zone->uz_bkt_max = ULONG_MAX;
 	timevalclear(&zone->uz_ratecheck);
 
 	if (__predict_true(booted == BOOT_RUNNING))
 		zone_alloc_counters(zone);
 	else {
 		zone->uz_allocs = EARLY_COUNTER;
 		zone->uz_frees = EARLY_COUNTER;
 		zone->uz_fails = EARLY_COUNTER;
 	}
 
 	/*
 	 * This is a pure cache zone, no kegs.
 	 */
 	if (arg->import) {
 		if (arg->flags & UMA_ZONE_VM)
 			arg->flags |= UMA_ZFLAG_CACHEONLY;
 		zone->uz_flags = arg->flags;
 		zone->uz_size = arg->size;
 		zone->uz_import = arg->import;
 		zone->uz_release = arg->release;
 		zone->uz_arg = arg->arg;
 		zone->uz_lockptr = &zone->uz_lock;
 		ZONE_LOCK_INIT(zone, (arg->flags & UMA_ZONE_MTXCLASS));
 		rw_wlock(&uma_rwlock);
 		LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link);
 		rw_wunlock(&uma_rwlock);
 		goto out;
 	}
 
 	/*
 	 * Use the regular zone/keg/slab allocator.
 	 */
 	zone->uz_import = (uma_import)zone_import;
 	zone->uz_release = (uma_release)zone_release;
 	zone->uz_arg = zone; 
 	keg = arg->keg;
 
 	if (arg->flags & UMA_ZONE_SECONDARY) {
 		KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
 		zone->uz_init = arg->uminit;
 		zone->uz_fini = arg->fini;
 		zone->uz_lockptr = &keg->uk_lock;
 		zone->uz_flags |= UMA_ZONE_SECONDARY;
 		rw_wlock(&uma_rwlock);
 		ZONE_LOCK(zone);
 		LIST_FOREACH(z, &keg->uk_zones, uz_link) {
 			if (LIST_NEXT(z, uz_link) == NULL) {
 				LIST_INSERT_AFTER(z, zone, uz_link);
 				break;
 			}
 		}
 		ZONE_UNLOCK(zone);
 		rw_wunlock(&uma_rwlock);
 	} else if (keg == NULL) {
 		if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
 		    arg->align, arg->flags)) == NULL)
 			return (ENOMEM);
 	} else {
 		struct uma_kctor_args karg;
 		int error;
 
 		/* We should only be here from uma_startup() */
 		karg.size = arg->size;
 		karg.uminit = arg->uminit;
 		karg.fini = arg->fini;
 		karg.align = arg->align;
 		karg.flags = arg->flags;
 		karg.zone = zone;
 		error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg,
 		    flags);
 		if (error)
 			return (error);
 	}
 
 	zone->uz_keg = keg;
 	zone->uz_size = keg->uk_size;
 	zone->uz_flags |= (keg->uk_flags &
 	    (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT));
 
 	/*
 	 * Some internal zones don't have room allocated for the per cpu
 	 * caches.  If we're internal, bail out here.
 	 */
 	if (keg->uk_flags & UMA_ZFLAG_INTERNAL) {
 		KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0,
 		    ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
 		return (0);
 	}
 
 out:
 	KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) !=
 	    (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET),
 	    ("Invalid zone flag combination"));
 	if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0)
 		zone->uz_count = BUCKET_MAX;
 	else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0)
 		zone->uz_count = 0;
 	else
 		zone->uz_count = bucket_select(zone->uz_size);
 	zone->uz_count_min = zone->uz_count;
 
 	return (0);
 }
 
 /*
  * Keg header dtor.  This frees all data, destroys locks, frees the hash
  * table and removes the keg from the global list.
  *
  * Arguments/Returns follow uma_dtor specifications
  *	udata  unused
  */
 static void
 keg_dtor(void *arg, int size, void *udata)
 {
 	uma_keg_t keg;
 
 	keg = (uma_keg_t)arg;
 	KEG_LOCK(keg);
 	if (keg->uk_free != 0) {
 		printf("Freed UMA keg (%s) was not empty (%d items). "
 		    " Lost %d pages of memory.\n",
 		    keg->uk_name ? keg->uk_name : "",
 		    keg->uk_free, keg->uk_pages);
 	}
 	KEG_UNLOCK(keg);
 
 	hash_free(&keg->uk_hash);
 
 	KEG_LOCK_FINI(keg);
 }
 
 /*
  * Zone header dtor.
  *
  * Arguments/Returns follow uma_dtor specifications
  *	udata  unused
  */
 static void
 zone_dtor(void *arg, int size, void *udata)
 {
 	uma_zone_t zone;
 	uma_keg_t keg;
 
 	zone = (uma_zone_t)arg;
 
 	if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
 		cache_drain(zone);
 
 	rw_wlock(&uma_rwlock);
 	LIST_REMOVE(zone, uz_link);
 	rw_wunlock(&uma_rwlock);
 	/*
 	 * XXX there are some races here where
 	 * the zone can be drained but zone lock
 	 * released and then refilled before we
 	 * remove it... we dont care for now
 	 */
 	zone_drain_wait(zone, M_WAITOK);
 	/*
 	 * We only destroy kegs from non secondary/non cache zones.
 	 */
 	if ((zone->uz_flags & (UMA_ZONE_SECONDARY | UMA_ZFLAG_CACHE)) == 0) {
 		keg = zone->uz_keg;
 		rw_wlock(&uma_rwlock);
 		LIST_REMOVE(keg, uk_link);
 		rw_wunlock(&uma_rwlock);
 		zone_free_item(kegs, keg, NULL, SKIP_NONE);
 	}
 	counter_u64_free(zone->uz_allocs);
 	counter_u64_free(zone->uz_frees);
 	counter_u64_free(zone->uz_fails);
 	if (zone->uz_lockptr == &zone->uz_lock)
 		ZONE_LOCK_FINI(zone);
 }
 
 /*
  * Traverses every zone in the system and calls a callback
  *
  * Arguments:
  *	zfunc  A pointer to a function which accepts a zone
  *		as an argument.
  *
  * Returns:
  *	Nothing
  */
 static void
 zone_foreach(void (*zfunc)(uma_zone_t))
 {
 	uma_keg_t keg;
 	uma_zone_t zone;
 
 	/*
 	 * Before BOOT_RUNNING we are guaranteed to be single
 	 * threaded, so locking isn't needed. Startup functions
 	 * are allowed to use M_WAITOK.
 	 */
 	if (__predict_true(booted == BOOT_RUNNING))
 		rw_rlock(&uma_rwlock);
 	LIST_FOREACH(keg, &uma_kegs, uk_link) {
 		LIST_FOREACH(zone, &keg->uk_zones, uz_link)
 			zfunc(zone);
 	}
 	if (__predict_true(booted == BOOT_RUNNING))
 		rw_runlock(&uma_rwlock);
 }
 
 /*
  * Count how many pages do we need to bootstrap.  VM supplies
  * its need in early zones in the argument, we add up our zones,
  * which consist of: UMA Slabs, UMA Hash and 9 Bucket zones. The
  * zone of zones and zone of kegs are accounted separately.
  */
 #define	UMA_BOOT_ZONES	11
 /* Zone of zones and zone of kegs have arbitrary alignment. */
 #define	UMA_BOOT_ALIGN	32
 static int zsize, ksize;
 int
 uma_startup_count(int vm_zones)
 {
 	int zones, pages;
 
 	ksize = sizeof(struct uma_keg) +
 	    (sizeof(struct uma_domain) * vm_ndomains);
 	zsize = sizeof(struct uma_zone) +
 	    (sizeof(struct uma_cache) * (mp_maxid + 1)) +
 	    (sizeof(struct uma_zone_domain) * vm_ndomains);
 
 	/*
 	 * Memory for the zone of kegs and its keg,
 	 * and for zone of zones.
 	 */
 	pages = howmany(roundup(zsize, CACHE_LINE_SIZE) * 2 +
 	    roundup(ksize, CACHE_LINE_SIZE), PAGE_SIZE);
 
 #ifdef	UMA_MD_SMALL_ALLOC
 	zones = UMA_BOOT_ZONES;
 #else
 	zones = UMA_BOOT_ZONES + vm_zones;
 	vm_zones = 0;
 #endif
 
 	/* Memory for the rest of startup zones, UMA and VM, ... */
 	if (zsize > UMA_SLAB_SPACE) {
 		/* See keg_large_init(). */
 		u_int ppera;
 
 		ppera = howmany(roundup2(zsize, UMA_BOOT_ALIGN), PAGE_SIZE);
 		if (PAGE_SIZE * ppera - roundup2(zsize, UMA_BOOT_ALIGN) <
 		    SIZEOF_UMA_SLAB)
 			ppera++;
 		pages += (zones + vm_zones) * ppera;
 	} else if (roundup2(zsize, UMA_BOOT_ALIGN) > UMA_SLAB_SPACE)
 		/* See keg_small_init() special case for uk_ppera = 1. */
 		pages += zones;
 	else
 		pages += howmany(zones,
 		    UMA_SLAB_SPACE / roundup2(zsize, UMA_BOOT_ALIGN));
 
 	/* ... and their kegs. Note that zone of zones allocates a keg! */
 	pages += howmany(zones + 1,
 	    UMA_SLAB_SPACE / roundup2(ksize, UMA_BOOT_ALIGN));
 
 	/*
 	 * Most of startup zones are not going to be offpages, that's
 	 * why we use UMA_SLAB_SPACE instead of UMA_SLAB_SIZE in all
 	 * calculations.  Some large bucket zones will be offpage, and
 	 * thus will allocate hashes.  We take conservative approach
 	 * and assume that all zones may allocate hash.  This may give
 	 * us some positive inaccuracy, usually an extra single page.
 	 */
 	pages += howmany(zones, UMA_SLAB_SPACE /
 	    (sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT));
 
 	return (pages);
 }
 
 void
 uma_startup(void *mem, int npages)
 {
 	struct uma_zctor_args args;
 	uma_keg_t masterkeg;
 	uintptr_t m;
 
 #ifdef DIAGNOSTIC
 	printf("Entering %s with %d boot pages configured\n", __func__, npages);
 #endif
 
 	rw_init(&uma_rwlock, "UMA lock");
 
 	/* Use bootpages memory for the zone of zones and zone of kegs. */
 	m = (uintptr_t)mem;
 	zones = (uma_zone_t)m;
 	m += roundup(zsize, CACHE_LINE_SIZE);
 	kegs = (uma_zone_t)m;
 	m += roundup(zsize, CACHE_LINE_SIZE);
 	masterkeg = (uma_keg_t)m;
 	m += roundup(ksize, CACHE_LINE_SIZE);
 	m = roundup(m, PAGE_SIZE);
 	npages -= (m - (uintptr_t)mem) / PAGE_SIZE;
 	mem = (void *)m;
 
 	/* "manually" create the initial zone */
 	memset(&args, 0, sizeof(args));
 	args.name = "UMA Kegs";
 	args.size = ksize;
 	args.ctor = keg_ctor;
 	args.dtor = keg_dtor;
 	args.uminit = zero_init;
 	args.fini = NULL;
 	args.keg = masterkeg;
 	args.align = UMA_BOOT_ALIGN - 1;
 	args.flags = UMA_ZFLAG_INTERNAL;
 	zone_ctor(kegs, zsize, &args, M_WAITOK);
 
 	bootmem = mem;
 	boot_pages = npages;
 
 	args.name = "UMA Zones";
 	args.size = zsize;
 	args.ctor = zone_ctor;
 	args.dtor = zone_dtor;
 	args.uminit = zero_init;
 	args.fini = NULL;
 	args.keg = NULL;
 	args.align = UMA_BOOT_ALIGN - 1;
 	args.flags = UMA_ZFLAG_INTERNAL;
 	zone_ctor(zones, zsize, &args, M_WAITOK);
 
 	/* Now make a zone for slab headers */
 	slabzone = uma_zcreate("UMA Slabs",
 				sizeof(struct uma_slab),
 				NULL, NULL, NULL, NULL,
 				UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
 
 	hashzone = uma_zcreate("UMA Hash",
 	    sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
 	    NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
 
 	bucket_init();
 
 	booted = BOOT_STRAPPED;
 }
 
 void
 uma_startup1(void)
 {
 
 #ifdef DIAGNOSTIC
 	printf("Entering %s with %d boot pages left\n", __func__, boot_pages);
 #endif
 	booted = BOOT_PAGEALLOC;
 }
 
 void
 uma_startup2(void)
 {
 
 #ifdef DIAGNOSTIC
 	printf("Entering %s with %d boot pages left\n", __func__, boot_pages);
 #endif
 	booted = BOOT_BUCKETS;
 	sx_init(&uma_drain_lock, "umadrain");
 	bucket_enable();
 }
 
 /*
  * Initialize our callout handle
  *
  */
 static void
 uma_startup3(void)
 {
 
 #ifdef INVARIANTS
 	TUNABLE_INT_FETCH("vm.debug.divisor", &dbg_divisor);
 	uma_dbg_cnt = counter_u64_alloc(M_WAITOK);
 	uma_skip_cnt = counter_u64_alloc(M_WAITOK);
 #endif
 	zone_foreach(zone_alloc_counters);
 	callout_init(&uma_callout, 1);
 	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
 	booted = BOOT_RUNNING;
 }
 
 static uma_keg_t
 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
 		int align, uint32_t flags)
 {
 	struct uma_kctor_args args;
 
 	args.size = size;
 	args.uminit = uminit;
 	args.fini = fini;
 	args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align;
 	args.flags = flags;
 	args.zone = zone;
 	return (zone_alloc_item(kegs, &args, UMA_ANYDOMAIN, M_WAITOK));
 }
 
 /* Public functions */
 /* See uma.h */
 void
 uma_set_align(int align)
 {
 
 	if (align != UMA_ALIGN_CACHE)
 		uma_align_cache = align;
 }
 
 /* See uma.h */
 uma_zone_t
 uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
 		uma_init uminit, uma_fini fini, int align, uint32_t flags)
 
 {
 	struct uma_zctor_args args;
 	uma_zone_t res;
 	bool locked;
 
 	KASSERT(powerof2(align + 1), ("invalid zone alignment %d for \"%s\"",
 	    align, name));
 
+	/* Sets all zones to a first-touch domain policy. */
+#ifdef UMA_FIRSTTOUCH
+	flags |= UMA_ZONE_NUMA;
+#endif
+
 	/* This stuff is essential for the zone ctor */
 	memset(&args, 0, sizeof(args));
 	args.name = name;
 	args.size = size;
 	args.ctor = ctor;
 	args.dtor = dtor;
 	args.uminit = uminit;
 	args.fini = fini;
 #ifdef  INVARIANTS
 	/*
 	 * If a zone is being created with an empty constructor and
 	 * destructor, pass UMA constructor/destructor which checks for
 	 * memory use after free.
 	 */
 	if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOFREE))) &&
 	    ctor == NULL && dtor == NULL && uminit == NULL && fini == NULL) {
 		args.ctor = trash_ctor;
 		args.dtor = trash_dtor;
 		args.uminit = trash_init;
 		args.fini = trash_fini;
 	}
 #endif
 	args.align = align;
 	args.flags = flags;
 	args.keg = NULL;
 
 	if (booted < BOOT_BUCKETS) {
 		locked = false;
 	} else {
 		sx_slock(&uma_drain_lock);
 		locked = true;
 	}
 	res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
 	if (locked)
 		sx_sunlock(&uma_drain_lock);
 	return (res);
 }
 
 /* See uma.h */
 uma_zone_t
 uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
 		    uma_init zinit, uma_fini zfini, uma_zone_t master)
 {
 	struct uma_zctor_args args;
 	uma_keg_t keg;
 	uma_zone_t res;
 	bool locked;
 
 	keg = master->uz_keg;
 	memset(&args, 0, sizeof(args));
 	args.name = name;
 	args.size = keg->uk_size;
 	args.ctor = ctor;
 	args.dtor = dtor;
 	args.uminit = zinit;
 	args.fini = zfini;
 	args.align = keg->uk_align;
 	args.flags = keg->uk_flags | UMA_ZONE_SECONDARY;
 	args.keg = keg;
 
 	if (booted < BOOT_BUCKETS) {
 		locked = false;
 	} else {
 		sx_slock(&uma_drain_lock);
 		locked = true;
 	}
 	/* XXX Attaches only one keg of potentially many. */
 	res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
 	if (locked)
 		sx_sunlock(&uma_drain_lock);
 	return (res);
 }
 
 /* See uma.h */
 uma_zone_t
 uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor,
 		    uma_init zinit, uma_fini zfini, uma_import zimport,
 		    uma_release zrelease, void *arg, int flags)
 {
 	struct uma_zctor_args args;
 
 	memset(&args, 0, sizeof(args));
 	args.name = name;
 	args.size = size;
 	args.ctor = ctor;
 	args.dtor = dtor;
 	args.uminit = zinit;
 	args.fini = zfini;
 	args.import = zimport;
 	args.release = zrelease;
 	args.arg = arg;
 	args.align = 0;
 	args.flags = flags | UMA_ZFLAG_CACHE;
 
 	return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK));
 }
 
 /* See uma.h */
 void
 uma_zdestroy(uma_zone_t zone)
 {
 
 	sx_slock(&uma_drain_lock);
 	zone_free_item(zones, zone, NULL, SKIP_NONE);
 	sx_sunlock(&uma_drain_lock);
 }
 
 void
 uma_zwait(uma_zone_t zone)
 {
 	void *item;
 
 	item = uma_zalloc_arg(zone, NULL, M_WAITOK);
 	uma_zfree(zone, item);
 }
 
 void *
 uma_zalloc_pcpu_arg(uma_zone_t zone, void *udata, int flags)
 {
 	void *item;
 #ifdef SMP
 	int i;
 
 	MPASS(zone->uz_flags & UMA_ZONE_PCPU);
 #endif
 	item = uma_zalloc_arg(zone, udata, flags & ~M_ZERO);
 	if (item != NULL && (flags & M_ZERO)) {
 #ifdef SMP
 		for (i = 0; i <= mp_maxid; i++)
 			bzero(zpcpu_get_cpu(item, i), zone->uz_size);
 #else
 		bzero(item, zone->uz_size);
 #endif
 	}
 	return (item);
 }
 
 /*
  * A stub while both regular and pcpu cases are identical.
  */
 void
 uma_zfree_pcpu_arg(uma_zone_t zone, void *item, void *udata)
 {
 
 #ifdef SMP
 	MPASS(zone->uz_flags & UMA_ZONE_PCPU);
 #endif
 	uma_zfree_arg(zone, item, udata);
 }
 
 /* See uma.h */
 void *
 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
 {
 	uma_zone_domain_t zdom;
 	uma_bucket_t bucket;
 	uma_cache_t cache;
 	void *item;
 	int cpu, domain, lockfail, maxbucket;
 #ifdef INVARIANTS
 	bool skipdbg;
 #endif
 
 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
 
 	/* This is the fast path allocation */
 	CTR4(KTR_UMA, "uma_zalloc_arg thread %x zone %s(%p) flags %d",
 	    curthread, zone->uz_name, zone, flags);
 
 	if (flags & M_WAITOK) {
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 		    "uma_zalloc_arg: zone \"%s\"", zone->uz_name);
 	}
 	KASSERT((flags & M_EXEC) == 0, ("uma_zalloc_arg: called with M_EXEC"));
 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
 	    ("uma_zalloc_arg: called with spinlock or critical section held"));
 	if (zone->uz_flags & UMA_ZONE_PCPU)
 		KASSERT((flags & M_ZERO) == 0, ("allocating from a pcpu zone "
 		    "with M_ZERO passed"));
 
 #ifdef DEBUG_MEMGUARD
 	if (memguard_cmp_zone(zone)) {
 		item = memguard_alloc(zone->uz_size, flags);
 		if (item != NULL) {
 			if (zone->uz_init != NULL &&
 			    zone->uz_init(item, zone->uz_size, flags) != 0)
 				return (NULL);
 			if (zone->uz_ctor != NULL &&
 			    zone->uz_ctor(item, zone->uz_size, udata,
 			    flags) != 0) {
 			    	zone->uz_fini(item, zone->uz_size);
 				return (NULL);
 			}
 			return (item);
 		}
 		/* This is unfortunate but should not be fatal. */
 	}
 #endif
 	/*
 	 * If possible, allocate from the per-CPU cache.  There are two
 	 * requirements for safe access to the per-CPU cache: (1) the thread
 	 * accessing the cache must not be preempted or yield during access,
 	 * and (2) the thread must not migrate CPUs without switching which
 	 * cache it accesses.  We rely on a critical section to prevent
 	 * preemption and migration.  We release the critical section in
 	 * order to acquire the zone mutex if we are unable to allocate from
 	 * the current cache; when we re-acquire the critical section, we
 	 * must detect and handle migration if it has occurred.
 	 */
 zalloc_restart:
 	critical_enter();
 	cpu = curcpu;
 	cache = &zone->uz_cpu[cpu];
 
 zalloc_start:
 	bucket = cache->uc_allocbucket;
 	if (bucket != NULL && bucket->ub_cnt > 0) {
 		bucket->ub_cnt--;
 		item = bucket->ub_bucket[bucket->ub_cnt];
 #ifdef INVARIANTS
 		bucket->ub_bucket[bucket->ub_cnt] = NULL;
 #endif
 		KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled."));
 		cache->uc_allocs++;
 		critical_exit();
 #ifdef INVARIANTS
 		skipdbg = uma_dbg_zskip(zone, item);
 #endif
 		if (zone->uz_ctor != NULL &&
 #ifdef INVARIANTS
 		    (!skipdbg || zone->uz_ctor != trash_ctor ||
 		    zone->uz_dtor != trash_dtor) &&
 #endif
 		    zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
 			counter_u64_add(zone->uz_fails, 1);
 			zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT);
 			return (NULL);
 		}
 #ifdef INVARIANTS
 		if (!skipdbg)
 			uma_dbg_alloc(zone, NULL, item);
 #endif
 		if (flags & M_ZERO)
 			uma_zero_item(item, zone);
 		return (item);
 	}
 
 	/*
 	 * We have run out of items in our alloc bucket.
 	 * See if we can switch with our free bucket.
 	 */
 	bucket = cache->uc_freebucket;
 	if (bucket != NULL && bucket->ub_cnt > 0) {
 		CTR2(KTR_UMA,
 		    "uma_zalloc: zone %s(%p) swapping empty with alloc",
 		    zone->uz_name, zone);
 		cache->uc_freebucket = cache->uc_allocbucket;
 		cache->uc_allocbucket = bucket;
 		goto zalloc_start;
 	}
 
 	/*
 	 * Discard any empty allocation bucket while we hold no locks.
 	 */
 	bucket = cache->uc_allocbucket;
 	cache->uc_allocbucket = NULL;
 	critical_exit();
 	if (bucket != NULL)
 		bucket_free(zone, bucket, udata);
 
-	if (zone->uz_flags & UMA_ZONE_NUMA) {
-		domain = PCPU_GET(domain);
-		if (VM_DOMAIN_EMPTY(domain))
-			domain = UMA_ANYDOMAIN;
-	} else
-		domain = UMA_ANYDOMAIN;
-
 	/* Short-circuit for zones without buckets and low memory. */
 	if (zone->uz_count == 0 || bucketdisable) {
 		ZONE_LOCK(zone);
+		if (zone->uz_flags & UMA_ZONE_NUMA)
+			domain = PCPU_GET(domain);
+		else
+			domain = UMA_ANYDOMAIN;
 		goto zalloc_item;
 	}
 
 	/*
 	 * Attempt to retrieve the item from the per-CPU cache has failed, so
 	 * we must go back to the zone.  This requires the zone lock, so we
 	 * must drop the critical section, then re-acquire it when we go back
 	 * to the cache.  Since the critical section is released, we may be
 	 * preempted or migrate.  As such, make sure not to maintain any
 	 * thread-local state specific to the cache from prior to releasing
 	 * the critical section.
 	 */
 	lockfail = 0;
 	if (ZONE_TRYLOCK(zone) == 0) {
 		/* Record contention to size the buckets. */
 		ZONE_LOCK(zone);
 		lockfail = 1;
 	}
 	critical_enter();
 	cpu = curcpu;
 	cache = &zone->uz_cpu[cpu];
 
 	/* See if we lost the race to fill the cache. */
 	if (cache->uc_allocbucket != NULL) {
 		ZONE_UNLOCK(zone);
 		goto zalloc_start;
 	}
 
 	/*
 	 * Check the zone's cache of buckets.
 	 */
-	if (domain == UMA_ANYDOMAIN)
-		zdom = &zone->uz_domain[0];
-	else
+	if (zone->uz_flags & UMA_ZONE_NUMA) {
+		domain = PCPU_GET(domain);
 		zdom = &zone->uz_domain[domain];
+	} else {
+		domain = UMA_ANYDOMAIN;
+		zdom = &zone->uz_domain[0];
+	}
+
 	if ((bucket = zone_try_fetch_bucket(zone, zdom, true)) != NULL) {
 		KASSERT(bucket->ub_cnt != 0,
 		    ("uma_zalloc_arg: Returning an empty bucket."));
 		cache->uc_allocbucket = bucket;
 		ZONE_UNLOCK(zone);
 		goto zalloc_start;
 	}
 	/* We are no longer associated with this CPU. */
 	critical_exit();
 
 	/*
 	 * We bump the uz count when the cache size is insufficient to
 	 * handle the working set.
 	 */
 	if (lockfail && zone->uz_count < zone->uz_count_max)
 		zone->uz_count++;
 
 	if (zone->uz_max_items > 0) {
 		if (zone->uz_items >= zone->uz_max_items)
 			goto zalloc_item;
 		maxbucket = MIN(zone->uz_count,
 		    zone->uz_max_items - zone->uz_items);
 		zone->uz_items += maxbucket;
 	} else
 		maxbucket = zone->uz_count;
 	ZONE_UNLOCK(zone);
 
 	/*
 	 * Now lets just fill a bucket and put it on the free list.  If that
 	 * works we'll restart the allocation from the beginning and it
 	 * will use the just filled bucket.
 	 */
 	bucket = zone_alloc_bucket(zone, udata, domain, flags, maxbucket);
 	CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p",
 	    zone->uz_name, zone, bucket);
 	ZONE_LOCK(zone);
 	if (bucket != NULL) {
 		if (zone->uz_max_items > 0 && bucket->ub_cnt < maxbucket) {
 			MPASS(zone->uz_items >= maxbucket - bucket->ub_cnt);
 			zone->uz_items -= maxbucket - bucket->ub_cnt;
 			if (zone->uz_sleepers > 0 &&
 			    zone->uz_items < zone->uz_max_items)
 				wakeup_one(zone);
 		}
 		critical_enter();
 		cpu = curcpu;
 		cache = &zone->uz_cpu[cpu];
 
 		/*
 		 * See if we lost the race or were migrated.  Cache the
 		 * initialized bucket to make this less likely or claim
 		 * the memory directly.
 		 */
 		if (cache->uc_allocbucket == NULL &&
 		    ((zone->uz_flags & UMA_ZONE_NUMA) == 0 ||
 		    domain == PCPU_GET(domain))) {
 			cache->uc_allocbucket = bucket;
 			zdom->uzd_imax += bucket->ub_cnt;
 		} else if (zone->uz_bkt_count >= zone->uz_bkt_max) {
 			critical_exit();
 			ZONE_UNLOCK(zone);
 			bucket_drain(zone, bucket);
 			bucket_free(zone, bucket, udata);
 			goto zalloc_restart;
 		} else
 			zone_put_bucket(zone, zdom, bucket, false);
 		ZONE_UNLOCK(zone);
 		goto zalloc_start;
 	} else if (zone->uz_max_items > 0) {
 		zone->uz_items -= maxbucket;
 		if (zone->uz_sleepers > 0 &&
 		    zone->uz_items + 1 < zone->uz_max_items)
 			wakeup_one(zone);
 	}
 
 	/*
 	 * We may not be able to get a bucket so return an actual item.
 	 */
 zalloc_item:
 	item = zone_alloc_item_locked(zone, udata, domain, flags);
 
 	return (item);
 }
 
 void *
 uma_zalloc_domain(uma_zone_t zone, void *udata, int domain, int flags)
 {
 
 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
 
 	/* This is the fast path allocation */
 	CTR5(KTR_UMA,
 	    "uma_zalloc_domain thread %x zone %s(%p) domain %d flags %d",
 	    curthread, zone->uz_name, zone, domain, flags);
 
 	if (flags & M_WAITOK) {
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 		    "uma_zalloc_domain: zone \"%s\"", zone->uz_name);
 	}
 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
 	    ("uma_zalloc_domain: called with spinlock or critical section held"));
 
 	return (zone_alloc_item(zone, udata, domain, flags));
 }
 
 /*
  * Find a slab with some space.  Prefer slabs that are partially used over those
  * that are totally full.  This helps to reduce fragmentation.
  *
  * If 'rr' is 1, search all domains starting from 'domain'.  Otherwise check
  * only 'domain'.
  */
 static uma_slab_t
 keg_first_slab(uma_keg_t keg, int domain, bool rr)
 {
 	uma_domain_t dom;
 	uma_slab_t slab;
 	int start;
 
 	KASSERT(domain >= 0 && domain < vm_ndomains,
 	    ("keg_first_slab: domain %d out of range", domain));
 	KEG_LOCK_ASSERT(keg);
 
 	slab = NULL;
 	start = domain;
 	do {
 		dom = &keg->uk_domain[domain];
 		if (!LIST_EMPTY(&dom->ud_part_slab))
 			return (LIST_FIRST(&dom->ud_part_slab));
 		if (!LIST_EMPTY(&dom->ud_free_slab)) {
 			slab = LIST_FIRST(&dom->ud_free_slab);
 			LIST_REMOVE(slab, us_link);
 			LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
 			return (slab);
 		}
 		if (rr)
 			domain = (domain + 1) % vm_ndomains;
 	} while (domain != start);
 
 	return (NULL);
 }
 
 static uma_slab_t
 keg_fetch_free_slab(uma_keg_t keg, int domain, bool rr, int flags)
 {
 	uint32_t reserve;
 
 	KEG_LOCK_ASSERT(keg);
 
 	reserve = (flags & M_USE_RESERVE) != 0 ? 0 : keg->uk_reserve;
 	if (keg->uk_free <= reserve)
 		return (NULL);
 	return (keg_first_slab(keg, domain, rr));
 }
 
 static uma_slab_t
 keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, const int flags)
 {
 	struct vm_domainset_iter di;
 	uma_domain_t dom;
 	uma_slab_t slab;
 	int aflags, domain;
 	bool rr;
 
 restart:
 	KEG_LOCK_ASSERT(keg);
 
 	/*
 	 * Use the keg's policy if upper layers haven't already specified a
 	 * domain (as happens with first-touch zones).
 	 *
 	 * To avoid races we run the iterator with the keg lock held, but that
 	 * means that we cannot allow the vm_domainset layer to sleep.  Thus,
 	 * clear M_WAITOK and handle low memory conditions locally.
 	 */
 	rr = rdomain == UMA_ANYDOMAIN;
 	if (rr) {
 		aflags = (flags & ~M_WAITOK) | M_NOWAIT;
 		vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
 		    &aflags);
 	} else {
 		aflags = flags;
 		domain = rdomain;
 	}
 
 	for (;;) {
 		slab = keg_fetch_free_slab(keg, domain, rr, flags);
 		if (slab != NULL) {
 			MPASS(slab->us_keg == keg);
 			return (slab);
 		}
 
 		/*
 		 * M_NOVM means don't ask at all!
 		 */
 		if (flags & M_NOVM)
 			break;
 
 		KASSERT(zone->uz_max_items == 0 ||
 		    zone->uz_items <= zone->uz_max_items,
 		    ("%s: zone %p overflow", __func__, zone));
 
 		slab = keg_alloc_slab(keg, zone, domain, flags, aflags);
 		/*
 		 * If we got a slab here it's safe to mark it partially used
 		 * and return.  We assume that the caller is going to remove
 		 * at least one item.
 		 */
 		if (slab) {
 			MPASS(slab->us_keg == keg);
 			dom = &keg->uk_domain[slab->us_domain];
 			LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
 			return (slab);
 		}
 		KEG_LOCK(keg);
 		if (rr && vm_domainset_iter_policy(&di, &domain) != 0) {
 			if ((flags & M_WAITOK) != 0) {
 				KEG_UNLOCK(keg);
 				vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask);
 				KEG_LOCK(keg);
 				goto restart;
 			}
 			break;
 		}
 	}
 
 	/*
 	 * We might not have been able to get a slab but another cpu
 	 * could have while we were unlocked.  Check again before we
 	 * fail.
 	 */
 	if ((slab = keg_fetch_free_slab(keg, domain, rr, flags)) != NULL) {
 		MPASS(slab->us_keg == keg);
 		return (slab);
 	}
 	return (NULL);
 }
 
 static uma_slab_t
 zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int domain, int flags)
 {
 	uma_slab_t slab;
 
 	if (keg == NULL) {
 		keg = zone->uz_keg;
 		KEG_LOCK(keg);
 	}
 
 	for (;;) {
 		slab = keg_fetch_slab(keg, zone, domain, flags);
 		if (slab)
 			return (slab);
 		if (flags & (M_NOWAIT | M_NOVM))
 			break;
 	}
 	KEG_UNLOCK(keg);
 	return (NULL);
 }
 
 static void *
 slab_alloc_item(uma_keg_t keg, uma_slab_t slab)
 {
 	uma_domain_t dom;
 	void *item;
 	uint8_t freei;
 
 	MPASS(keg == slab->us_keg);
 	KEG_LOCK_ASSERT(keg);
 
 	freei = BIT_FFS(SLAB_SETSIZE, &slab->us_free) - 1;
 	BIT_CLR(SLAB_SETSIZE, freei, &slab->us_free);
 	item = slab->us_data + (keg->uk_rsize * freei);
 	slab->us_freecount--;
 	keg->uk_free--;
 
 	/* Move this slab to the full list */
 	if (slab->us_freecount == 0) {
 		LIST_REMOVE(slab, us_link);
 		dom = &keg->uk_domain[slab->us_domain];
 		LIST_INSERT_HEAD(&dom->ud_full_slab, slab, us_link);
 	}
 
 	return (item);
 }
 
 static int
 zone_import(uma_zone_t zone, void **bucket, int max, int domain, int flags)
 {
 	uma_slab_t slab;
 	uma_keg_t keg;
 #ifdef NUMA
 	int stripe;
 #endif
 	int i;
 
 	slab = NULL;
 	keg = NULL;
 	/* Try to keep the buckets totally full */
 	for (i = 0; i < max; ) {
 		if ((slab = zone_fetch_slab(zone, keg, domain, flags)) == NULL)
 			break;
 		keg = slab->us_keg;
 #ifdef NUMA
 		stripe = howmany(max, vm_ndomains);
 #endif
 		while (slab->us_freecount && i < max) { 
 			bucket[i++] = slab_alloc_item(keg, slab);
 			if (keg->uk_free <= keg->uk_reserve)
 				break;
 #ifdef NUMA
 			/*
 			 * If the zone is striped we pick a new slab for every
 			 * N allocations.  Eliminating this conditional will
 			 * instead pick a new domain for each bucket rather
 			 * than stripe within each bucket.  The current option
 			 * produces more fragmentation and requires more cpu
 			 * time but yields better distribution.
 			 */
 			if ((zone->uz_flags & UMA_ZONE_NUMA) == 0 &&
 			    vm_ndomains > 1 && --stripe == 0)
 				break;
 #endif
 		}
 		/* Don't block if we allocated any successfully. */
 		flags &= ~M_WAITOK;
 		flags |= M_NOWAIT;
 	}
 	if (slab != NULL)
 		KEG_UNLOCK(keg);
 
 	return i;
 }
 
 static uma_bucket_t
 zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags, int max)
 {
 	uma_bucket_t bucket;
 
 	CTR1(KTR_UMA, "zone_alloc:_bucket domain %d)", domain);
 
+	/* Avoid allocs targeting empty domains. */
+	if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
+		domain = UMA_ANYDOMAIN;
+
 	/* Don't wait for buckets, preserve caller's NOVM setting. */
 	bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM));
 	if (bucket == NULL)
 		return (NULL);
 
 	bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket,
 	    MIN(max, bucket->ub_entries), domain, flags);
 
 	/*
 	 * Initialize the memory if necessary.
 	 */
 	if (bucket->ub_cnt != 0 && zone->uz_init != NULL) {
 		int i;
 
 		for (i = 0; i < bucket->ub_cnt; i++)
 			if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size,
 			    flags) != 0)
 				break;
 		/*
 		 * If we couldn't initialize the whole bucket, put the
 		 * rest back onto the freelist.
 		 */
 		if (i != bucket->ub_cnt) {
 			zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i],
 			    bucket->ub_cnt - i);
 #ifdef INVARIANTS
 			bzero(&bucket->ub_bucket[i],
 			    sizeof(void *) * (bucket->ub_cnt - i));
 #endif
 			bucket->ub_cnt = i;
 		}
 	}
 
 	if (bucket->ub_cnt == 0) {
 		bucket_free(zone, bucket, udata);
 		counter_u64_add(zone->uz_fails, 1);
 		return (NULL);
 	}
 
 	return (bucket);
 }
 
 /*
  * Allocates a single item from a zone.
  *
  * Arguments
  *	zone   The zone to alloc for.
  *	udata  The data to be passed to the constructor.
  *	domain The domain to allocate from or UMA_ANYDOMAIN.
  *	flags  M_WAITOK, M_NOWAIT, M_ZERO.
  *
  * Returns
  *	NULL if there is no memory and M_NOWAIT is set
  *	An item if successful
  */
 
 static void *
 zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags)
 {
 
 	ZONE_LOCK(zone);
 	return (zone_alloc_item_locked(zone, udata, domain, flags));
 }
 
 /*
  * Returns with zone unlocked.
  */
 static void *
 zone_alloc_item_locked(uma_zone_t zone, void *udata, int domain, int flags)
 {
 	void *item;
 #ifdef INVARIANTS
 	bool skipdbg;
 #endif
 
 	ZONE_LOCK_ASSERT(zone);
 
 	if (zone->uz_max_items > 0) {
 		if (zone->uz_items >= zone->uz_max_items) {
 			zone_log_warning(zone);
 			zone_maxaction(zone);
 			if (flags & M_NOWAIT) {
 				ZONE_UNLOCK(zone);
 				return (NULL);
 			}
 			zone->uz_sleeps++;
 			zone->uz_sleepers++;
 			while (zone->uz_items >= zone->uz_max_items)
 				mtx_sleep(zone, zone->uz_lockptr, PVM,
 				    "zonelimit", 0);
 			zone->uz_sleepers--;
 			if (zone->uz_sleepers > 0 &&
 			    zone->uz_items + 1 < zone->uz_max_items)
 				wakeup_one(zone);
 		}
 		zone->uz_items++;
 	}
 	ZONE_UNLOCK(zone);
 
-	if (domain != UMA_ANYDOMAIN) {
-		/* avoid allocs targeting empty domains */
-		if (VM_DOMAIN_EMPTY(domain))
-			domain = UMA_ANYDOMAIN;
-	}
+	/* Avoid allocs targeting empty domains. */
+	if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
+		domain = UMA_ANYDOMAIN;
+
 	if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1)
 		goto fail;
 
 #ifdef INVARIANTS
 	skipdbg = uma_dbg_zskip(zone, item);
 #endif
 	/*
 	 * We have to call both the zone's init (not the keg's init)
 	 * and the zone's ctor.  This is because the item is going from
 	 * a keg slab directly to the user, and the user is expecting it
 	 * to be both zone-init'd as well as zone-ctor'd.
 	 */
 	if (zone->uz_init != NULL) {
 		if (zone->uz_init(item, zone->uz_size, flags) != 0) {
 			zone_free_item(zone, item, udata, SKIP_FINI | SKIP_CNT);
 			goto fail;
 		}
 	}
 	if (zone->uz_ctor != NULL &&
 #ifdef INVARIANTS
 	    (!skipdbg || zone->uz_ctor != trash_ctor ||
 	    zone->uz_dtor != trash_dtor) &&
 #endif
 	    zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
 		zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT);
 		goto fail;
 	}
 #ifdef INVARIANTS
 	if (!skipdbg)
 		uma_dbg_alloc(zone, NULL, item);
 #endif
 	if (flags & M_ZERO)
 		uma_zero_item(item, zone);
 
 	counter_u64_add(zone->uz_allocs, 1);
 	CTR3(KTR_UMA, "zone_alloc_item item %p from %s(%p)", item,
 	    zone->uz_name, zone);
 
 	return (item);
 
 fail:
 	if (zone->uz_max_items > 0) {
 		ZONE_LOCK(zone);
 		zone->uz_items--;
 		ZONE_UNLOCK(zone);
 	}
 	counter_u64_add(zone->uz_fails, 1);
 	CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)",
 	    zone->uz_name, zone);
 	return (NULL);
 }
 
 /* See uma.h */
 void
 uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
 {
 	uma_cache_t cache;
 	uma_bucket_t bucket;
 	uma_zone_domain_t zdom;
 	int cpu, domain;
+#ifdef UMA_XDOMAIN
+	int itemdomain;
+#endif
 	bool lockfail;
 #ifdef INVARIANTS
 	bool skipdbg;
 #endif
 
 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
 
 	CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread,
 	    zone->uz_name);
 
 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
 	    ("uma_zfree_arg: called with spinlock or critical section held"));
 
         /* uma_zfree(..., NULL) does nothing, to match free(9). */
         if (item == NULL)
                 return;
 #ifdef DEBUG_MEMGUARD
 	if (is_memguard_addr(item)) {
 		if (zone->uz_dtor != NULL)
 			zone->uz_dtor(item, zone->uz_size, udata);
 		if (zone->uz_fini != NULL)
 			zone->uz_fini(item, zone->uz_size);
 		memguard_free(item);
 		return;
 	}
 #endif
 #ifdef INVARIANTS
 	skipdbg = uma_dbg_zskip(zone, item);
 	if (skipdbg == false) {
 		if (zone->uz_flags & UMA_ZONE_MALLOC)
 			uma_dbg_free(zone, udata, item);
 		else
 			uma_dbg_free(zone, NULL, item);
 	}
 	if (zone->uz_dtor != NULL && (!skipdbg ||
 	    zone->uz_dtor != trash_dtor || zone->uz_ctor != trash_ctor))
 #else
 	if (zone->uz_dtor != NULL)
 #endif
 		zone->uz_dtor(item, zone->uz_size, udata);
 
 	/*
 	 * The race here is acceptable.  If we miss it we'll just have to wait
 	 * a little longer for the limits to be reset.
 	 */
 	if (zone->uz_sleepers > 0)
 		goto zfree_item;
 
+#ifdef UMA_XDOMAIN
+	if ((zone->uz_flags & UMA_ZONE_NUMA) != 0)
+		itemdomain = _vm_phys_domain(pmap_kextract((vm_offset_t)item));
+#endif
+
 	/*
 	 * If possible, free to the per-CPU cache.  There are two
 	 * requirements for safe access to the per-CPU cache: (1) the thread
 	 * accessing the cache must not be preempted or yield during access,
 	 * and (2) the thread must not migrate CPUs without switching which
 	 * cache it accesses.  We rely on a critical section to prevent
 	 * preemption and migration.  We release the critical section in
 	 * order to acquire the zone mutex if we are unable to free to the
 	 * current cache; when we re-acquire the critical section, we must
 	 * detect and handle migration if it has occurred.
 	 */
 zfree_restart:
 	critical_enter();
 	cpu = curcpu;
 	cache = &zone->uz_cpu[cpu];
 
 zfree_start:
+	domain = PCPU_GET(domain);
+#ifdef UMA_XDOMAIN
+	if ((zone->uz_flags & UMA_ZONE_NUMA) == 0)
+		itemdomain = domain;
+#endif
 	/*
 	 * Try to free into the allocbucket first to give LIFO ordering
 	 * for cache-hot datastructures.  Spill over into the freebucket
 	 * if necessary.  Alloc will swap them if one runs dry.
 	 */
-	bucket = cache->uc_allocbucket;
-	if (bucket == NULL || bucket->ub_cnt >= bucket->ub_entries)
-		bucket = cache->uc_freebucket;
+#ifdef UMA_XDOMAIN
+	if (domain != itemdomain) {
+		bucket = cache->uc_crossbucket;
+	} else
+#endif
+	{
+		bucket = cache->uc_allocbucket;
+		if (bucket == NULL || bucket->ub_cnt >= bucket->ub_entries)
+			bucket = cache->uc_freebucket;
+	}
 	if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
 		KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL,
 		    ("uma_zfree: Freeing to non free bucket index."));
 		bucket->ub_bucket[bucket->ub_cnt] = item;
 		bucket->ub_cnt++;
 		cache->uc_frees++;
 		critical_exit();
 		return;
 	}
 
 	/*
 	 * We must go back the zone, which requires acquiring the zone lock,
 	 * which in turn means we must release and re-acquire the critical
 	 * section.  Since the critical section is released, we may be
 	 * preempted or migrate.  As such, make sure not to maintain any
 	 * thread-local state specific to the cache from prior to releasing
 	 * the critical section.
 	 */
 	critical_exit();
 	if (zone->uz_count == 0 || bucketdisable)
 		goto zfree_item;
 
 	lockfail = false;
 	if (ZONE_TRYLOCK(zone) == 0) {
 		/* Record contention to size the buckets. */
 		ZONE_LOCK(zone);
 		lockfail = true;
 	}
 	critical_enter();
 	cpu = curcpu;
+	domain = PCPU_GET(domain);
 	cache = &zone->uz_cpu[cpu];
 
-	bucket = cache->uc_freebucket;
+#ifdef UMA_XDOMAIN
+	if (domain != itemdomain)
+		bucket = cache->uc_crossbucket;
+	else
+#endif
+		bucket = cache->uc_freebucket;
 	if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
 		ZONE_UNLOCK(zone);
 		goto zfree_start;
 	}
-	cache->uc_freebucket = NULL;
+#ifdef UMA_XDOMAIN
+	if (domain != itemdomain)
+		cache->uc_crossbucket = NULL;
+	else
+#endif
+		cache->uc_freebucket = NULL;
 	/* We are no longer associated with this CPU. */
 	critical_exit();
 
+#ifdef UMA_XDOMAIN
+	if (domain != itemdomain) {
+		if (bucket != NULL) {
+			zone->uz_xdomain += bucket->ub_cnt;
+			if (vm_ndomains > 2 ||
+			    zone->uz_bkt_count >= zone->uz_bkt_max) {
+				ZONE_UNLOCK(zone);
+				bucket_drain(zone, bucket);
+				bucket_free(zone, bucket, udata);
+			} else {
+				zdom = &zone->uz_domain[itemdomain];
+				zone_put_bucket(zone, zdom, bucket, true);
+				ZONE_UNLOCK(zone);
+			}
+		} else
+			ZONE_UNLOCK(zone);
+		bucket = bucket_alloc(zone, udata, M_NOWAIT);
+		if (bucket == NULL)
+			goto zfree_item;
+		critical_enter();
+		cpu = curcpu;
+		cache = &zone->uz_cpu[cpu];
+		if (cache->uc_crossbucket == NULL) {
+			cache->uc_crossbucket = bucket;
+			goto zfree_start;
+		}
+		critical_exit();
+		bucket_free(zone, bucket, udata);
+		goto zfree_restart;
+	}
+#endif
+
 	if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) {
-		domain = PCPU_GET(domain);
-		if (VM_DOMAIN_EMPTY(domain))
-			domain = UMA_ANYDOMAIN;
-	} else
+		zdom = &zone->uz_domain[domain];
+	} else {
 		domain = 0;
-	zdom = &zone->uz_domain[0];
+		zdom = &zone->uz_domain[0];
+	}
 
 	/* Can we throw this on the zone full list? */
 	if (bucket != NULL) {
 		CTR3(KTR_UMA,
 		    "uma_zfree: zone %s(%p) putting bucket %p on free list",
 		    zone->uz_name, zone, bucket);
 		/* ub_cnt is pointing to the last free item */
 		KASSERT(bucket->ub_cnt == bucket->ub_entries,
 		    ("uma_zfree: Attempting to insert not full bucket onto the full list.\n"));
 		if (zone->uz_bkt_count >= zone->uz_bkt_max) {
 			ZONE_UNLOCK(zone);
 			bucket_drain(zone, bucket);
 			bucket_free(zone, bucket, udata);
 			goto zfree_restart;
 		} else
 			zone_put_bucket(zone, zdom, bucket, true);
 	}
 
 	/*
 	 * We bump the uz count when the cache size is insufficient to
 	 * handle the working set.
 	 */
 	if (lockfail && zone->uz_count < zone->uz_count_max)
 		zone->uz_count++;
 	ZONE_UNLOCK(zone);
 
 	bucket = bucket_alloc(zone, udata, M_NOWAIT);
 	CTR3(KTR_UMA, "uma_zfree: zone %s(%p) allocated bucket %p",
 	    zone->uz_name, zone, bucket);
 	if (bucket) {
 		critical_enter();
 		cpu = curcpu;
 		cache = &zone->uz_cpu[cpu];
 		if (cache->uc_freebucket == NULL &&
 		    ((zone->uz_flags & UMA_ZONE_NUMA) == 0 ||
 		    domain == PCPU_GET(domain))) {
 			cache->uc_freebucket = bucket;
 			goto zfree_start;
 		}
 		/*
 		 * We lost the race, start over.  We have to drop our
 		 * critical section to free the bucket.
 		 */
 		critical_exit();
 		bucket_free(zone, bucket, udata);
 		goto zfree_restart;
 	}
 
 	/*
 	 * If nothing else caught this, we'll just do an internal free.
 	 */
 zfree_item:
 	zone_free_item(zone, item, udata, SKIP_DTOR);
 }
 
 void
 uma_zfree_domain(uma_zone_t zone, void *item, void *udata)
 {
 
 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
 
 	CTR2(KTR_UMA, "uma_zfree_domain thread %x zone %s", curthread,
 	    zone->uz_name);
 
 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
 	    ("uma_zfree_domain: called with spinlock or critical section held"));
 
         /* uma_zfree(..., NULL) does nothing, to match free(9). */
         if (item == NULL)
                 return;
 	zone_free_item(zone, item, udata, SKIP_NONE);
 }
 
 static void
 slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item)
 {
 	uma_keg_t keg;
 	uma_domain_t dom;
 	uint8_t freei;
 
 	keg = zone->uz_keg;
 	MPASS(zone->uz_lockptr == &keg->uk_lock);
 	KEG_LOCK_ASSERT(keg);
 	MPASS(keg == slab->us_keg);
 
 	dom = &keg->uk_domain[slab->us_domain];
 
 	/* Do we need to remove from any lists? */
 	if (slab->us_freecount+1 == keg->uk_ipers) {
 		LIST_REMOVE(slab, us_link);
 		LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link);
 	} else if (slab->us_freecount == 0) {
 		LIST_REMOVE(slab, us_link);
 		LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
 	}
 
 	/* Slab management. */
 	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
 	BIT_SET(SLAB_SETSIZE, freei, &slab->us_free);
 	slab->us_freecount++;
 
 	/* Keg statistics. */
 	keg->uk_free++;
 }
 
 static void
 zone_release(uma_zone_t zone, void **bucket, int cnt)
 {
 	void *item;
 	uma_slab_t slab;
 	uma_keg_t keg;
 	uint8_t *mem;
 	int i;
 
 	keg = zone->uz_keg;
 	KEG_LOCK(keg);
 	for (i = 0; i < cnt; i++) {
 		item = bucket[i];
 		if (!(zone->uz_flags & UMA_ZONE_VTOSLAB)) {
 			mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
 			if (zone->uz_flags & UMA_ZONE_HASH) {
 				slab = hash_sfind(&keg->uk_hash, mem);
 			} else {
 				mem += keg->uk_pgoff;
 				slab = (uma_slab_t)mem;
 			}
 		} else {
 			slab = vtoslab((vm_offset_t)item);
 			MPASS(slab->us_keg == keg);
 		}
 		slab_free_item(zone, slab, item);
 	}
 	KEG_UNLOCK(keg);
 }
 
 /*
  * Frees a single item to any zone.
  *
  * Arguments:
  *	zone   The zone to free to
  *	item   The item we're freeing
  *	udata  User supplied data for the dtor
  *	skip   Skip dtors and finis
  */
 static void
 zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip)
 {
 #ifdef INVARIANTS
 	bool skipdbg;
 
 	skipdbg = uma_dbg_zskip(zone, item);
 	if (skip == SKIP_NONE && !skipdbg) {
 		if (zone->uz_flags & UMA_ZONE_MALLOC)
 			uma_dbg_free(zone, udata, item);
 		else
 			uma_dbg_free(zone, NULL, item);
 	}
 
 	if (skip < SKIP_DTOR && zone->uz_dtor != NULL &&
 	    (!skipdbg || zone->uz_dtor != trash_dtor ||
 	    zone->uz_ctor != trash_ctor))
 #else
 	if (skip < SKIP_DTOR && zone->uz_dtor != NULL)
 #endif
 		zone->uz_dtor(item, zone->uz_size, udata);
 
 	if (skip < SKIP_FINI && zone->uz_fini)
 		zone->uz_fini(item, zone->uz_size);
 
 	zone->uz_release(zone->uz_arg, &item, 1);
 
 	if (skip & SKIP_CNT)
 		return;
 
 	counter_u64_add(zone->uz_frees, 1);
 
 	if (zone->uz_max_items > 0) {
 		ZONE_LOCK(zone);
 		zone->uz_items--;
 		if (zone->uz_sleepers > 0 &&
 		    zone->uz_items < zone->uz_max_items)
 			wakeup_one(zone);
 		ZONE_UNLOCK(zone);
 	}
 }
 
 /* See uma.h */
 int
 uma_zone_set_max(uma_zone_t zone, int nitems)
 {
 	struct uma_bucket_zone *ubz;
 
 	/*
 	 * If limit is very low we may need to limit how
 	 * much items are allowed in CPU caches.
 	 */
 	ubz = &bucket_zones[0];
 	for (; ubz->ubz_entries != 0; ubz++)
 		if (ubz->ubz_entries * 2 * mp_ncpus > nitems)
 			break;
 	if (ubz == &bucket_zones[0])
 		nitems = ubz->ubz_entries * 2 * mp_ncpus;
 	else
 		ubz--;
 
 	ZONE_LOCK(zone);
 	zone->uz_count_max = zone->uz_count = ubz->ubz_entries;
 	if (zone->uz_count_min > zone->uz_count_max)
 		zone->uz_count_min = zone->uz_count_max;
 	zone->uz_max_items = nitems;
 	ZONE_UNLOCK(zone);
 
 	return (nitems);
 }
 
 /* See uma.h */
 int
 uma_zone_set_maxcache(uma_zone_t zone, int nitems)
 {
 
 	ZONE_LOCK(zone);
 	zone->uz_bkt_max = nitems;
 	ZONE_UNLOCK(zone);
 
 	return (nitems);
 }
 
 /* See uma.h */
 int
 uma_zone_get_max(uma_zone_t zone)
 {
 	int nitems;
 
 	ZONE_LOCK(zone);
 	nitems = zone->uz_max_items;
 	ZONE_UNLOCK(zone);
 
 	return (nitems);
 }
 
 /* See uma.h */
 void
 uma_zone_set_warning(uma_zone_t zone, const char *warning)
 {
 
 	ZONE_LOCK(zone);
 	zone->uz_warning = warning;
 	ZONE_UNLOCK(zone);
 }
 
 /* See uma.h */
 void
 uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction)
 {
 
 	ZONE_LOCK(zone);
 	TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone);
 	ZONE_UNLOCK(zone);
 }
 
 /* See uma.h */
 int
 uma_zone_get_cur(uma_zone_t zone)
 {
 	int64_t nitems;
 	u_int i;
 
 	ZONE_LOCK(zone);
 	nitems = counter_u64_fetch(zone->uz_allocs) -
 	    counter_u64_fetch(zone->uz_frees);
 	CPU_FOREACH(i) {
 		/*
 		 * See the comment in uma_vm_zone_stats() regarding the
 		 * safety of accessing the per-cpu caches. With the zone lock
 		 * held, it is safe, but can potentially result in stale data.
 		 */
 		nitems += zone->uz_cpu[i].uc_allocs -
 		    zone->uz_cpu[i].uc_frees;
 	}
 	ZONE_UNLOCK(zone);
 
 	return (nitems < 0 ? 0 : nitems);
 }
 
 /* See uma.h */
 void
 uma_zone_set_init(uma_zone_t zone, uma_init uminit)
 {
 	uma_keg_t keg;
 
 	KEG_GET(zone, keg);
 	KEG_LOCK(keg);
 	KASSERT(keg->uk_pages == 0,
 	    ("uma_zone_set_init on non-empty keg"));
 	keg->uk_init = uminit;
 	KEG_UNLOCK(keg);
 }
 
 /* See uma.h */
 void
 uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
 {
 	uma_keg_t keg;
 
 	KEG_GET(zone, keg);
 	KEG_LOCK(keg);
 	KASSERT(keg->uk_pages == 0,
 	    ("uma_zone_set_fini on non-empty keg"));
 	keg->uk_fini = fini;
 	KEG_UNLOCK(keg);
 }
 
 /* See uma.h */
 void
 uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
 {
 
 	ZONE_LOCK(zone);
 	KASSERT(zone->uz_keg->uk_pages == 0,
 	    ("uma_zone_set_zinit on non-empty keg"));
 	zone->uz_init = zinit;
 	ZONE_UNLOCK(zone);
 }
 
 /* See uma.h */
 void
 uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
 {
 
 	ZONE_LOCK(zone);
 	KASSERT(zone->uz_keg->uk_pages == 0,
 	    ("uma_zone_set_zfini on non-empty keg"));
 	zone->uz_fini = zfini;
 	ZONE_UNLOCK(zone);
 }
 
 /* See uma.h */
 /* XXX uk_freef is not actually used with the zone locked */
 void
 uma_zone_set_freef(uma_zone_t zone, uma_free freef)
 {
 	uma_keg_t keg;
 
 	KEG_GET(zone, keg);
 	KASSERT(keg != NULL, ("uma_zone_set_freef: Invalid zone type"));
 	KEG_LOCK(keg);
 	keg->uk_freef = freef;
 	KEG_UNLOCK(keg);
 }
 
 /* See uma.h */
 /* XXX uk_allocf is not actually used with the zone locked */
 void
 uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
 {
 	uma_keg_t keg;
 
 	KEG_GET(zone, keg);
 	KEG_LOCK(keg);
 	keg->uk_allocf = allocf;
 	KEG_UNLOCK(keg);
 }
 
 /* See uma.h */
 void
 uma_zone_reserve(uma_zone_t zone, int items)
 {
 	uma_keg_t keg;
 
 	KEG_GET(zone, keg);
 	KEG_LOCK(keg);
 	keg->uk_reserve = items;
 	KEG_UNLOCK(keg);
 }
 
 /* See uma.h */
 int
 uma_zone_reserve_kva(uma_zone_t zone, int count)
 {
 	uma_keg_t keg;
 	vm_offset_t kva;
 	u_int pages;
 
 	KEG_GET(zone, keg);
 
 	pages = count / keg->uk_ipers;
 	if (pages * keg->uk_ipers < count)
 		pages++;
 	pages *= keg->uk_ppera;
 
 #ifdef UMA_MD_SMALL_ALLOC
 	if (keg->uk_ppera > 1) {
 #else
 	if (1) {
 #endif
 		kva = kva_alloc((vm_size_t)pages * PAGE_SIZE);
 		if (kva == 0)
 			return (0);
 	} else
 		kva = 0;
 
 	ZONE_LOCK(zone);
 	MPASS(keg->uk_kva == 0);
 	keg->uk_kva = kva;
 	keg->uk_offset = 0;
 	zone->uz_max_items = pages * keg->uk_ipers;
 #ifdef UMA_MD_SMALL_ALLOC
 	keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc;
 #else
 	keg->uk_allocf = noobj_alloc;
 #endif
 	keg->uk_flags |= UMA_ZONE_NOFREE;
 	ZONE_UNLOCK(zone);
 
 	return (1);
 }
 
 /* See uma.h */
 void
 uma_prealloc(uma_zone_t zone, int items)
 {
 	struct vm_domainset_iter di;
 	uma_domain_t dom;
 	uma_slab_t slab;
 	uma_keg_t keg;
 	int aflags, domain, slabs;
 
 	KEG_GET(zone, keg);
 	KEG_LOCK(keg);
 	slabs = items / keg->uk_ipers;
 	if (slabs * keg->uk_ipers < items)
 		slabs++;
 	while (slabs-- > 0) {
 		aflags = M_NOWAIT;
 		vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
 		    &aflags);
 		for (;;) {
 			slab = keg_alloc_slab(keg, zone, domain, M_WAITOK,
 			    aflags);
 			if (slab != NULL) {
 				MPASS(slab->us_keg == keg);
 				dom = &keg->uk_domain[slab->us_domain];
 				LIST_INSERT_HEAD(&dom->ud_free_slab, slab,
 				    us_link);
 				break;
 			}
 			KEG_LOCK(keg);
 			if (vm_domainset_iter_policy(&di, &domain) != 0) {
 				KEG_UNLOCK(keg);
 				vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask);
 				KEG_LOCK(keg);
 			}
 		}
 	}
 	KEG_UNLOCK(keg);
 }
 
 /* See uma.h */
 static void
 uma_reclaim_locked(bool kmem_danger)
 {
 
 	CTR0(KTR_UMA, "UMA: vm asked us to release pages!");
 	sx_assert(&uma_drain_lock, SA_XLOCKED);
 	bucket_enable();
 	zone_foreach(zone_drain);
 	if (vm_page_count_min() || kmem_danger) {
 		cache_drain_safe(NULL);
 		zone_foreach(zone_drain);
 	}
 
 	/*
 	 * Some slabs may have been freed but this zone will be visited early
 	 * we visit again so that we can free pages that are empty once other
 	 * zones are drained.  We have to do the same for buckets.
 	 */
 	zone_drain(slabzone);
 	bucket_zone_drain();
 }
 
 void
 uma_reclaim(void)
 {
 
 	sx_xlock(&uma_drain_lock);
 	uma_reclaim_locked(false);
 	sx_xunlock(&uma_drain_lock);
 }
 
 static volatile int uma_reclaim_needed;
 
 void
 uma_reclaim_wakeup(void)
 {
 
 	if (atomic_fetchadd_int(&uma_reclaim_needed, 1) == 0)
 		wakeup(uma_reclaim);
 }
 
 void
 uma_reclaim_worker(void *arg __unused)
 {
 
 	for (;;) {
 		sx_xlock(&uma_drain_lock);
 		while (atomic_load_int(&uma_reclaim_needed) == 0)
 			sx_sleep(uma_reclaim, &uma_drain_lock, PVM, "umarcl",
 			    hz);
 		sx_xunlock(&uma_drain_lock);
 		EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM);
 		sx_xlock(&uma_drain_lock);
 		uma_reclaim_locked(true);
 		atomic_store_int(&uma_reclaim_needed, 0);
 		sx_xunlock(&uma_drain_lock);
 		/* Don't fire more than once per-second. */
 		pause("umarclslp", hz);
 	}
 }
 
 /* See uma.h */
 int
 uma_zone_exhausted(uma_zone_t zone)
 {
 	int full;
 
 	ZONE_LOCK(zone);
 	full = zone->uz_sleepers > 0;
 	ZONE_UNLOCK(zone);
 	return (full);	
 }
 
 int
 uma_zone_exhausted_nolock(uma_zone_t zone)
 {
 	return (zone->uz_sleepers > 0);
 }
 
 void *
 uma_large_malloc_domain(vm_size_t size, int domain, int wait)
 {
 	struct domainset *policy;
 	vm_offset_t addr;
 	uma_slab_t slab;
 
 	if (domain != UMA_ANYDOMAIN) {
 		/* avoid allocs targeting empty domains */
 		if (VM_DOMAIN_EMPTY(domain))
 			domain = UMA_ANYDOMAIN;
 	}
 	slab = zone_alloc_item(slabzone, NULL, domain, wait);
 	if (slab == NULL)
 		return (NULL);
 	policy = (domain == UMA_ANYDOMAIN) ? DOMAINSET_RR() :
 	    DOMAINSET_FIXED(domain);
 	addr = kmem_malloc_domainset(policy, size, wait);
 	if (addr != 0) {
 		vsetslab(addr, slab);
 		slab->us_data = (void *)addr;
 		slab->us_flags = UMA_SLAB_KERNEL | UMA_SLAB_MALLOC;
 		slab->us_size = size;
 		slab->us_domain = vm_phys_domain(PHYS_TO_VM_PAGE(
 		    pmap_kextract(addr)));
 		uma_total_inc(size);
 	} else {
 		zone_free_item(slabzone, slab, NULL, SKIP_NONE);
 	}
 
 	return ((void *)addr);
 }
 
 void *
 uma_large_malloc(vm_size_t size, int wait)
 {
 
 	return uma_large_malloc_domain(size, UMA_ANYDOMAIN, wait);
 }
 
 void
 uma_large_free(uma_slab_t slab)
 {
 
 	KASSERT((slab->us_flags & UMA_SLAB_KERNEL) != 0,
 	    ("uma_large_free:  Memory not allocated with uma_large_malloc."));
 	kmem_free((vm_offset_t)slab->us_data, slab->us_size);
 	uma_total_dec(slab->us_size);
 	zone_free_item(slabzone, slab, NULL, SKIP_NONE);
 }
 
 static void
 uma_zero_item(void *item, uma_zone_t zone)
 {
 
 	bzero(item, zone->uz_size);
 }
 
 unsigned long
 uma_limit(void)
 {
 
 	return (uma_kmem_limit);
 }
 
 void
 uma_set_limit(unsigned long limit)
 {
 
 	uma_kmem_limit = limit;
 }
 
 unsigned long
 uma_size(void)
 {
 
 	return (atomic_load_long(&uma_kmem_total));
 }
 
 long
 uma_avail(void)
 {
 
 	return (uma_kmem_limit - uma_size());
 }
 
 void
 uma_print_stats(void)
 {
 	zone_foreach(uma_print_zone);
 }
 
 static void
 slab_print(uma_slab_t slab)
 {
 	printf("slab: keg %p, data %p, freecount %d\n",
 		slab->us_keg, slab->us_data, slab->us_freecount);
 }
 
 static void
 cache_print(uma_cache_t cache)
 {
-	printf("alloc: %p(%d), free: %p(%d)\n",
+	printf("alloc: %p(%d), free: %p(%d), cross: %p(%d)j\n",
 		cache->uc_allocbucket,
 		cache->uc_allocbucket?cache->uc_allocbucket->ub_cnt:0,
 		cache->uc_freebucket,
-		cache->uc_freebucket?cache->uc_freebucket->ub_cnt:0);
+		cache->uc_freebucket?cache->uc_freebucket->ub_cnt:0,
+		cache->uc_crossbucket,
+		cache->uc_crossbucket?cache->uc_crossbucket->ub_cnt:0);
 }
 
 static void
 uma_print_keg(uma_keg_t keg)
 {
 	uma_domain_t dom;
 	uma_slab_t slab;
 	int i;
 
 	printf("keg: %s(%p) size %d(%d) flags %#x ipers %d ppera %d "
 	    "out %d free %d\n",
 	    keg->uk_name, keg, keg->uk_size, keg->uk_rsize, keg->uk_flags,
 	    keg->uk_ipers, keg->uk_ppera,
 	    (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free,
 	    keg->uk_free);
 	for (i = 0; i < vm_ndomains; i++) {
 		dom = &keg->uk_domain[i];
 		printf("Part slabs:\n");
 		LIST_FOREACH(slab, &dom->ud_part_slab, us_link)
 			slab_print(slab);
 		printf("Free slabs:\n");
 		LIST_FOREACH(slab, &dom->ud_free_slab, us_link)
 			slab_print(slab);
 		printf("Full slabs:\n");
 		LIST_FOREACH(slab, &dom->ud_full_slab, us_link)
 			slab_print(slab);
 	}
 }
 
 void
 uma_print_zone(uma_zone_t zone)
 {
 	uma_cache_t cache;
 	int i;
 
 	printf("zone: %s(%p) size %d maxitems %ju flags %#x\n",
 	    zone->uz_name, zone, zone->uz_size, (uintmax_t)zone->uz_max_items,
 	    zone->uz_flags);
 	if (zone->uz_lockptr != &zone->uz_lock)
 		uma_print_keg(zone->uz_keg);
 	CPU_FOREACH(i) {
 		cache = &zone->uz_cpu[i];
 		printf("CPU %d Cache:\n", i);
 		cache_print(cache);
 	}
 }
 
 #ifdef DDB
 /*
  * Generate statistics across both the zone and its per-cpu cache's.  Return
  * desired statistics if the pointer is non-NULL for that statistic.
  *
  * Note: does not update the zone statistics, as it can't safely clear the
  * per-CPU cache statistic.
  *
  * XXXRW: Following the uc_allocbucket and uc_freebucket pointers here isn't
  * safe from off-CPU; we should modify the caches to track this information
  * directly so that we don't have to.
  */
 static void
 uma_zone_sumstat(uma_zone_t z, long *cachefreep, uint64_t *allocsp,
-    uint64_t *freesp, uint64_t *sleepsp)
+    uint64_t *freesp, uint64_t *sleepsp, uint64_t *xdomainp)
 {
 	uma_cache_t cache;
-	uint64_t allocs, frees, sleeps;
+	uint64_t allocs, frees, sleeps, xdomain;
 	int cachefree, cpu;
 
-	allocs = frees = sleeps = 0;
+	allocs = frees = sleeps = xdomain = 0;
 	cachefree = 0;
 	CPU_FOREACH(cpu) {
 		cache = &z->uz_cpu[cpu];
 		if (cache->uc_allocbucket != NULL)
 			cachefree += cache->uc_allocbucket->ub_cnt;
 		if (cache->uc_freebucket != NULL)
 			cachefree += cache->uc_freebucket->ub_cnt;
+		if (cache->uc_crossbucket != NULL) {
+			xdomain += cache->uc_crossbucket->ub_cnt;
+			cachefree += cache->uc_crossbucket->ub_cnt;
+		}
 		allocs += cache->uc_allocs;
 		frees += cache->uc_frees;
 	}
 	allocs += counter_u64_fetch(z->uz_allocs);
 	frees += counter_u64_fetch(z->uz_frees);
 	sleeps += z->uz_sleeps;
+	xdomain += z->uz_xdomain;
 	if (cachefreep != NULL)
 		*cachefreep = cachefree;
 	if (allocsp != NULL)
 		*allocsp = allocs;
 	if (freesp != NULL)
 		*freesp = frees;
 	if (sleepsp != NULL)
 		*sleepsp = sleeps;
+	if (xdomainp != NULL)
+		*xdomainp = xdomain;
 }
 #endif /* DDB */
 
 static int
 sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
 {
 	uma_keg_t kz;
 	uma_zone_t z;
 	int count;
 
 	count = 0;
 	rw_rlock(&uma_rwlock);
 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
 		LIST_FOREACH(z, &kz->uk_zones, uz_link)
 			count++;
 	}
 	LIST_FOREACH(z, &uma_cachezones, uz_link)
 		count++;
 
 	rw_runlock(&uma_rwlock);
 	return (sysctl_handle_int(oidp, &count, 0, req));
 }
 
 static void
 uma_vm_zone_stats(struct uma_type_header *uth, uma_zone_t z, struct sbuf *sbuf,
     struct uma_percpu_stat *ups, bool internal)
 {
 	uma_zone_domain_t zdom;
 	uma_cache_t cache;
 	int i;
 
 
 	for (i = 0; i < vm_ndomains; i++) {
 		zdom = &z->uz_domain[i];
 		uth->uth_zone_free += zdom->uzd_nitems;
 	}
 	uth->uth_allocs = counter_u64_fetch(z->uz_allocs);
 	uth->uth_frees = counter_u64_fetch(z->uz_frees);
 	uth->uth_fails = counter_u64_fetch(z->uz_fails);
 	uth->uth_sleeps = z->uz_sleeps;
+	uth->uth_xdomain = z->uz_xdomain;
 	/*
 	 * While it is not normally safe to access the cache
 	 * bucket pointers while not on the CPU that owns the
 	 * cache, we only allow the pointers to be exchanged
 	 * without the zone lock held, not invalidated, so
 	 * accept the possible race associated with bucket
 	 * exchange during monitoring.
 	 */
 	for (i = 0; i < mp_maxid + 1; i++) {
 		bzero(&ups[i], sizeof(*ups));
 		if (internal || CPU_ABSENT(i))
 			continue;
 		cache = &z->uz_cpu[i];
 		if (cache->uc_allocbucket != NULL)
 			ups[i].ups_cache_free +=
 			    cache->uc_allocbucket->ub_cnt;
 		if (cache->uc_freebucket != NULL)
 			ups[i].ups_cache_free +=
 			    cache->uc_freebucket->ub_cnt;
+		if (cache->uc_crossbucket != NULL)
+			ups[i].ups_cache_free +=
+			    cache->uc_crossbucket->ub_cnt;
 		ups[i].ups_allocs = cache->uc_allocs;
 		ups[i].ups_frees = cache->uc_frees;
 	}
 }
 
 static int
 sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct uma_stream_header ush;
 	struct uma_type_header uth;
 	struct uma_percpu_stat *ups;
 	struct sbuf sbuf;
 	uma_keg_t kz;
 	uma_zone_t z;
 	int count, error, i;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 	sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL);
 	ups = malloc((mp_maxid + 1) * sizeof(*ups), M_TEMP, M_WAITOK);
 
 	count = 0;
 	rw_rlock(&uma_rwlock);
 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
 		LIST_FOREACH(z, &kz->uk_zones, uz_link)
 			count++;
 	}
 
 	LIST_FOREACH(z, &uma_cachezones, uz_link)
 		count++;
 
 	/*
 	 * Insert stream header.
 	 */
 	bzero(&ush, sizeof(ush));
 	ush.ush_version = UMA_STREAM_VERSION;
 	ush.ush_maxcpus = (mp_maxid + 1);
 	ush.ush_count = count;
 	(void)sbuf_bcat(&sbuf, &ush, sizeof(ush));
 
 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
 		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
 			bzero(&uth, sizeof(uth));
 			ZONE_LOCK(z);
 			strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
 			uth.uth_align = kz->uk_align;
 			uth.uth_size = kz->uk_size;
 			uth.uth_rsize = kz->uk_rsize;
 			if (z->uz_max_items > 0)
 				uth.uth_pages = (z->uz_items / kz->uk_ipers) *
 					kz->uk_ppera;
 			else
 				uth.uth_pages = kz->uk_pages;
 			uth.uth_maxpages = (z->uz_max_items / kz->uk_ipers) *
 			    kz->uk_ppera;
 			uth.uth_limit = z->uz_max_items;
 			uth.uth_keg_free = z->uz_keg->uk_free;
 
 			/*
 			 * A zone is secondary is it is not the first entry
 			 * on the keg's zone list.
 			 */
 			if ((z->uz_flags & UMA_ZONE_SECONDARY) &&
 			    (LIST_FIRST(&kz->uk_zones) != z))
 				uth.uth_zone_flags = UTH_ZONE_SECONDARY;
 			uma_vm_zone_stats(&uth, z, &sbuf, ups,
 			    kz->uk_flags & UMA_ZFLAG_INTERNAL);
 			ZONE_UNLOCK(z);
 			(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
 			for (i = 0; i < mp_maxid + 1; i++)
 				(void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
 		}
 	}
 	LIST_FOREACH(z, &uma_cachezones, uz_link) {
 		bzero(&uth, sizeof(uth));
 		ZONE_LOCK(z);
 		strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
 		uth.uth_size = z->uz_size;
 		uma_vm_zone_stats(&uth, z, &sbuf, ups, false);
 		ZONE_UNLOCK(z);
 		(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
 		for (i = 0; i < mp_maxid + 1; i++)
 			(void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
 	}
 
 	rw_runlock(&uma_rwlock);
 	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
 	free(ups, M_TEMP);
 	return (error);
 }
 
 int
 sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS)
 {
 	uma_zone_t zone = *(uma_zone_t *)arg1;
 	int error, max;
 
 	max = uma_zone_get_max(zone);
 	error = sysctl_handle_int(oidp, &max, 0, req);
 	if (error || !req->newptr)
 		return (error);
 
 	uma_zone_set_max(zone, max);
 
 	return (0);
 }
 
 int
 sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS)
 {
 	uma_zone_t zone = *(uma_zone_t *)arg1;
 	int cur;
 
 	cur = uma_zone_get_cur(zone);
 	return (sysctl_handle_int(oidp, &cur, 0, req));
 }
 
 #ifdef INVARIANTS
 static uma_slab_t
 uma_dbg_getslab(uma_zone_t zone, void *item)
 {
 	uma_slab_t slab;
 	uma_keg_t keg;
 	uint8_t *mem;
 
 	mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
 	if (zone->uz_flags & UMA_ZONE_VTOSLAB) {
 		slab = vtoslab((vm_offset_t)mem);
 	} else {
 		/*
 		 * It is safe to return the slab here even though the
 		 * zone is unlocked because the item's allocation state
 		 * essentially holds a reference.
 		 */
 		if (zone->uz_lockptr == &zone->uz_lock)
 			return (NULL);
 		ZONE_LOCK(zone);
 		keg = zone->uz_keg;
 		if (keg->uk_flags & UMA_ZONE_HASH)
 			slab = hash_sfind(&keg->uk_hash, mem);
 		else
 			slab = (uma_slab_t)(mem + keg->uk_pgoff);
 		ZONE_UNLOCK(zone);
 	}
 
 	return (slab);
 }
 
 static bool
 uma_dbg_zskip(uma_zone_t zone, void *mem)
 {
 
 	if (zone->uz_lockptr == &zone->uz_lock)
 		return (true);
 
 	return (uma_dbg_kskip(zone->uz_keg, mem));
 }
 
 static bool
 uma_dbg_kskip(uma_keg_t keg, void *mem)
 {
 	uintptr_t idx;
 
 	if (dbg_divisor == 0)
 		return (true);
 
 	if (dbg_divisor == 1)
 		return (false);
 
 	idx = (uintptr_t)mem >> PAGE_SHIFT;
 	if (keg->uk_ipers > 1) {
 		idx *= keg->uk_ipers;
 		idx += ((uintptr_t)mem & PAGE_MASK) / keg->uk_rsize;
 	}
 
 	if ((idx / dbg_divisor) * dbg_divisor != idx) {
 		counter_u64_add(uma_skip_cnt, 1);
 		return (true);
 	}
 	counter_u64_add(uma_dbg_cnt, 1);
 
 	return (false);
 }
 
 /*
  * Set up the slab's freei data such that uma_dbg_free can function.
  *
  */
 static void
 uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
 {
 	uma_keg_t keg;
 	int freei;
 
 	if (slab == NULL) {
 		slab = uma_dbg_getslab(zone, item);
 		if (slab == NULL) 
 			panic("uma: item %p did not belong to zone %s\n",
 			    item, zone->uz_name);
 	}
 	keg = slab->us_keg;
 	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
 
 	if (BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
 		panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n",
 		    item, zone, zone->uz_name, slab, freei);
 	BIT_SET_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
 
 	return;
 }
 
 /*
  * Verifies freed addresses.  Checks for alignment, valid slab membership
  * and duplicate frees.
  *
  */
 static void
 uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
 {
 	uma_keg_t keg;
 	int freei;
 
 	if (slab == NULL) {
 		slab = uma_dbg_getslab(zone, item);
 		if (slab == NULL) 
 			panic("uma: Freed item %p did not belong to zone %s\n",
 			    item, zone->uz_name);
 	}
 	keg = slab->us_keg;
 	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
 
 	if (freei >= keg->uk_ipers)
 		panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n",
 		    item, zone, zone->uz_name, slab, freei);
 
 	if (((freei * keg->uk_rsize) + slab->us_data) != item) 
 		panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n",
 		    item, zone, zone->uz_name, slab, freei);
 
 	if (!BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
 		panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n",
 		    item, zone, zone->uz_name, slab, freei);
 
 	BIT_CLR_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
 }
 #endif /* INVARIANTS */
 
 #ifdef DDB
 DB_SHOW_COMMAND(uma, db_show_uma)
 {
 	uma_keg_t kz;
 	uma_zone_t z;
-	uint64_t allocs, frees, sleeps;
+	uint64_t allocs, frees, sleeps, xdomain;
 	long cachefree;
 	int i;
 
-	db_printf("%18s %8s %8s %8s %12s %8s %8s\n", "Zone", "Size", "Used",
-	    "Free", "Requests", "Sleeps", "Bucket");
+	db_printf("%18s %8s %8s %8s %12s %8s %8s %8s\n", "Zone", "Size", "Used",
+	    "Free", "Requests", "Sleeps", "Bucket", "XFree");
 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
 		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
 			if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
 				allocs = counter_u64_fetch(z->uz_allocs);
 				frees = counter_u64_fetch(z->uz_frees);
 				sleeps = z->uz_sleeps;
 				cachefree = 0;
 			} else
 				uma_zone_sumstat(z, &cachefree, &allocs,
-				    &frees, &sleeps);
+				    &frees, &sleeps, &xdomain);
 			if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
 			    (LIST_FIRST(&kz->uk_zones) != z)))
 				cachefree += kz->uk_free;
 			for (i = 0; i < vm_ndomains; i++)
 				cachefree += z->uz_domain[i].uzd_nitems;
 
-			db_printf("%18s %8ju %8jd %8ld %12ju %8ju %8u\n",
+			db_printf("%18s %8ju %8jd %8ld %12ju %8ju %8u %8ju\n",
 			    z->uz_name, (uintmax_t)kz->uk_size,
 			    (intmax_t)(allocs - frees), cachefree,
-			    (uintmax_t)allocs, sleeps, z->uz_count);
+			    (uintmax_t)allocs, sleeps, z->uz_count, xdomain);
 			if (db_pager_quit)
 				return;
 		}
 	}
 }
 
 DB_SHOW_COMMAND(umacache, db_show_umacache)
 {
 	uma_zone_t z;
 	uint64_t allocs, frees;
 	long cachefree;
 	int i;
 
 	db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
 	    "Requests", "Bucket");
 	LIST_FOREACH(z, &uma_cachezones, uz_link) {
-		uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL);
+		uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL, NULL);
 		for (i = 0; i < vm_ndomains; i++)
 			cachefree += z->uz_domain[i].uzd_nitems;
 		db_printf("%18s %8ju %8jd %8ld %12ju %8u\n",
 		    z->uz_name, (uintmax_t)z->uz_size,
 		    (intmax_t)(allocs - frees), cachefree,
 		    (uintmax_t)allocs, z->uz_count);
 		if (db_pager_quit)
 			return;
 	}
 }
 #endif	/* DDB */
Index: head/sys/vm/uma_int.h
===================================================================
--- head/sys/vm/uma_int.h	(revision 350658)
+++ head/sys/vm/uma_int.h	(revision 350659)
@@ -1,498 +1,500 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2002-2005, 2009, 2013 Jeffrey Roberson <jeff@FreeBSD.org>
  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  *
  */
 
 #include <sys/counter.h>
 #include <sys/_bitset.h>
 #include <sys/_domainset.h>
 #include <sys/_task.h>
 
 /* 
  * This file includes definitions, structures, prototypes, and inlines that
  * should not be used outside of the actual implementation of UMA.
  */
 
 /* 
  * The brief summary;  Zones describe unique allocation types.  Zones are
  * organized into per-CPU caches which are filled by buckets.  Buckets are
  * organized according to memory domains.  Buckets are filled from kegs which
  * are also organized according to memory domains.  Kegs describe a unique
  * allocation type, backend memory provider, and layout.  Kegs are associated
  * with one or more zones and zones reference one or more kegs.  Kegs provide
  * slabs which are virtually contiguous collections of pages.  Each slab is
  * broken down int one or more items that will satisfy an individual allocation.
  *
  * Allocation is satisfied in the following order:
  * 1) Per-CPU cache
  * 2) Per-domain cache of buckets
  * 3) Slab from any of N kegs
  * 4) Backend page provider
  *
  * More detail on individual objects is contained below:
  *
  * Kegs contain lists of slabs which are stored in either the full bin, empty
  * bin, or partially allocated bin, to reduce fragmentation.  They also contain
  * the user supplied value for size, which is adjusted for alignment purposes
  * and rsize is the result of that.  The Keg also stores information for
  * managing a hash of page addresses that maps pages to uma_slab_t structures
  * for pages that don't have embedded uma_slab_t's.
  *
  * Keg slab lists are organized by memory domain to support NUMA allocation
  * policies.  By default allocations are spread across domains to reduce the
  * potential for hotspots.  Special keg creation flags may be specified to
  * prefer location allocation.  However there is no strict enforcement as frees
  * may happen on any CPU and these are returned to the CPU-local cache
  * regardless of the originating domain.
  *  
  * The uma_slab_t may be embedded in a UMA_SLAB_SIZE chunk of memory or it may
  * be allocated off the page from a special slab zone.  The free list within a
  * slab is managed with a bitmask.  For item sizes that would yield more than
  * 10% memory waste we potentially allocate a separate uma_slab_t if this will
  * improve the number of items per slab that will fit.  
  *
  * The only really gross cases, with regards to memory waste, are for those
  * items that are just over half the page size.   You can get nearly 50% waste,
  * so you fall back to the memory footprint of the power of two allocator. I
  * have looked at memory allocation sizes on many of the machines available to
  * me, and there does not seem to be an abundance of allocations at this range
  * so at this time it may not make sense to optimize for it.  This can, of 
  * course, be solved with dynamic slab sizes.
  *
  * Kegs may serve multiple Zones but by far most of the time they only serve
  * one.  When a Zone is created, a Keg is allocated and setup for it.  While
  * the backing Keg stores slabs, the Zone caches Buckets of items allocated
  * from the slabs.  Each Zone is equipped with an init/fini and ctor/dtor
  * pair, as well as with its own set of small per-CPU caches, layered above
  * the Zone's general Bucket cache.
  *
  * The PCPU caches are protected by critical sections, and may be accessed
  * safely only from their associated CPU, while the Zones backed by the same
  * Keg all share a common Keg lock (to coalesce contention on the backing
  * slabs).  The backing Keg typically only serves one Zone but in the case of
  * multiple Zones, one of the Zones is considered the Master Zone and all
  * Zone-related stats from the Keg are done in the Master Zone.  For an
  * example of a Multi-Zone setup, refer to the Mbuf allocation code.
  */
 
 /*
  *	This is the representation for normal (Non OFFPAGE slab)
  *
  *	i == item
  *	s == slab pointer
  *
  *	<----------------  Page (UMA_SLAB_SIZE) ------------------>
  *	___________________________________________________________
  *     | _  _  _  _  _  _  _  _  _  _  _  _  _  _  _   ___________ |
  *     ||i||i||i||i||i||i||i||i||i||i||i||i||i||i||i| |slab header||
  *     ||_||_||_||_||_||_||_||_||_||_||_||_||_||_||_| |___________|| 
  *     |___________________________________________________________|
  *
  *
  *	This is an OFFPAGE slab. These can be larger than UMA_SLAB_SIZE.
  *
  *	___________________________________________________________
  *     | _  _  _  _  _  _  _  _  _  _  _  _  _  _  _  _  _  _  _   |
  *     ||i||i||i||i||i||i||i||i||i||i||i||i||i||i||i||i||i||i||i|  |
  *     ||_||_||_||_||_||_||_||_||_||_||_||_||_||_||_||_||_||_||_|  |
  *     |___________________________________________________________|
  *       ___________    ^
  *	|slab header|   |
  *	|___________|---*
  *
  */
 
 #ifndef VM_UMA_INT_H
 #define VM_UMA_INT_H
 
 #define UMA_SLAB_SIZE	PAGE_SIZE	/* How big are our slabs? */
 #define UMA_SLAB_MASK	(PAGE_SIZE - 1)	/* Mask to get back to the page */
 #define UMA_SLAB_SHIFT	PAGE_SHIFT	/* Number of bits PAGE_MASK */
 
 /* Max waste percentage before going to off page slab management */
 #define UMA_MAX_WASTE	10
 
 /*
  * Actual size of uma_slab when it is placed at an end of a page
  * with pointer sized alignment requirement.
  */
 #define	SIZEOF_UMA_SLAB	((sizeof(struct uma_slab) & UMA_ALIGN_PTR) ?	  \
 			    (sizeof(struct uma_slab) & ~UMA_ALIGN_PTR) +  \
 			    (UMA_ALIGN_PTR + 1) : sizeof(struct uma_slab))
 
 /*
  * Size of memory in a not offpage single page slab available for actual items.
  */
 #define	UMA_SLAB_SPACE	(PAGE_SIZE - SIZEOF_UMA_SLAB)
 
 /*
  * I doubt there will be many cases where this is exceeded. This is the initial
  * size of the hash table for uma_slabs that are managed off page. This hash
  * does expand by powers of two.  Currently it doesn't get smaller.
  */
 #define UMA_HASH_SIZE_INIT	32		
 
 /* 
  * I should investigate other hashing algorithms.  This should yield a low
  * number of collisions if the pages are relatively contiguous.
  */
 
 #define UMA_HASH(h, s) ((((uintptr_t)s) >> UMA_SLAB_SHIFT) & (h)->uh_hashmask)
 
 #define UMA_HASH_INSERT(h, s, mem)					\
 		SLIST_INSERT_HEAD(&(h)->uh_slab_hash[UMA_HASH((h),	\
 		    (mem))], (s), us_hlink)
 #define UMA_HASH_REMOVE(h, s, mem)					\
 		SLIST_REMOVE(&(h)->uh_slab_hash[UMA_HASH((h),		\
 		    (mem))], (s), uma_slab, us_hlink)
 
 /* Hash table for freed address -> slab translation */
 
 SLIST_HEAD(slabhead, uma_slab);
 
 struct uma_hash {
 	struct slabhead	*uh_slab_hash;	/* Hash table for slabs */
 	u_int		uh_hashsize;	/* Current size of the hash table */
 	u_int		uh_hashmask;	/* Mask used during hashing */
 };
 
 /*
  * align field or structure to cache line
  */
 #if defined(__amd64__) || defined(__powerpc64__)
 #define UMA_ALIGN	__aligned(128)
 #else
 #define UMA_ALIGN
 #endif
 
 /*
  * Structures for per cpu queues.
  */
 
 struct uma_bucket {
 	LIST_ENTRY(uma_bucket)	ub_link;	/* Link into the zone */
 	int16_t	ub_cnt;				/* Count of items in bucket. */
 	int16_t	ub_entries;			/* Max items. */
 	void	*ub_bucket[];			/* actual allocation storage */
 };
 
 typedef struct uma_bucket * uma_bucket_t;
 
 struct uma_cache {
 	uma_bucket_t	uc_freebucket;	/* Bucket we're freeing to */
 	uma_bucket_t	uc_allocbucket;	/* Bucket to allocate from */
+	uma_bucket_t	uc_crossbucket;	/* cross domain bucket */
 	uint64_t	uc_allocs;	/* Count of allocations */
 	uint64_t	uc_frees;	/* Count of frees */
 } UMA_ALIGN;
 
 typedef struct uma_cache * uma_cache_t;
 
 /*
  * Per-domain memory list.  Embedded in the kegs.
  */
 struct uma_domain {
 	LIST_HEAD(,uma_slab)	ud_part_slab;	/* partially allocated slabs */
 	LIST_HEAD(,uma_slab)	ud_free_slab;	/* empty slab list */
 	LIST_HEAD(,uma_slab)	ud_full_slab;	/* full slabs */
 };
 
 typedef struct uma_domain * uma_domain_t;
 
 /*
  * Keg management structure
  *
  * TODO: Optimize for cache line size
  *
  */
 struct uma_keg {
 	struct mtx	uk_lock;	/* Lock for the keg must be first.
 					 * See shared uz_keg/uz_lockptr
 					 * member of struct uma_zone. */
 	struct uma_hash	uk_hash;
 	LIST_HEAD(,uma_zone)	uk_zones;	/* Keg's zones */
 
 	struct domainset_ref uk_dr;	/* Domain selection policy. */
 	uint32_t	uk_align;	/* Alignment mask */
 	uint32_t	uk_pages;	/* Total page count */
 	uint32_t	uk_free;	/* Count of items free in slabs */
 	uint32_t	uk_reserve;	/* Number of reserved items. */
 	uint32_t	uk_size;	/* Requested size of each item */
 	uint32_t	uk_rsize;	/* Real size of each item */
 
 	uma_init	uk_init;	/* Keg's init routine */
 	uma_fini	uk_fini;	/* Keg's fini routine */
 	uma_alloc	uk_allocf;	/* Allocation function */
 	uma_free	uk_freef;	/* Free routine */
 
 	u_long		uk_offset;	/* Next free offset from base KVA */
 	vm_offset_t	uk_kva;		/* Zone base KVA */
 	uma_zone_t	uk_slabzone;	/* Slab zone backing us, if OFFPAGE */
 
 	uint32_t	uk_pgoff;	/* Offset to uma_slab struct */
 	uint16_t	uk_ppera;	/* pages per allocation from backend */
 	uint16_t	uk_ipers;	/* Items per slab */
 	uint32_t	uk_flags;	/* Internal flags */
 
 	/* Least used fields go to the last cache line. */
 	const char	*uk_name;		/* Name of creating zone. */
 	LIST_ENTRY(uma_keg)	uk_link;	/* List of all kegs */
 
 	/* Must be last, variable sized. */
 	struct uma_domain	uk_domain[];	/* Keg's slab lists. */
 };
 typedef struct uma_keg	* uma_keg_t;
 
 /*
  * Free bits per-slab.
  */
 #define	SLAB_SETSIZE	(PAGE_SIZE / UMA_SMALLEST_UNIT)
 BITSET_DEFINE(slabbits, SLAB_SETSIZE);
 
 /*
  * The slab structure manages a single contiguous allocation from backing
  * store and subdivides it into individually allocatable items.
  */
 struct uma_slab {
 	uma_keg_t	us_keg;			/* Keg we live in */
 	union {
 		LIST_ENTRY(uma_slab)	_us_link;	/* slabs in zone */
 		unsigned long	_us_size;	/* Size of allocation */
 	} us_type;
 	SLIST_ENTRY(uma_slab)	us_hlink;	/* Link for hash table */
 	uint8_t		*us_data;		/* First item */
 	struct slabbits	us_free;		/* Free bitmask. */
 #ifdef INVARIANTS
 	struct slabbits	us_debugfree;		/* Debug bitmask. */
 #endif
 	uint16_t	us_freecount;		/* How many are free? */
 	uint8_t		us_flags;		/* Page flags see uma.h */
 	uint8_t		us_domain;		/* Backing NUMA domain. */
 };
 
 #define	us_link	us_type._us_link
 #define	us_size	us_type._us_size
 
 #if MAXMEMDOM >= 255
 #error "Slab domain type insufficient"
 #endif
 
 typedef struct uma_slab * uma_slab_t;
 
 struct uma_zone_domain {
 	LIST_HEAD(,uma_bucket)	uzd_buckets;	/* full buckets */
 	long		uzd_nitems;	/* total item count */
 	long		uzd_imax;	/* maximum item count this period */
 	long		uzd_imin;	/* minimum item count this period */
 	long		uzd_wss;	/* working set size estimate */
 };
 
 typedef struct uma_zone_domain * uma_zone_domain_t;
 
 /*
  * Zone management structure 
  *
  * TODO: Optimize for cache line size
  *
  */
 struct uma_zone {
 	/* Offset 0, used in alloc/free fast/medium fast path and const. */
 	union {
 		uma_keg_t	uz_keg;		/* This zone's keg */
 		struct mtx 	*uz_lockptr;	/* To keg or to self */
 	};
 	struct uma_zone_domain	*uz_domain;	/* per-domain buckets */
 	uint32_t	uz_flags;	/* Flags inherited from kegs */
 	uint32_t	uz_size;	/* Size inherited from kegs */
 	uma_ctor	uz_ctor;	/* Constructor for each allocation */
 	uma_dtor	uz_dtor;	/* Destructor */
 	uint64_t	uz_items;	/* Total items count */
 	uint64_t	uz_max_items;	/* Maximum number of items to alloc */
 	uint32_t	uz_sleepers;	/* Number of sleepers on memory */
 	uint16_t	uz_count;	/* Amount of items in full bucket */
 	uint16_t	uz_count_max;	/* Maximum amount of items there */
 
 	/* Offset 64, used in bucket replenish. */
 	uma_import	uz_import;	/* Import new memory to cache. */
 	uma_release	uz_release;	/* Release memory from cache. */
 	void		*uz_arg;	/* Import/release argument. */
 	uma_init	uz_init;	/* Initializer for each item */
 	uma_fini	uz_fini;	/* Finalizer for each item. */
 	void		*uz_spare;
 	uint64_t	uz_bkt_count;    /* Items in bucket cache */
 	uint64_t	uz_bkt_max;	/* Maximum bucket cache size */
 
 	/* Offset 128 Rare. */
 	/*
 	 * The lock is placed here to avoid adjacent line prefetcher
 	 * in fast paths and to take up space near infrequently accessed
 	 * members to reduce alignment overhead.
 	 */
 	struct mtx	uz_lock;	/* Lock for the zone */
 	LIST_ENTRY(uma_zone) uz_link;	/* List of all zones in keg */
 	const char	*uz_name;	/* Text name of the zone */
 	/* The next two fields are used to print a rate-limited warnings. */
 	const char	*uz_warning;	/* Warning to print on failure */
 	struct timeval	uz_ratecheck;	/* Warnings rate-limiting */
 	struct task	uz_maxaction;	/* Task to run when at limit */
 	uint16_t	uz_count_min;	/* Minimal amount of items in bucket */
 
 	/* Offset 256, stats. */
 	counter_u64_t	uz_allocs;	/* Total number of allocations */
 	counter_u64_t	uz_frees;	/* Total number of frees */
 	counter_u64_t	uz_fails;	/* Total number of alloc failures */
 	uint64_t	uz_sleeps;	/* Total number of alloc sleeps */
+	uint64_t	uz_xdomain;	/* Total number of cross-domain frees */
 
 	/*
 	 * This HAS to be the last item because we adjust the zone size
 	 * based on NCPU and then allocate the space for the zones.
 	 */
 	struct uma_cache	uz_cpu[]; /* Per cpu caches */
 
 	/* uz_domain follows here. */
 };
 
 /*
  * These flags must not overlap with the UMA_ZONE flags specified in uma.h.
  */
 #define	UMA_ZFLAG_CACHE		0x04000000	/* uma_zcache_create()d it */
 #define	UMA_ZFLAG_DRAINING	0x08000000	/* Running zone_drain. */
 #define	UMA_ZFLAG_BUCKET	0x10000000	/* Bucket zone. */
 #define UMA_ZFLAG_INTERNAL	0x20000000	/* No offpage no PCPU. */
 #define UMA_ZFLAG_CACHEONLY	0x80000000	/* Don't ask VM for buckets. */
 
 #define	UMA_ZFLAG_INHERIT						\
     (UMA_ZFLAG_INTERNAL | UMA_ZFLAG_CACHEONLY | UMA_ZFLAG_BUCKET)
 
 #undef UMA_ALIGN
 
 #ifdef _KERNEL
 /* Internal prototypes */
 static __inline uma_slab_t hash_sfind(struct uma_hash *hash, uint8_t *data);
 void *uma_large_malloc(vm_size_t size, int wait);
 void *uma_large_malloc_domain(vm_size_t size, int domain, int wait);
 void uma_large_free(uma_slab_t slab);
 
 /* Lock Macros */
 
 #define	KEG_LOCK_INIT(k, lc)					\
 	do {							\
 		if ((lc))					\
 			mtx_init(&(k)->uk_lock, (k)->uk_name,	\
 			    (k)->uk_name, MTX_DEF | MTX_DUPOK);	\
 		else						\
 			mtx_init(&(k)->uk_lock, (k)->uk_name,	\
 			    "UMA zone", MTX_DEF | MTX_DUPOK);	\
 	} while (0)
 
 #define	KEG_LOCK_FINI(k)	mtx_destroy(&(k)->uk_lock)
 #define	KEG_LOCK(k)	mtx_lock(&(k)->uk_lock)
 #define	KEG_UNLOCK(k)	mtx_unlock(&(k)->uk_lock)
 #define	KEG_LOCK_ASSERT(k)	mtx_assert(&(k)->uk_lock, MA_OWNED)
 
 #define	KEG_GET(zone, keg) do {					\
 	(keg) = (zone)->uz_keg;					\
 	KASSERT((void *)(keg) != (void *)&(zone)->uz_lock,	\
 	    ("%s: Invalid zone %p type", __func__, (zone)));	\
 	} while (0)
 
 #define	ZONE_LOCK_INIT(z, lc)					\
 	do {							\
 		if ((lc))					\
 			mtx_init(&(z)->uz_lock, (z)->uz_name,	\
 			    (z)->uz_name, MTX_DEF | MTX_DUPOK);	\
 		else						\
 			mtx_init(&(z)->uz_lock, (z)->uz_name,	\
 			    "UMA zone", MTX_DEF | MTX_DUPOK);	\
 	} while (0)
 
 #define	ZONE_LOCK(z)	mtx_lock((z)->uz_lockptr)
 #define	ZONE_TRYLOCK(z)	mtx_trylock((z)->uz_lockptr)
 #define	ZONE_UNLOCK(z)	mtx_unlock((z)->uz_lockptr)
 #define	ZONE_LOCK_FINI(z)	mtx_destroy(&(z)->uz_lock)
 #define	ZONE_LOCK_ASSERT(z)	mtx_assert((z)->uz_lockptr, MA_OWNED)
 
 /*
  * Find a slab within a hash table.  This is used for OFFPAGE zones to lookup
  * the slab structure.
  *
  * Arguments:
  *	hash  The hash table to search.
  *	data  The base page of the item.
  *
  * Returns:
  *	A pointer to a slab if successful, else NULL.
  */
 static __inline uma_slab_t
 hash_sfind(struct uma_hash *hash, uint8_t *data)
 {
         uma_slab_t slab;
         u_int hval;
 
         hval = UMA_HASH(hash, data);
 
         SLIST_FOREACH(slab, &hash->uh_slab_hash[hval], us_hlink) {
                 if ((uint8_t *)slab->us_data == data)
                         return (slab);
         }
         return (NULL);
 }
 
 static __inline uma_slab_t
 vtoslab(vm_offset_t va)
 {
 	vm_page_t p;
 
 	p = PHYS_TO_VM_PAGE(pmap_kextract(va));
 	return ((uma_slab_t)p->plinks.s.pv);
 }
 
 static __inline void
 vsetslab(vm_offset_t va, uma_slab_t slab)
 {
 	vm_page_t p;
 
 	p = PHYS_TO_VM_PAGE(pmap_kextract(va));
 	p->plinks.s.pv = slab;
 }
 
 /*
  * The following two functions may be defined by architecture specific code
  * if they can provide more efficient allocation functions.  This is useful
  * for using direct mapped addresses.
  */
 void *uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain,
     uint8_t *pflag, int wait);
 void uma_small_free(void *mem, vm_size_t size, uint8_t flags);
 
 /* Set a global soft limit on UMA managed memory. */
 void uma_set_limit(unsigned long limit);
 #endif /* _KERNEL */
 
 #endif /* VM_UMA_INT_H */
Index: head/sys/vm/vm_phys.c
===================================================================
--- head/sys/vm/vm_phys.c	(revision 350658)
+++ head/sys/vm/vm_phys.c	(revision 350659)
@@ -1,1501 +1,1521 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2002-2006 Rice University
  * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Alan L. Cox,
  * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
  *	Physical memory system implementation
  *
  * Any external functions defined by this module are only to be used by the
  * virtual memory system.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/domainset.h>
 #include <sys/lock.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/tree.h>
 #include <sys/vmmeter.h>
 
 #include <ddb/ddb.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_pagequeue.h>
 
 _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
     "Too many physsegs.");
 
 #ifdef NUMA
 struct mem_affinity __read_mostly *mem_affinity;
 int __read_mostly *mem_locality;
 #endif
 
 int __read_mostly vm_ndomains = 1;
 domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1);
 
 struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX];
 int __read_mostly vm_phys_nsegs;
 
 struct vm_phys_fictitious_seg;
 static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
     struct vm_phys_fictitious_seg *);
 
 RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
     RB_INITIALIZER(_vm_phys_fictitious_tree);
 
 struct vm_phys_fictitious_seg {
 	RB_ENTRY(vm_phys_fictitious_seg) node;
 	/* Memory region data */
 	vm_paddr_t	start;
 	vm_paddr_t	end;
 	vm_page_t	first_page;
 };
 
 RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
     vm_phys_fictitious_cmp);
 
 static struct rwlock_padalign vm_phys_fictitious_reg_lock;
 MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");
 
 static struct vm_freelist __aligned(CACHE_LINE_SIZE)
     vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL]
     [VM_NFREEORDER_MAX];
 
 static int __read_mostly vm_nfreelists;
 
 /*
  * Provides the mapping from VM_FREELIST_* to free list indices (flind).
  */
 static int __read_mostly vm_freelist_to_flind[VM_NFREELIST];
 
 CTASSERT(VM_FREELIST_DEFAULT == 0);
 
 #ifdef VM_FREELIST_DMA32
 #define	VM_DMA32_BOUNDARY	((vm_paddr_t)1 << 32)
 #endif
 
 /*
  * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
  * the ordering of the free list boundaries.
  */
 #if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
 CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
 #endif
 
 static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
 SYSCTL_OID(_vm, OID_AUTO, phys_free, CTLTYPE_STRING | CTLFLAG_RD,
     NULL, 0, sysctl_vm_phys_free, "A", "Phys Free Info");
 
 static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
 SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
     NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");
 
 #ifdef NUMA
 static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
 SYSCTL_OID(_vm, OID_AUTO, phys_locality, CTLTYPE_STRING | CTLFLAG_RD,
     NULL, 0, sysctl_vm_phys_locality, "A", "Phys Locality Info");
 #endif
 
 SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
     &vm_ndomains, 0, "Number of physical memory domains available.");
 
 static vm_page_t vm_phys_alloc_seg_contig(struct vm_phys_seg *seg,
     u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
     vm_paddr_t boundary);
 static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
 static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
 static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
     int order, int tail);
 
 /*
  * Red-black tree helpers for vm fictitious range management.
  */
 static inline int
 vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
     struct vm_phys_fictitious_seg *range)
 {
 
 	KASSERT(range->start != 0 && range->end != 0,
 	    ("Invalid range passed on search for vm_fictitious page"));
 	if (p->start >= range->end)
 		return (1);
 	if (p->start < range->start)
 		return (-1);
 
 	return (0);
 }
 
 static int
 vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
     struct vm_phys_fictitious_seg *p2)
 {
 
 	/* Check if this is a search for a page */
 	if (p1->end == 0)
 		return (vm_phys_fictitious_in_range(p1, p2));
 
 	KASSERT(p2->end != 0,
     ("Invalid range passed as second parameter to vm fictitious comparison"));
 
 	/* Searching to add a new range */
 	if (p1->end <= p2->start)
 		return (-1);
 	if (p1->start >= p2->end)
 		return (1);
 
 	panic("Trying to add overlapping vm fictitious ranges:\n"
 	    "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
 	    (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
 }
 
 int
 vm_phys_domain_match(int prefer, vm_paddr_t low, vm_paddr_t high)
 {
 #ifdef NUMA
 	domainset_t mask;
 	int i;
 
 	if (vm_ndomains == 1 || mem_affinity == NULL)
 		return (0);
 
 	DOMAINSET_ZERO(&mask);
 	/*
 	 * Check for any memory that overlaps low, high.
 	 */
 	for (i = 0; mem_affinity[i].end != 0; i++)
 		if (mem_affinity[i].start <= high &&
 		    mem_affinity[i].end >= low)
 			DOMAINSET_SET(mem_affinity[i].domain, &mask);
 	if (prefer != -1 && DOMAINSET_ISSET(prefer, &mask))
 		return (prefer);
 	if (DOMAINSET_EMPTY(&mask))
 		panic("vm_phys_domain_match:  Impossible constraint");
 	return (DOMAINSET_FFS(&mask) - 1);
 #else
 	return (0);
 #endif
 }
 
 /*
  * Outputs the state of the physical memory allocator, specifically,
  * the amount of physical memory in each free list.
  */
 static int
 sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sbuf;
 	struct vm_freelist *fl;
 	int dom, error, flind, oind, pind;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
 	for (dom = 0; dom < vm_ndomains; dom++) {
 		sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom);
 		for (flind = 0; flind < vm_nfreelists; flind++) {
 			sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
 			    "\n  ORDER (SIZE)  |  NUMBER"
 			    "\n              ", flind);
 			for (pind = 0; pind < VM_NFREEPOOL; pind++)
 				sbuf_printf(&sbuf, "  |  POOL %d", pind);
 			sbuf_printf(&sbuf, "\n--            ");
 			for (pind = 0; pind < VM_NFREEPOOL; pind++)
 				sbuf_printf(&sbuf, "-- --      ");
 			sbuf_printf(&sbuf, "--\n");
 			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
 				sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
 				    1 << (PAGE_SHIFT - 10 + oind));
 				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
 				fl = vm_phys_free_queues[dom][flind][pind];
 					sbuf_printf(&sbuf, "  |  %6d",
 					    fl[oind].lcnt);
 				}
 				sbuf_printf(&sbuf, "\n");
 			}
 		}
 	}
 	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
 	return (error);
 }
 
 /*
  * Outputs the set of physical memory segments.
  */
 static int
 sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sbuf;
 	struct vm_phys_seg *seg;
 	int error, segind;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
 		sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
 		seg = &vm_phys_segs[segind];
 		sbuf_printf(&sbuf, "start:     %#jx\n",
 		    (uintmax_t)seg->start);
 		sbuf_printf(&sbuf, "end:       %#jx\n",
 		    (uintmax_t)seg->end);
 		sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
 		sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
 	}
 	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
 	return (error);
 }
 
 /*
  * Return affinity, or -1 if there's no affinity information.
  */
 int
 vm_phys_mem_affinity(int f, int t)
 {
 
 #ifdef NUMA
 	if (mem_locality == NULL)
 		return (-1);
 	if (f >= vm_ndomains || t >= vm_ndomains)
 		return (-1);
 	return (mem_locality[f * vm_ndomains + t]);
 #else
 	return (-1);
 #endif
 }
 
 #ifdef NUMA
 /*
  * Outputs the VM locality table.
  */
 static int
 sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sbuf;
 	int error, i, j;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 
 	sbuf_printf(&sbuf, "\n");
 
 	for (i = 0; i < vm_ndomains; i++) {
 		sbuf_printf(&sbuf, "%d: ", i);
 		for (j = 0; j < vm_ndomains; j++) {
 			sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j));
 		}
 		sbuf_printf(&sbuf, "\n");
 	}
 	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
 	return (error);
 }
 #endif
 
 static void
 vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
 {
 
 	m->order = order;
 	if (tail)
 		TAILQ_INSERT_TAIL(&fl[order].pl, m, listq);
 	else
 		TAILQ_INSERT_HEAD(&fl[order].pl, m, listq);
 	fl[order].lcnt++;
 }
 
 static void
 vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
 {
 
 	TAILQ_REMOVE(&fl[order].pl, m, listq);
 	fl[order].lcnt--;
 	m->order = VM_NFREEORDER;
 }
 
 /*
  * Create a physical memory segment.
  */
 static void
 _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
 {
 	struct vm_phys_seg *seg;
 
 	KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
 	    ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
 	KASSERT(domain >= 0 && domain < vm_ndomains,
 	    ("vm_phys_create_seg: invalid domain provided"));
 	seg = &vm_phys_segs[vm_phys_nsegs++];
 	while (seg > vm_phys_segs && (seg - 1)->start >= end) {
 		*seg = *(seg - 1);
 		seg--;
 	}
 	seg->start = start;
 	seg->end = end;
 	seg->domain = domain;
 }
 
 static void
 vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
 {
 #ifdef NUMA
 	int i;
 
 	if (mem_affinity == NULL) {
 		_vm_phys_create_seg(start, end, 0);
 		return;
 	}
 
 	for (i = 0;; i++) {
 		if (mem_affinity[i].end == 0)
 			panic("Reached end of affinity info");
 		if (mem_affinity[i].end <= start)
 			continue;
 		if (mem_affinity[i].start > start)
 			panic("No affinity info for start %jx",
 			    (uintmax_t)start);
 		if (mem_affinity[i].end >= end) {
 			_vm_phys_create_seg(start, end,
 			    mem_affinity[i].domain);
 			break;
 		}
 		_vm_phys_create_seg(start, mem_affinity[i].end,
 		    mem_affinity[i].domain);
 		start = mem_affinity[i].end;
 	}
 #else
 	_vm_phys_create_seg(start, end, 0);
 #endif
 }
 
 /*
  * Add a physical memory segment.
  */
 void
 vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
 {
 	vm_paddr_t paddr;
 
 	KASSERT((start & PAGE_MASK) == 0,
 	    ("vm_phys_define_seg: start is not page aligned"));
 	KASSERT((end & PAGE_MASK) == 0,
 	    ("vm_phys_define_seg: end is not page aligned"));
 
 	/*
 	 * Split the physical memory segment if it spans two or more free
 	 * list boundaries.
 	 */
 	paddr = start;
 #ifdef	VM_FREELIST_LOWMEM
 	if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
 		vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
 		paddr = VM_LOWMEM_BOUNDARY;
 	}
 #endif
 #ifdef	VM_FREELIST_DMA32
 	if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
 		vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
 		paddr = VM_DMA32_BOUNDARY;
 	}
 #endif
 	vm_phys_create_seg(paddr, end);
 }
 
 /*
  * Initialize the physical memory allocator.
  *
  * Requires that vm_page_array is initialized!
  */
 void
 vm_phys_init(void)
 {
 	struct vm_freelist *fl;
 	struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg;
 	u_long npages;
 	int dom, flind, freelist, oind, pind, segind;
 
 	/*
 	 * Compute the number of free lists, and generate the mapping from the
 	 * manifest constants VM_FREELIST_* to the free list indices.
 	 *
 	 * Initially, the entries of vm_freelist_to_flind[] are set to either
 	 * 0 or 1 to indicate which free lists should be created.
 	 */
 	npages = 0;
 	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
 		seg = &vm_phys_segs[segind];
 #ifdef	VM_FREELIST_LOWMEM
 		if (seg->end <= VM_LOWMEM_BOUNDARY)
 			vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
 		else
 #endif
 #ifdef	VM_FREELIST_DMA32
 		if (
 #ifdef	VM_DMA32_NPAGES_THRESHOLD
 		    /*
 		     * Create the DMA32 free list only if the amount of
 		     * physical memory above physical address 4G exceeds the
 		     * given threshold.
 		     */
 		    npages > VM_DMA32_NPAGES_THRESHOLD &&
 #endif
 		    seg->end <= VM_DMA32_BOUNDARY)
 			vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
 		else
 #endif
 		{
 			npages += atop(seg->end - seg->start);
 			vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
 		}
 	}
 	/* Change each entry into a running total of the free lists. */
 	for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
 		vm_freelist_to_flind[freelist] +=
 		    vm_freelist_to_flind[freelist - 1];
 	}
 	vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
 	KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
 	/* Change each entry into a free list index. */
 	for (freelist = 0; freelist < VM_NFREELIST; freelist++)
 		vm_freelist_to_flind[freelist]--;
 
 	/*
 	 * Initialize the first_page and free_queues fields of each physical
 	 * memory segment.
 	 */
 #ifdef VM_PHYSSEG_SPARSE
 	npages = 0;
 #endif
 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
 		seg = &vm_phys_segs[segind];
 #ifdef VM_PHYSSEG_SPARSE
 		seg->first_page = &vm_page_array[npages];
 		npages += atop(seg->end - seg->start);
 #else
 		seg->first_page = PHYS_TO_VM_PAGE(seg->start);
 #endif
 #ifdef	VM_FREELIST_LOWMEM
 		if (seg->end <= VM_LOWMEM_BOUNDARY) {
 			flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
 			KASSERT(flind >= 0,
 			    ("vm_phys_init: LOWMEM flind < 0"));
 		} else
 #endif
 #ifdef	VM_FREELIST_DMA32
 		if (seg->end <= VM_DMA32_BOUNDARY) {
 			flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
 			KASSERT(flind >= 0,
 			    ("vm_phys_init: DMA32 flind < 0"));
 		} else
 #endif
 		{
 			flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
 			KASSERT(flind >= 0,
 			    ("vm_phys_init: DEFAULT flind < 0"));
 		}
 		seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
 	}
 
 	/*
 	 * Coalesce physical memory segments that are contiguous and share the
 	 * same per-domain free queues.
 	 */
 	prev_seg = vm_phys_segs;
 	seg = &vm_phys_segs[1];
 	end_seg = &vm_phys_segs[vm_phys_nsegs];
 	while (seg < end_seg) {
 		if (prev_seg->end == seg->start &&
 		    prev_seg->free_queues == seg->free_queues) {
 			prev_seg->end = seg->end;
 			KASSERT(prev_seg->domain == seg->domain,
 			    ("vm_phys_init: free queues cannot span domains"));
 			vm_phys_nsegs--;
 			end_seg--;
 			for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++)
 				*tmp_seg = *(tmp_seg + 1);
 		} else {
 			prev_seg = seg;
 			seg++;
 		}
 	}
 
 	/*
 	 * Initialize the free queues.
 	 */
 	for (dom = 0; dom < vm_ndomains; dom++) {
 		for (flind = 0; flind < vm_nfreelists; flind++) {
 			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
 				fl = vm_phys_free_queues[dom][flind][pind];
 				for (oind = 0; oind < VM_NFREEORDER; oind++)
 					TAILQ_INIT(&fl[oind].pl);
 			}
 		}
 	}
 
 	rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
 }
 
 /*
  * Register info about the NUMA topology of the system.
  *
  * Invoked by platform-dependent code prior to vm_phys_init().
  */
 void
 vm_phys_register_domains(int ndomains, struct mem_affinity *affinity,
     int *locality)
 {
 #ifdef NUMA
 	int d, i;
 
 	/*
 	 * For now the only override value that we support is 1, which
 	 * effectively disables NUMA-awareness in the allocators.
 	 */
 	d = 0;
 	TUNABLE_INT_FETCH("vm.numa.disabled", &d);
 	if (d)
 		ndomains = 1;
 
 	if (ndomains > 1) {
 		vm_ndomains = ndomains;
 		mem_affinity = affinity;
 		mem_locality = locality;
 	}
 
 	for (i = 0; i < vm_ndomains; i++)
 		DOMAINSET_SET(i, &all_domains);
 #else
 	(void)ndomains;
 	(void)affinity;
 	(void)locality;
 #endif
 }
 
+int
+_vm_phys_domain(vm_paddr_t pa)
+{
+#ifdef NUMA
+	int i;
+
+	if (vm_ndomains == 1 || mem_affinity == NULL)
+		return (0);
+
+	/*
+	 * Check for any memory that overlaps.
+	 */
+	for (i = 0; mem_affinity[i].end != 0; i++)
+		if (mem_affinity[i].start <= pa &&
+		    mem_affinity[i].end >= pa)
+			return (mem_affinity[i].domain);
+#endif
+	return (0);
+}
+
 /*
  * Split a contiguous, power of two-sized set of physical pages.
  *
  * When this function is called by a page allocation function, the caller
  * should request insertion at the head unless the order [order, oind) queues
  * are known to be empty.  The objective being to reduce the likelihood of
  * long-term fragmentation by promoting contemporaneous allocation and
  * (hopefully) deallocation.
  */
 static __inline void
 vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order,
     int tail)
 {
 	vm_page_t m_buddy;
 
 	while (oind > order) {
 		oind--;
 		m_buddy = &m[1 << oind];
 		KASSERT(m_buddy->order == VM_NFREEORDER,
 		    ("vm_phys_split_pages: page %p has unexpected order %d",
 		    m_buddy, m_buddy->order));
 		vm_freelist_add(fl, m_buddy, oind, tail);
         }
 }
 
 /*
  * Add the physical pages [m, m + npages) at the end of a power-of-two aligned
  * and sized set to the specified free list.
  *
  * When this function is called by a page allocation function, the caller
  * should request insertion at the head unless the lower-order queues are
  * known to be empty.  The objective being to reduce the likelihood of long-
  * term fragmentation by promoting contemporaneous allocation and (hopefully)
  * deallocation.
  *
  * The physical page m's buddy must not be free.
  */
 static void
 vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail)
 {
 	u_int n;
 	int order;
 
 	KASSERT(npages > 0, ("vm_phys_enq_range: npages is 0"));
 	KASSERT(((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) &
 	    ((PAGE_SIZE << (fls(npages) - 1)) - 1)) == 0,
 	    ("vm_phys_enq_range: page %p and npages %u are misaligned",
 	    m, npages));
 	do {
 		KASSERT(m->order == VM_NFREEORDER,
 		    ("vm_phys_enq_range: page %p has unexpected order %d",
 		    m, m->order));
 		order = ffs(npages) - 1;
 		KASSERT(order < VM_NFREEORDER,
 		    ("vm_phys_enq_range: order %d is out of range", order));
 		vm_freelist_add(fl, m, order, tail);
 		n = 1 << order;
 		m += n;
 		npages -= n;
 	} while (npages > 0);
 }
 
 /*
  * Tries to allocate the specified number of pages from the specified pool
  * within the specified domain.  Returns the actual number of allocated pages
  * and a pointer to each page through the array ma[].
  *
  * The returned pages may not be physically contiguous.  However, in contrast
  * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0),
  * calling this function once to allocate the desired number of pages will
  * avoid wasted time in vm_phys_split_pages().
  *
  * The free page queues for the specified domain must be locked.
  */
 int
 vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[])
 {
 	struct vm_freelist *alt, *fl;
 	vm_page_t m;
 	int avail, end, flind, freelist, i, need, oind, pind;
 
 	KASSERT(domain >= 0 && domain < vm_ndomains,
 	    ("vm_phys_alloc_npages: domain %d is out of range", domain));
 	KASSERT(pool < VM_NFREEPOOL,
 	    ("vm_phys_alloc_npages: pool %d is out of range", pool));
 	KASSERT(npages <= 1 << (VM_NFREEORDER - 1),
 	    ("vm_phys_alloc_npages: npages %d is out of range", npages));
 	vm_domain_free_assert_locked(VM_DOMAIN(domain));
 	i = 0;
 	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
 		flind = vm_freelist_to_flind[freelist];
 		if (flind < 0)
 			continue;
 		fl = vm_phys_free_queues[domain][flind][pool];
 		for (oind = 0; oind < VM_NFREEORDER; oind++) {
 			while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
 				vm_freelist_rem(fl, m, oind);
 				avail = 1 << oind;
 				need = imin(npages - i, avail);
 				for (end = i + need; i < end;)
 					ma[i++] = m++;
 				if (need < avail) {
 					/*
 					 * Return excess pages to fl.  Its
 					 * order [0, oind) queues are empty.
 					 */
 					vm_phys_enq_range(m, avail - need, fl,
 					    1);
 					return (npages);
 				} else if (i == npages)
 					return (npages);
 			}
 		}
 		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
 			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
 				alt = vm_phys_free_queues[domain][flind][pind];
 				while ((m = TAILQ_FIRST(&alt[oind].pl)) !=
 				    NULL) {
 					vm_freelist_rem(alt, m, oind);
 					vm_phys_set_pool(pool, m, oind);
 					avail = 1 << oind;
 					need = imin(npages - i, avail);
 					for (end = i + need; i < end;)
 						ma[i++] = m++;
 					if (need < avail) {
 						/*
 						 * Return excess pages to fl.
 						 * Its order [0, oind) queues
 						 * are empty.
 						 */
 						vm_phys_enq_range(m, avail -
 						    need, fl, 1);
 						return (npages);
 					} else if (i == npages)
 						return (npages);
 				}
 			}
 		}
 	}
 	return (i);
 }
 
 /*
  * Allocate a contiguous, power of two-sized set of physical pages
  * from the free lists.
  *
  * The free page queues must be locked.
  */
 vm_page_t
 vm_phys_alloc_pages(int domain, int pool, int order)
 {
 	vm_page_t m;
 	int freelist;
 
 	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
 		m = vm_phys_alloc_freelist_pages(domain, freelist, pool, order);
 		if (m != NULL)
 			return (m);
 	}
 	return (NULL);
 }
 
 /*
  * Allocate a contiguous, power of two-sized set of physical pages from the
  * specified free list.  The free list must be specified using one of the
  * manifest constants VM_FREELIST_*.
  *
  * The free page queues must be locked.
  */
 vm_page_t
 vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order)
 {
 	struct vm_freelist *alt, *fl;
 	vm_page_t m;
 	int oind, pind, flind;
 
 	KASSERT(domain >= 0 && domain < vm_ndomains,
 	    ("vm_phys_alloc_freelist_pages: domain %d is out of range",
 	    domain));
 	KASSERT(freelist < VM_NFREELIST,
 	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
 	    freelist));
 	KASSERT(pool < VM_NFREEPOOL,
 	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
 	KASSERT(order < VM_NFREEORDER,
 	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));
 
 	flind = vm_freelist_to_flind[freelist];
 	/* Check if freelist is present */
 	if (flind < 0)
 		return (NULL);
 
 	vm_domain_free_assert_locked(VM_DOMAIN(domain));
 	fl = &vm_phys_free_queues[domain][flind][pool][0];
 	for (oind = order; oind < VM_NFREEORDER; oind++) {
 		m = TAILQ_FIRST(&fl[oind].pl);
 		if (m != NULL) {
 			vm_freelist_rem(fl, m, oind);
 			/* The order [order, oind) queues are empty. */
 			vm_phys_split_pages(m, oind, fl, order, 1);
 			return (m);
 		}
 	}
 
 	/*
 	 * The given pool was empty.  Find the largest
 	 * contiguous, power-of-two-sized set of pages in any
 	 * pool.  Transfer these pages to the given pool, and
 	 * use them to satisfy the allocation.
 	 */
 	for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
 		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
 			alt = &vm_phys_free_queues[domain][flind][pind][0];
 			m = TAILQ_FIRST(&alt[oind].pl);
 			if (m != NULL) {
 				vm_freelist_rem(alt, m, oind);
 				vm_phys_set_pool(pool, m, oind);
 				/* The order [order, oind) queues are empty. */
 				vm_phys_split_pages(m, oind, fl, order, 1);
 				return (m);
 			}
 		}
 	}
 	return (NULL);
 }
 
 /*
  * Find the vm_page corresponding to the given physical address.
  */
 vm_page_t
 vm_phys_paddr_to_vm_page(vm_paddr_t pa)
 {
 	struct vm_phys_seg *seg;
 	int segind;
 
 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
 		seg = &vm_phys_segs[segind];
 		if (pa >= seg->start && pa < seg->end)
 			return (&seg->first_page[atop(pa - seg->start)]);
 	}
 	return (NULL);
 }
 
 vm_page_t
 vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
 {
 	struct vm_phys_fictitious_seg tmp, *seg;
 	vm_page_t m;
 
 	m = NULL;
 	tmp.start = pa;
 	tmp.end = 0;
 
 	rw_rlock(&vm_phys_fictitious_reg_lock);
 	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
 	rw_runlock(&vm_phys_fictitious_reg_lock);
 	if (seg == NULL)
 		return (NULL);
 
 	m = &seg->first_page[atop(pa - seg->start)];
 	KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));
 
 	return (m);
 }
 
 static inline void
 vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
     long page_count, vm_memattr_t memattr)
 {
 	long i;
 
 	bzero(range, page_count * sizeof(*range));
 	for (i = 0; i < page_count; i++) {
 		vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
 		range[i].oflags &= ~VPO_UNMANAGED;
 		range[i].busy_lock = VPB_UNBUSIED;
 	}
 }
 
 int
 vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
     vm_memattr_t memattr)
 {
 	struct vm_phys_fictitious_seg *seg;
 	vm_page_t fp;
 	long page_count;
 #ifdef VM_PHYSSEG_DENSE
 	long pi, pe;
 	long dpage_count;
 #endif
 
 	KASSERT(start < end,
 	    ("Start of segment isn't less than end (start: %jx end: %jx)",
 	    (uintmax_t)start, (uintmax_t)end));
 
 	page_count = (end - start) / PAGE_SIZE;
 
 #ifdef VM_PHYSSEG_DENSE
 	pi = atop(start);
 	pe = atop(end);
 	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
 		fp = &vm_page_array[pi - first_page];
 		if ((pe - first_page) > vm_page_array_size) {
 			/*
 			 * We have a segment that starts inside
 			 * of vm_page_array, but ends outside of it.
 			 *
 			 * Use vm_page_array pages for those that are
 			 * inside of the vm_page_array range, and
 			 * allocate the remaining ones.
 			 */
 			dpage_count = vm_page_array_size - (pi - first_page);
 			vm_phys_fictitious_init_range(fp, start, dpage_count,
 			    memattr);
 			page_count -= dpage_count;
 			start += ptoa(dpage_count);
 			goto alloc;
 		}
 		/*
 		 * We can allocate the full range from vm_page_array,
 		 * so there's no need to register the range in the tree.
 		 */
 		vm_phys_fictitious_init_range(fp, start, page_count, memattr);
 		return (0);
 	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
 		/*
 		 * We have a segment that ends inside of vm_page_array,
 		 * but starts outside of it.
 		 */
 		fp = &vm_page_array[0];
 		dpage_count = pe - first_page;
 		vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
 		    memattr);
 		end -= ptoa(dpage_count);
 		page_count -= dpage_count;
 		goto alloc;
 	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
 		/*
 		 * Trying to register a fictitious range that expands before
 		 * and after vm_page_array.
 		 */
 		return (EINVAL);
 	} else {
 alloc:
 #endif
 		fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
 		    M_WAITOK);
 #ifdef VM_PHYSSEG_DENSE
 	}
 #endif
 	vm_phys_fictitious_init_range(fp, start, page_count, memattr);
 
 	seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
 	seg->start = start;
 	seg->end = end;
 	seg->first_page = fp;
 
 	rw_wlock(&vm_phys_fictitious_reg_lock);
 	RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
 	rw_wunlock(&vm_phys_fictitious_reg_lock);
 
 	return (0);
 }
 
 void
 vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
 {
 	struct vm_phys_fictitious_seg *seg, tmp;
 #ifdef VM_PHYSSEG_DENSE
 	long pi, pe;
 #endif
 
 	KASSERT(start < end,
 	    ("Start of segment isn't less than end (start: %jx end: %jx)",
 	    (uintmax_t)start, (uintmax_t)end));
 
 #ifdef VM_PHYSSEG_DENSE
 	pi = atop(start);
 	pe = atop(end);
 	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
 		if ((pe - first_page) <= vm_page_array_size) {
 			/*
 			 * This segment was allocated using vm_page_array
 			 * only, there's nothing to do since those pages
 			 * were never added to the tree.
 			 */
 			return;
 		}
 		/*
 		 * We have a segment that starts inside
 		 * of vm_page_array, but ends outside of it.
 		 *
 		 * Calculate how many pages were added to the
 		 * tree and free them.
 		 */
 		start = ptoa(first_page + vm_page_array_size);
 	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
 		/*
 		 * We have a segment that ends inside of vm_page_array,
 		 * but starts outside of it.
 		 */
 		end = ptoa(first_page);
 	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
 		/* Since it's not possible to register such a range, panic. */
 		panic(
 		    "Unregistering not registered fictitious range [%#jx:%#jx]",
 		    (uintmax_t)start, (uintmax_t)end);
 	}
 #endif
 	tmp.start = start;
 	tmp.end = 0;
 
 	rw_wlock(&vm_phys_fictitious_reg_lock);
 	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
 	if (seg->start != start || seg->end != end) {
 		rw_wunlock(&vm_phys_fictitious_reg_lock);
 		panic(
 		    "Unregistering not registered fictitious range [%#jx:%#jx]",
 		    (uintmax_t)start, (uintmax_t)end);
 	}
 	RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
 	rw_wunlock(&vm_phys_fictitious_reg_lock);
 	free(seg->first_page, M_FICT_PAGES);
 	free(seg, M_FICT_PAGES);
 }
 
 /*
  * Free a contiguous, power of two-sized set of physical pages.
  *
  * The free page queues must be locked.
  */
 void
 vm_phys_free_pages(vm_page_t m, int order)
 {
 	struct vm_freelist *fl;
 	struct vm_phys_seg *seg;
 	vm_paddr_t pa;
 	vm_page_t m_buddy;
 
 	KASSERT(m->order == VM_NFREEORDER,
 	    ("vm_phys_free_pages: page %p has unexpected order %d",
 	    m, m->order));
 	KASSERT(m->pool < VM_NFREEPOOL,
 	    ("vm_phys_free_pages: page %p has unexpected pool %d",
 	    m, m->pool));
 	KASSERT(order < VM_NFREEORDER,
 	    ("vm_phys_free_pages: order %d is out of range", order));
 	seg = &vm_phys_segs[m->segind];
 	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
 	if (order < VM_NFREEORDER - 1) {
 		pa = VM_PAGE_TO_PHYS(m);
 		do {
 			pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
 			if (pa < seg->start || pa >= seg->end)
 				break;
 			m_buddy = &seg->first_page[atop(pa - seg->start)];
 			if (m_buddy->order != order)
 				break;
 			fl = (*seg->free_queues)[m_buddy->pool];
 			vm_freelist_rem(fl, m_buddy, order);
 			if (m_buddy->pool != m->pool)
 				vm_phys_set_pool(m->pool, m_buddy, order);
 			order++;
 			pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
 			m = &seg->first_page[atop(pa - seg->start)];
 		} while (order < VM_NFREEORDER - 1);
 	}
 	fl = (*seg->free_queues)[m->pool];
 	vm_freelist_add(fl, m, order, 1);
 }
 
 /*
  * Return the largest possible order of a set of pages starting at m.
  */
 static int
 max_order(vm_page_t m)
 {
 
 	/*
 	 * Unsigned "min" is used here so that "order" is assigned
 	 * "VM_NFREEORDER - 1" when "m"'s physical address is zero
 	 * or the low-order bits of its physical address are zero
 	 * because the size of a physical address exceeds the size of
 	 * a long.
 	 */
 	return (min(ffsl(VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) - 1,
 	    VM_NFREEORDER - 1));
 }
 
 /*
  * Free a contiguous, arbitrarily sized set of physical pages, without
  * merging across set boundaries.
  *
  * The free page queues must be locked.
  */
 void
 vm_phys_enqueue_contig(vm_page_t m, u_long npages)
 {
 	struct vm_freelist *fl;
 	struct vm_phys_seg *seg;
 	vm_page_t m_end;
 	int order;
 
 	/*
 	 * Avoid unnecessary coalescing by freeing the pages in the largest
 	 * possible power-of-two-sized subsets.
 	 */
 	vm_domain_free_assert_locked(vm_pagequeue_domain(m));
 	seg = &vm_phys_segs[m->segind];
 	fl = (*seg->free_queues)[m->pool];
 	m_end = m + npages;
 	/* Free blocks of increasing size. */
 	while ((order = max_order(m)) < VM_NFREEORDER - 1 &&
 	    m + (1 << order) <= m_end) {
 		KASSERT(seg == &vm_phys_segs[m->segind],
 		    ("%s: page range [%p,%p) spans multiple segments",
 		    __func__, m_end - npages, m));
 		vm_freelist_add(fl, m, order, 1);
 		m += 1 << order;
 	}
 	/* Free blocks of maximum size. */
 	while (m + (1 << order) <= m_end) {
 		KASSERT(seg == &vm_phys_segs[m->segind],
 		    ("%s: page range [%p,%p) spans multiple segments",
 		    __func__, m_end - npages, m));
 		vm_freelist_add(fl, m, order, 1);
 		m += 1 << order;
 	}
 	/* Free blocks of diminishing size. */
 	while (m < m_end) {
 		KASSERT(seg == &vm_phys_segs[m->segind],
 		    ("%s: page range [%p,%p) spans multiple segments",
 		    __func__, m_end - npages, m));
 		order = flsl(m_end - m) - 1;
 		vm_freelist_add(fl, m, order, 1);
 		m += 1 << order;
 	}
 }
 
 /*
  * Free a contiguous, arbitrarily sized set of physical pages.
  *
  * The free page queues must be locked.
  */
 void
 vm_phys_free_contig(vm_page_t m, u_long npages)
 {
 	int order_start, order_end;
 	vm_page_t m_start, m_end;
 
 	vm_domain_free_assert_locked(vm_pagequeue_domain(m));
 
 	m_start = m;
 	order_start = max_order(m_start);
 	if (order_start < VM_NFREEORDER - 1)
 		m_start += 1 << order_start;
 	m_end = m + npages;
 	order_end = max_order(m_end);
 	if (order_end < VM_NFREEORDER - 1)
 		m_end -= 1 << order_end;
 	/*
 	 * Avoid unnecessary coalescing by freeing the pages at the start and
 	 * end of the range last.
 	 */
 	if (m_start < m_end)
 		vm_phys_enqueue_contig(m_start, m_end - m_start);
 	if (order_start < VM_NFREEORDER - 1)
 		vm_phys_free_pages(m, order_start);
 	if (order_end < VM_NFREEORDER - 1)
 		vm_phys_free_pages(m_end, order_end);
 }
 
 /*
  * Scan physical memory between the specified addresses "low" and "high" for a
  * run of contiguous physical pages that satisfy the specified conditions, and
  * return the lowest page in the run.  The specified "alignment" determines
  * the alignment of the lowest physical page in the run.  If the specified
  * "boundary" is non-zero, then the run of physical pages cannot span a
  * physical address that is a multiple of "boundary".
  *
  * "npages" must be greater than zero.  Both "alignment" and "boundary" must
  * be a power of two.
  */
 vm_page_t
 vm_phys_scan_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
     u_long alignment, vm_paddr_t boundary, int options)
 {
 	vm_paddr_t pa_end;
 	vm_page_t m_end, m_run, m_start;
 	struct vm_phys_seg *seg;
 	int segind;
 
 	KASSERT(npages > 0, ("npages is 0"));
 	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
 	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
 	if (low >= high)
 		return (NULL);
 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
 		seg = &vm_phys_segs[segind];
 		if (seg->domain != domain)
 			continue;
 		if (seg->start >= high)
 			break;
 		if (low >= seg->end)
 			continue;
 		if (low <= seg->start)
 			m_start = seg->first_page;
 		else
 			m_start = &seg->first_page[atop(low - seg->start)];
 		if (high < seg->end)
 			pa_end = high;
 		else
 			pa_end = seg->end;
 		if (pa_end - VM_PAGE_TO_PHYS(m_start) < ptoa(npages))
 			continue;
 		m_end = &seg->first_page[atop(pa_end - seg->start)];
 		m_run = vm_page_scan_contig(npages, m_start, m_end,
 		    alignment, boundary, options);
 		if (m_run != NULL)
 			return (m_run);
 	}
 	return (NULL);
 }
 
 /*
  * Set the pool for a contiguous, power of two-sized set of physical pages. 
  */
 void
 vm_phys_set_pool(int pool, vm_page_t m, int order)
 {
 	vm_page_t m_tmp;
 
 	for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
 		m_tmp->pool = pool;
 }
 
 /*
  * Search for the given physical page "m" in the free lists.  If the search
  * succeeds, remove "m" from the free lists and return TRUE.  Otherwise, return
  * FALSE, indicating that "m" is not in the free lists.
  *
  * The free page queues must be locked.
  */
 boolean_t
 vm_phys_unfree_page(vm_page_t m)
 {
 	struct vm_freelist *fl;
 	struct vm_phys_seg *seg;
 	vm_paddr_t pa, pa_half;
 	vm_page_t m_set, m_tmp;
 	int order;
 
 	/*
 	 * First, find the contiguous, power of two-sized set of free
 	 * physical pages containing the given physical page "m" and
 	 * assign it to "m_set".
 	 */
 	seg = &vm_phys_segs[m->segind];
 	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
 	for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
 	    order < VM_NFREEORDER - 1; ) {
 		order++;
 		pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
 		if (pa >= seg->start)
 			m_set = &seg->first_page[atop(pa - seg->start)];
 		else
 			return (FALSE);
 	}
 	if (m_set->order < order)
 		return (FALSE);
 	if (m_set->order == VM_NFREEORDER)
 		return (FALSE);
 	KASSERT(m_set->order < VM_NFREEORDER,
 	    ("vm_phys_unfree_page: page %p has unexpected order %d",
 	    m_set, m_set->order));
 
 	/*
 	 * Next, remove "m_set" from the free lists.  Finally, extract
 	 * "m" from "m_set" using an iterative algorithm: While "m_set"
 	 * is larger than a page, shrink "m_set" by returning the half
 	 * of "m_set" that does not contain "m" to the free lists.
 	 */
 	fl = (*seg->free_queues)[m_set->pool];
 	order = m_set->order;
 	vm_freelist_rem(fl, m_set, order);
 	while (order > 0) {
 		order--;
 		pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
 		if (m->phys_addr < pa_half)
 			m_tmp = &seg->first_page[atop(pa_half - seg->start)];
 		else {
 			m_tmp = m_set;
 			m_set = &seg->first_page[atop(pa_half - seg->start)];
 		}
 		vm_freelist_add(fl, m_tmp, order, 0);
 	}
 	KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
 	return (TRUE);
 }
 
 /*
  * Allocate a contiguous set of physical pages of the given size
  * "npages" from the free lists.  All of the physical pages must be at
  * or above the given physical address "low" and below the given
  * physical address "high".  The given value "alignment" determines the
  * alignment of the first physical page in the set.  If the given value
  * "boundary" is non-zero, then the set of physical pages cannot cross
  * any physical address boundary that is a multiple of that value.  Both
  * "alignment" and "boundary" must be a power of two.
  */
 vm_page_t
 vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
     u_long alignment, vm_paddr_t boundary)
 {
 	vm_paddr_t pa_end, pa_start;
 	vm_page_t m_run;
 	struct vm_phys_seg *seg;
 	int segind;
 
 	KASSERT(npages > 0, ("npages is 0"));
 	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
 	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
 	vm_domain_free_assert_locked(VM_DOMAIN(domain));
 	if (low >= high)
 		return (NULL);
 	m_run = NULL;
 	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
 		seg = &vm_phys_segs[segind];
 		if (seg->start >= high || seg->domain != domain)
 			continue;
 		if (low >= seg->end)
 			break;
 		if (low <= seg->start)
 			pa_start = seg->start;
 		else
 			pa_start = low;
 		if (high < seg->end)
 			pa_end = high;
 		else
 			pa_end = seg->end;
 		if (pa_end - pa_start < ptoa(npages))
 			continue;
 		m_run = vm_phys_alloc_seg_contig(seg, npages, low, high,
 		    alignment, boundary);
 		if (m_run != NULL)
 			break;
 	}
 	return (m_run);
 }
 
 /*
  * Allocate a run of contiguous physical pages from the free list for the
  * specified segment.
  */
 static vm_page_t
 vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, u_long npages,
     vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
 {
 	struct vm_freelist *fl;
 	vm_paddr_t pa, pa_end, size;
 	vm_page_t m, m_ret;
 	u_long npages_end;
 	int oind, order, pind;
 
 	KASSERT(npages > 0, ("npages is 0"));
 	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
 	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
 	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
 	/* Compute the queue that is the best fit for npages. */
 	order = flsl(npages - 1);
 	/* Search for a run satisfying the specified conditions. */
 	size = npages << PAGE_SHIFT;
 	for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER;
 	    oind++) {
 		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
 			fl = (*seg->free_queues)[pind];
 			TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) {
 				/*
 				 * Is the size of this allocation request
 				 * larger than the largest block size?
 				 */
 				if (order >= VM_NFREEORDER) {
 					/*
 					 * Determine if a sufficient number of
 					 * subsequent blocks to satisfy the
 					 * allocation request are free.
 					 */
 					pa = VM_PAGE_TO_PHYS(m_ret);
 					pa_end = pa + size;
 					if (pa_end < pa)
 						continue;
 					for (;;) {
 						pa += 1 << (PAGE_SHIFT +
 						    VM_NFREEORDER - 1);
 						if (pa >= pa_end ||
 						    pa < seg->start ||
 						    pa >= seg->end)
 							break;
 						m = &seg->first_page[atop(pa -
 						    seg->start)];
 						if (m->order != VM_NFREEORDER -
 						    1)
 							break;
 					}
 					/* If not, go to the next block. */
 					if (pa < pa_end)
 						continue;
 				}
 
 				/*
 				 * Determine if the blocks are within the
 				 * given range, satisfy the given alignment,
 				 * and do not cross the given boundary.
 				 */
 				pa = VM_PAGE_TO_PHYS(m_ret);
 				pa_end = pa + size;
 				if (pa >= low && pa_end <= high &&
 				    (pa & (alignment - 1)) == 0 &&
 				    rounddown2(pa ^ (pa_end - 1), boundary) == 0)
 					goto done;
 			}
 		}
 	}
 	return (NULL);
 done:
 	for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) {
 		fl = (*seg->free_queues)[m->pool];
 		vm_freelist_rem(fl, m, oind);
 		if (m->pool != VM_FREEPOOL_DEFAULT)
 			vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, oind);
 	}
 	/* Return excess pages to the free lists. */
 	npages_end = roundup2(npages, 1 << oind);
 	if (npages < npages_end) {
 		fl = (*seg->free_queues)[VM_FREEPOOL_DEFAULT];
 		vm_phys_enq_range(&m_ret[npages], npages_end - npages, fl, 0);
 	}
 	return (m_ret);
 }
 
 #ifdef DDB
 /*
  * Show the number of physical pages in each of the free lists.
  */
 DB_SHOW_COMMAND(freepages, db_show_freepages)
 {
 	struct vm_freelist *fl;
 	int flind, oind, pind, dom;
 
 	for (dom = 0; dom < vm_ndomains; dom++) {
 		db_printf("DOMAIN: %d\n", dom);
 		for (flind = 0; flind < vm_nfreelists; flind++) {
 			db_printf("FREE LIST %d:\n"
 			    "\n  ORDER (SIZE)  |  NUMBER"
 			    "\n              ", flind);
 			for (pind = 0; pind < VM_NFREEPOOL; pind++)
 				db_printf("  |  POOL %d", pind);
 			db_printf("\n--            ");
 			for (pind = 0; pind < VM_NFREEPOOL; pind++)
 				db_printf("-- --      ");
 			db_printf("--\n");
 			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
 				db_printf("  %2.2d (%6.6dK)", oind,
 				    1 << (PAGE_SHIFT - 10 + oind));
 				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
 				fl = vm_phys_free_queues[dom][flind][pind];
 					db_printf("  |  %6.6d", fl[oind].lcnt);
 				}
 				db_printf("\n");
 			}
 			db_printf("\n");
 		}
 		db_printf("\n");
 	}
 }
 #endif
Index: head/sys/vm/vm_phys.h
===================================================================
--- head/sys/vm/vm_phys.h	(revision 350658)
+++ head/sys/vm/vm_phys.h	(revision 350659)
@@ -1,128 +1,129 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2002-2006 Rice University
  * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Alan L. Cox,
  * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 /*
  *	Physical memory system definitions
  */
 
 #ifndef	_VM_PHYS_H_
 #define	_VM_PHYS_H_
 
 #ifdef _KERNEL
 
 #ifndef VM_NFREEORDER_MAX
 #define	VM_NFREEORDER_MAX	VM_NFREEORDER
 #endif
 
 /* Domains must be dense (non-sparse) and zero-based. */
 struct mem_affinity {
 	vm_paddr_t start;
 	vm_paddr_t end;
 	int domain;
 };
 #ifdef NUMA
 extern struct mem_affinity *mem_affinity;
 extern int *mem_locality;
 #endif
 
 struct vm_freelist {
 	struct pglist pl;
 	int lcnt;
 };
 
 struct vm_phys_seg {
 	vm_paddr_t	start;
 	vm_paddr_t	end;
 	vm_page_t	first_page;
 	int		domain;
 	struct vm_freelist (*free_queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX];
 };
 
 extern struct vm_phys_seg vm_phys_segs[];
 extern int vm_phys_nsegs;
 
 /*
  * The following functions are only to be used by the virtual memory system.
  */
 void vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end);
 vm_page_t vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low,
     vm_paddr_t high, u_long alignment, vm_paddr_t boundary);
 vm_page_t vm_phys_alloc_freelist_pages(int domain, int freelist, int pool,
     int order);
 int vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[]);
 vm_page_t vm_phys_alloc_pages(int domain, int pool, int order);
 int vm_phys_domain_match(int prefer, vm_paddr_t low, vm_paddr_t high);
 void vm_phys_enqueue_contig(vm_page_t m, u_long npages);
 int vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
     vm_memattr_t memattr);
 void vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end);
 vm_page_t vm_phys_fictitious_to_vm_page(vm_paddr_t pa);
 void vm_phys_free_contig(vm_page_t m, u_long npages);
 void vm_phys_free_pages(vm_page_t m, int order);
 void vm_phys_init(void);
 vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa);
 void vm_phys_register_domains(int ndomains, struct mem_affinity *affinity,
     int *locality);
 vm_page_t vm_phys_scan_contig(int domain, u_long npages, vm_paddr_t low,
     vm_paddr_t high, u_long alignment, vm_paddr_t boundary, int options);
 void vm_phys_set_pool(int pool, vm_page_t m, int order);
 boolean_t vm_phys_unfree_page(vm_page_t m);
 int vm_phys_mem_affinity(int f, int t);
 
 /*
  *
  *	vm_phys_domain:
  *
  *	Return the index of the domain the page belongs to.
  */
 static inline int
 vm_phys_domain(vm_page_t m)
 {
 #ifdef NUMA
 	int domn, segind;
 
 	/* XXXKIB try to assert that the page is managed */
 	segind = m->segind;
 	KASSERT(segind < vm_phys_nsegs, ("segind %d m %p", segind, m));
 	domn = vm_phys_segs[segind].domain;
 	KASSERT(domn < vm_ndomains, ("domain %d m %p", domn, m));
 	return (domn);
 #else
 	return (0);
 #endif
 }
+int _vm_phys_domain(vm_paddr_t pa);
 
 #endif	/* _KERNEL */
 #endif	/* !_VM_PHYS_H_ */
Index: head/usr.bin/vmstat/vmstat.c
===================================================================
--- head/usr.bin/vmstat/vmstat.c	(revision 350658)
+++ head/usr.bin/vmstat/vmstat.c	(revision 350659)
@@ -1,1691 +1,1692 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1980, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #ifndef lint
 static const char copyright[] =
 "@(#) Copyright (c) 1980, 1986, 1991, 1993\n\
 	The Regents of the University of California.  All rights reserved.\n";
 #endif /* not lint */
 
 #if 0
 #ifndef lint
 static char sccsid[] = "@(#)vmstat.c	8.1 (Berkeley) 6/6/93";
 #endif /* not lint */
 #endif
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/uio.h>
 #include <sys/namei.h>
 #include <sys/malloc.h>
 #include <sys/signal.h>
 #include <sys/fcntl.h>
 #include <sys/ioctl.h>
 #include <sys/resource.h>
 #include <sys/sysctl.h>
 #include <sys/time.h>
 #include <sys/user.h>
 #define	_WANT_VMMETER
 #include <sys/vmmeter.h>
 #include <sys/pcpu.h>
 
 #include <vm/vm_param.h>
 
 #include <ctype.h>
 #include <devstat.h>
 #include <err.h>
 #include <errno.h>
 #include <inttypes.h>
 #include <kvm.h>
 #include <limits.h>
 #include <memstat.h>
 #include <nlist.h>
 #include <paths.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sysexits.h>
 #include <time.h>
 #include <unistd.h>
 #include <libutil.h>
 #include <libxo/xo.h>
 
 #define VMSTAT_XO_VERSION "1"
 
 static char da[] = "da";
 
 enum x_stats { X_SUM, X_HZ, X_STATHZ, X_NCHSTATS, X_INTRNAMES, X_SINTRNAMES,
     X_INTRCNT, X_SINTRCNT, X_NINTRCNT };
 
 static struct nlist namelist[] = {
 	[X_SUM] = { .n_name = "_vm_cnt", },
 	[X_HZ] = { .n_name = "_hz", },
 	[X_STATHZ] = { .n_name = "_stathz", },
 	[X_NCHSTATS] = { .n_name = "_nchstats", },
 	[X_INTRNAMES] = { .n_name = "_intrnames", },
 	[X_SINTRNAMES] = { .n_name = "_sintrnames", },
 	[X_INTRCNT] = { .n_name = "_intrcnt", },
 	[X_SINTRCNT] = { .n_name = "_sintrcnt", },
 	[X_NINTRCNT] = { .n_name = "_nintrcnt", },
 	{ .n_name = NULL, },
 };
 
 static struct devstat_match *matches;
 static struct device_selection *dev_select;
 static struct statinfo cur, last;
 static devstat_select_mode select_mode;
 static size_t size_cp_times;
 static long *cur_cp_times, *last_cp_times;
 static long generation, select_generation;
 static int hz, hdrcnt, maxshowdevs;
 static int num_devices, num_devices_specified;
 static int num_matches, num_selected, num_selections;
 static char **specified_devices;
 
 static struct __vmmeter {
 	uint64_t v_swtch;
 	uint64_t v_trap;
 	uint64_t v_syscall;
 	uint64_t v_intr;
 	uint64_t v_soft;
 	uint64_t v_vm_faults;
 	uint64_t v_io_faults;
 	uint64_t v_cow_faults;
 	uint64_t v_cow_optim;
 	uint64_t v_zfod;
 	uint64_t v_ozfod;
 	uint64_t v_swapin;
 	uint64_t v_swapout;
 	uint64_t v_swappgsin;
 	uint64_t v_swappgsout;
 	uint64_t v_vnodein;
 	uint64_t v_vnodeout;
 	uint64_t v_vnodepgsin;
 	uint64_t v_vnodepgsout;
 	uint64_t v_intrans;
 	uint64_t v_reactivated;
 	uint64_t v_pdwakeups;
 	uint64_t v_pdpages;
 	uint64_t v_pdshortfalls;
 	uint64_t v_dfree;
 	uint64_t v_pfree;
 	uint64_t v_tfree;
 	uint64_t v_forks;
 	uint64_t v_vforks;
 	uint64_t v_rforks;
 	uint64_t v_kthreads;
 	uint64_t v_forkpages;
 	uint64_t v_vforkpages;
 	uint64_t v_rforkpages;
 	uint64_t v_kthreadpages;
 	u_int v_page_size;
 	u_int v_page_count;
 	u_int v_free_reserved;
 	u_int v_free_target;
 	u_int v_free_min;
 	u_int v_free_count;
 	u_int v_wire_count;
 	u_long v_user_wire_count;
 	u_int v_active_count;
 	u_int v_inactive_target;
 	u_int v_inactive_count;
 	u_int v_laundry_count;
 	u_int v_pageout_free_min;
 	u_int v_interrupt_free_min;
 	u_int v_free_severe;
 } sum, osum;
 
 #define	VMSTAT_DEFAULT_LINES	20	/* Default number of `winlines'. */
 static volatile sig_atomic_t wresized;		/* Tty resized when non-zero. */
 static int winlines = VMSTAT_DEFAULT_LINES; /* Current number of tty rows. */
 
 static int	aflag;
 static int	nflag;
 static int	Pflag;
 static int	hflag;
 
 static kvm_t	*kd;
 
 #define	FORKSTAT	0x01
 #define	INTRSTAT	0x02
 #define	MEMSTAT		0x04
 #define	SUMSTAT		0x08
 #define	TIMESTAT	0x10
 #define	VMSTAT		0x20
 #define	ZMEMSTAT	0x40
 #define	OBJSTAT		0x80
 
 static void	cpustats(void);
 static void	pcpustats(u_long, int);
 static void	devstats(void);
 static void	doforkst(void);
 static void	dointr(unsigned int, int);
 static void	doobjstat(void);
 static void	dosum(void);
 static void	dovmstat(unsigned int, int);
 static void	domemstat_malloc(void);
 static void	domemstat_zone(void);
 static void	kread(int, void *, size_t);
 static void	kreado(int, void *, size_t, size_t);
 static void	kreadptr(uintptr_t, void *, size_t);
 static void	needhdr(int);
 static void	needresize(int);
 static void	doresize(void);
 static void	printhdr(int, u_long);
 static void	usage(void);
 
 static long	pct(long, long);
 static long long	getuptime(void);
 
 static char	**getdrivedata(char **);
 
 int
 main(int argc, char *argv[])
 {
 	char *bp, *buf, *memf, *nlistf;
 	float f;
 	int bufsize, c, reps, todo;
 	size_t len;
 	unsigned int interval;
 	char errbuf[_POSIX2_LINE_MAX];
 
 	memf = nlistf = NULL;
 	interval = reps = todo = 0;
 	maxshowdevs = 2;
 	hflag = isatty(1);
 
 	argc = xo_parse_args(argc, argv);
 	if (argc < 0)
 		return (argc);
 
 	while ((c = getopt(argc, argv, "ac:fhHiM:mN:n:oPp:sw:z")) != -1) {
 		switch (c) {
 		case 'a':
 			aflag++;
 			break;
 		case 'c':
 			reps = atoi(optarg);
 			break;
 		case 'P':
 			Pflag++;
 			break;
 		case 'f':
 			todo |= FORKSTAT;
 			break;
 		case 'h':
 			hflag = 1;
 			break;
 		case 'H':
 			hflag = 0;
 			break;
 		case 'i':
 			todo |= INTRSTAT;
 			break;
 		case 'M':
 			memf = optarg;
 			break;
 		case 'm':
 			todo |= MEMSTAT;
 			break;
 		case 'N':
 			nlistf = optarg;
 			break;
 		case 'n':
 			nflag = 1;
 			maxshowdevs = atoi(optarg);
 			if (maxshowdevs < 0)
 				xo_errx(1, "number of devices %d is < 0",
 				    maxshowdevs);
 			break;
 		case 'o':
 			todo |= OBJSTAT;
 			break;
 		case 'p':
 			if (devstat_buildmatch(optarg, &matches, &num_matches)
 			    != 0)
 				xo_errx(1, "%s", devstat_errbuf);
 			break;
 		case 's':
 			todo |= SUMSTAT;
 			break;
 		case 'w':
 			/* Convert to milliseconds. */
 			f = atof(optarg);
 			interval = f * 1000;
 			break;
 		case 'z':
 			todo |= ZMEMSTAT;
 			break;
 		case '?':
 		default:
 			usage();
 		}
 	}
 	argc -= optind;
 	argv += optind;
 
 	xo_set_version(VMSTAT_XO_VERSION);
 	if (todo == 0)
 		todo = VMSTAT;
 
 	if (memf != NULL) {
 		kd = kvm_openfiles(nlistf, memf, NULL, O_RDONLY, errbuf);
 		if (kd == NULL)
 			xo_errx(1, "kvm_openfiles: %s", errbuf);
 	}
 
 retry_nlist:
 	if (kd != NULL && (c = kvm_nlist(kd, namelist)) != 0) {
 		if (c > 0) {
 			bufsize = 0;
 			len = 0;
 
 			/*
 			 * 'cnt' was renamed to 'vm_cnt'.  If 'vm_cnt' is not
 			 * found try looking up older 'cnt' symbol.
 			 * */
 			if (namelist[X_SUM].n_type == 0 &&
 			    strcmp(namelist[X_SUM].n_name, "_vm_cnt") == 0) {
 				namelist[X_SUM].n_name = "_cnt";
 				goto retry_nlist;
 			}
 
 			/*
 			 * 'nintrcnt' doesn't exist in older kernels, but
 			 * that isn't fatal.
 			 */
 			if (namelist[X_NINTRCNT].n_type == 0 && c == 1)
 				goto nlist_ok;
 
 			for (c = 0; c < (int)(nitems(namelist)); c++)
 				if (namelist[c].n_type == 0)
 					bufsize += strlen(namelist[c].n_name)
 					    + 1;
 			bufsize += len + 1;
 			buf = bp = alloca(bufsize);
 
 			for (c = 0; c < (int)(nitems(namelist)); c++)
 				if (namelist[c].n_type == 0) {
 					xo_error(" %s",
 					    namelist[c].n_name);
 					len = strlen(namelist[c].n_name);
 					*bp++ = ' ';
 					memcpy(bp, namelist[c].n_name, len);
 					bp += len;
 				}
 			*bp = '\0';
 			xo_error("undefined symbols:\n", buf);
 		} else
 			xo_warnx("kvm_nlist: %s", kvm_geterr(kd));
 		xo_finish();
 		exit(1);
 	}
 nlist_ok:
 	if (kd && Pflag)
 		xo_errx(1, "Cannot use -P with crash dumps");
 
 	if (todo & VMSTAT) {
 		/*
 		 * Make sure that the userland devstat version matches the
 		 * kernel devstat version.  If not, exit and print a
 		 * message informing the user of his mistake.
 		 */
 		if (devstat_checkversion(NULL) < 0)
 			xo_errx(1, "%s", devstat_errbuf);
 
 
 		argv = getdrivedata(argv);
 	}
 
 	if (*argv) {
 		f = atof(*argv);
 		interval = f * 1000;
 		if (*++argv)
 			reps = atoi(*argv);
 	}
 
 	if (interval) {
 		if (!reps)
 			reps = -1;
 	} else if (reps)
 		interval = 1 * 1000;
 
 	if (todo & FORKSTAT)
 		doforkst();
 	if (todo & MEMSTAT)
 		domemstat_malloc();
 	if (todo & ZMEMSTAT)
 		domemstat_zone();
 	if (todo & SUMSTAT)
 		dosum();
 	if (todo & OBJSTAT)
 		doobjstat();
 	if (todo & INTRSTAT)
 		dointr(interval, reps);
 	if (todo & VMSTAT)
 		dovmstat(interval, reps);
 	xo_finish();
 	exit(0);
 }
 
 static int
 mysysctl(const char *name, void *oldp, size_t *oldlenp)
 {
 	int error;
 
 	error = sysctlbyname(name, oldp, oldlenp, NULL, 0);
 	if (error != 0 && errno != ENOMEM)
 		xo_err(1, "sysctl(%s)", name);
 	return (error);
 }
 
 static char **
 getdrivedata(char **argv)
 {
 
 	if ((num_devices = devstat_getnumdevs(NULL)) < 0)
 		xo_errx(1, "%s", devstat_errbuf);
 
 	cur.dinfo = (struct devinfo *)calloc(1, sizeof(struct devinfo));
 	last.dinfo = (struct devinfo *)calloc(1, sizeof(struct devinfo));
 
 	if (devstat_getdevs(NULL, &cur) == -1)
 		xo_errx(1, "%s", devstat_errbuf);
 
 	num_devices = cur.dinfo->numdevs;
 	generation = cur.dinfo->generation;
 
 	specified_devices = malloc(sizeof(char *));
 	for (num_devices_specified = 0; *argv; ++argv) {
 		if (isdigit(**argv))
 			break;
 		num_devices_specified++;
 		specified_devices = reallocf(specified_devices,
 		    sizeof(char *) * num_devices_specified);
 		if (specified_devices == NULL) {
 			xo_errx(1, "%s", "reallocf (specified_devices)");
 		}
 		specified_devices[num_devices_specified - 1] = *argv;
 	}
 	dev_select = NULL;
 
 	if (nflag == 0 && maxshowdevs < num_devices_specified)
 		maxshowdevs = num_devices_specified;
 
 	/*
 	 * People are generally only interested in disk statistics when
 	 * they're running vmstat.  So, that's what we're going to give
 	 * them if they don't specify anything by default.  We'll also give
 	 * them any other random devices in the system so that we get to
 	 * maxshowdevs devices, if that many devices exist.  If the user
 	 * specifies devices on the command line, either through a pattern
 	 * match or by naming them explicitly, we will give the user only
 	 * those devices.
 	 */
 	if ((num_devices_specified == 0) && (num_matches == 0)) {
 		if (devstat_buildmatch(da, &matches, &num_matches) != 0)
 			xo_errx(1, "%s", devstat_errbuf);
 		select_mode = DS_SELECT_ADD;
 	} else
 		select_mode = DS_SELECT_ONLY;
 
 	/*
 	 * At this point, selectdevs will almost surely indicate that the
 	 * device list has changed, so we don't look for return values of 0
 	 * or 1.  If we get back -1, though, there is an error.
 	 */
 	if (devstat_selectdevs(&dev_select, &num_selected, &num_selections,
 	    &select_generation, generation, cur.dinfo->devices,
 	    num_devices, matches, num_matches, specified_devices,
 	    num_devices_specified, select_mode,
 	    maxshowdevs, 0) == -1)
 		xo_errx(1, "%s", devstat_errbuf);
 
 	return(argv);
 }
 
 /* Return system uptime in nanoseconds */
 static long long
 getuptime(void)
 {
 	struct timespec sp;
 
 	(void)clock_gettime(CLOCK_UPTIME, &sp);
 	return((long long)sp.tv_sec * 1000000000LL + sp.tv_nsec);
 }
 
 static void
 fill_vmmeter(struct __vmmeter *vmmp)
 {
 	struct vmmeter vm_cnt;
 	size_t size;
 
 	if (kd != NULL) {
 		kread(X_SUM, &vm_cnt, sizeof(vm_cnt));
 #define	GET_COUNTER(name) \
 		vmmp->name = kvm_counter_u64_fetch(kd, (u_long)vm_cnt.name)
 		GET_COUNTER(v_swtch);
 		GET_COUNTER(v_trap);
 		GET_COUNTER(v_syscall);
 		GET_COUNTER(v_intr);
 		GET_COUNTER(v_soft);
 		GET_COUNTER(v_vm_faults);
 		GET_COUNTER(v_io_faults);
 		GET_COUNTER(v_cow_faults);
 		GET_COUNTER(v_cow_optim);
 		GET_COUNTER(v_zfod);
 		GET_COUNTER(v_ozfod);
 		GET_COUNTER(v_swapin);
 		GET_COUNTER(v_swapout);
 		GET_COUNTER(v_swappgsin);
 		GET_COUNTER(v_swappgsout);
 		GET_COUNTER(v_vnodein);
 		GET_COUNTER(v_vnodeout);
 		GET_COUNTER(v_vnodepgsin);
 		GET_COUNTER(v_vnodepgsout);
 		GET_COUNTER(v_intrans);
 		GET_COUNTER(v_tfree);
 		GET_COUNTER(v_forks);
 		GET_COUNTER(v_vforks);
 		GET_COUNTER(v_rforks);
 		GET_COUNTER(v_kthreads);
 		GET_COUNTER(v_forkpages);
 		GET_COUNTER(v_vforkpages);
 		GET_COUNTER(v_rforkpages);
 		GET_COUNTER(v_kthreadpages);
 #undef GET_COUNTER
 	} else {
 #define GET_VM_STATS(cat, name)	do {					\
 	size = sizeof(vmmp->name);					\
 	mysysctl("vm.stats." #cat "." #name, &vmmp->name, &size);	\
 } while (0)
 		/* sys */
 		GET_VM_STATS(sys, v_swtch);
 		GET_VM_STATS(sys, v_trap);
 		GET_VM_STATS(sys, v_syscall);
 		GET_VM_STATS(sys, v_intr);
 		GET_VM_STATS(sys, v_soft);
 
 		/* vm */
 		GET_VM_STATS(vm, v_vm_faults);
 		GET_VM_STATS(vm, v_io_faults);
 		GET_VM_STATS(vm, v_cow_faults);
 		GET_VM_STATS(vm, v_cow_optim);
 		GET_VM_STATS(vm, v_zfod);
 		GET_VM_STATS(vm, v_ozfod);
 		GET_VM_STATS(vm, v_swapin);
 		GET_VM_STATS(vm, v_swapout);
 		GET_VM_STATS(vm, v_swappgsin);
 		GET_VM_STATS(vm, v_swappgsout);
 		GET_VM_STATS(vm, v_vnodein);
 		GET_VM_STATS(vm, v_vnodeout);
 		GET_VM_STATS(vm, v_vnodepgsin);
 		GET_VM_STATS(vm, v_vnodepgsout);
 		GET_VM_STATS(vm, v_intrans);
 		GET_VM_STATS(vm, v_reactivated);
 		GET_VM_STATS(vm, v_pdwakeups);
 		GET_VM_STATS(vm, v_pdpages);
 		GET_VM_STATS(vm, v_pdshortfalls);
 		GET_VM_STATS(vm, v_dfree);
 		GET_VM_STATS(vm, v_pfree);
 		GET_VM_STATS(vm, v_tfree);
 		GET_VM_STATS(vm, v_page_size);
 		GET_VM_STATS(vm, v_page_count);
 		GET_VM_STATS(vm, v_free_reserved);
 		GET_VM_STATS(vm, v_free_target);
 		GET_VM_STATS(vm, v_free_min);
 		GET_VM_STATS(vm, v_free_count);
 		GET_VM_STATS(vm, v_wire_count);
 		GET_VM_STATS(vm, v_user_wire_count);
 		GET_VM_STATS(vm, v_active_count);
 		GET_VM_STATS(vm, v_inactive_target);
 		GET_VM_STATS(vm, v_inactive_count);
 		GET_VM_STATS(vm, v_laundry_count);
 		GET_VM_STATS(vm, v_pageout_free_min);
 		GET_VM_STATS(vm, v_interrupt_free_min);
 		/*GET_VM_STATS(vm, v_free_severe);*/
 		GET_VM_STATS(vm, v_forks);
 		GET_VM_STATS(vm, v_vforks);
 		GET_VM_STATS(vm, v_rforks);
 		GET_VM_STATS(vm, v_kthreads);
 		GET_VM_STATS(vm, v_forkpages);
 		GET_VM_STATS(vm, v_vforkpages);
 		GET_VM_STATS(vm, v_rforkpages);
 		GET_VM_STATS(vm, v_kthreadpages);
 #undef GET_VM_STATS
 	}
 }
 
 static void
 fill_vmtotal(struct vmtotal *vmtp)
 {
 	size_t size;
 
 	if (kd != NULL) {
 		/* XXX fill vmtp */
 		xo_errx(1, "not implemented");
 	} else {
 		size = sizeof(*vmtp);
 		mysysctl("vm.vmtotal", vmtp, &size);
 		if (size != sizeof(*vmtp))
 			xo_errx(1, "vm.total size mismatch");
 	}
 }
 
 /* Determine how many cpu columns, and what index they are in kern.cp_times */
 static int
 getcpuinfo(u_long *maskp, int *maxidp)
 {
 	long *times;
 	u_long mask;
 	size_t size;
 	int empty, i, j, maxcpu, maxid, ncpus;
 
 	if (kd != NULL)
 		xo_errx(1, "not implemented");
 	mask = 0;
 	ncpus = 0;
 	size = sizeof(maxcpu);
 	mysysctl("kern.smp.maxcpus", &maxcpu, &size);
 	if (size != sizeof(maxcpu))
 		xo_errx(1, "sysctl kern.smp.maxcpus");
 	size = sizeof(long) * maxcpu * CPUSTATES;
 	times = malloc(size);
 	if (times == NULL)
 		xo_err(1, "malloc %zd bytes", size);
 	mysysctl("kern.cp_times", times, &size);
 	maxid = (size / CPUSTATES / sizeof(long)) - 1;
 	for (i = 0; i <= maxid; i++) {
 		empty = 1;
 		for (j = 0; empty && j < CPUSTATES; j++) {
 			if (times[i * CPUSTATES + j] != 0)
 				empty = 0;
 		}
 		if (!empty) {
 			mask |= (1ul << i);
 			ncpus++;
 		}
 	}
 	if (maskp)
 		*maskp = mask;
 	if (maxidp)
 		*maxidp = maxid;
 	return (ncpus);
 }
 
 
 static void
 prthuman(const char *name, uint64_t val, int size)
 {
 	int flags;
 	char buf[10];
 	char fmt[128];
 
 	snprintf(fmt, sizeof(fmt), "{:%s/%%*s}", name);
 
 	if (size < 5 || size > 9)
 		xo_errx(1, "doofus");
 	flags = HN_B | HN_NOSPACE | HN_DECIMAL;
 	humanize_number(buf, size, val, "", HN_AUTOSCALE, flags);
 	xo_attr("value", "%ju", (uintmax_t) val);
 	xo_emit(fmt, size, buf);
 }
 
 static void
 dovmstat(unsigned int interval, int reps)
 {
 	struct clockinfo clockrate;
 	struct vmtotal total;
 	struct devinfo *tmp_dinfo;
 	u_long cpumask;
 	size_t size;
 	time_t uptime, halfuptime;
 	int ncpus, maxid, rate_adj, retval;
 
 	uptime = getuptime() / 1000000000LL;
 	halfuptime = uptime / 2;
 	rate_adj = 1;
 	ncpus = 1;
 	maxid = 0;
 	cpumask = 0;
 
 	/*
 	 * If the user stops the program (control-Z) and then resumes it,
 	 * print out the header again.
 	 */
 	(void)signal(SIGCONT, needhdr);
 
 	/*
 	 * If our standard output is a tty, then install a SIGWINCH handler
 	 * and set wresized so that our first iteration through the main
 	 * vmstat loop will peek at the terminal's current rows to find out
 	 * how many lines can fit in a screenful of output.
 	 */
 	if (isatty(fileno(stdout)) != 0) {
 		wresized = 1;
 		(void)signal(SIGWINCH, needresize);
 	} else {
 		wresized = 0;
 		winlines = VMSTAT_DEFAULT_LINES;
 	}
 
 	if (kd != NULL) {
 		if (namelist[X_STATHZ].n_type != 0 &&
 		    namelist[X_STATHZ].n_value != 0)
 			kread(X_STATHZ, &hz, sizeof(hz));
 		if (!hz)
 			kread(X_HZ, &hz, sizeof(hz));
 	} else {
 		size = sizeof(clockrate);
 		mysysctl("kern.clockrate", &clockrate, &size);
 		if (size != sizeof(clockrate))
 			xo_errx(1, "clockrate size mismatch");
 		hz = clockrate.hz;
 	}
 
 	if (Pflag) {
 		ncpus = getcpuinfo(&cpumask, &maxid);
 		size_cp_times = sizeof(long) * (maxid + 1) * CPUSTATES;
 		cur_cp_times = calloc(1, size_cp_times);
 		last_cp_times = calloc(1, size_cp_times);
 	}
 	for (hdrcnt = 1;;) {
 		if (!--hdrcnt)
 			printhdr(maxid, cpumask);
 		if (kd != NULL) {
 			if (kvm_getcptime(kd, cur.cp_time) < 0)
 				xo_errx(1, "kvm_getcptime: %s", kvm_geterr(kd));
 		} else {
 			size = sizeof(cur.cp_time);
 			mysysctl("kern.cp_time", &cur.cp_time, &size);
 			if (size != sizeof(cur.cp_time))
 				xo_errx(1, "cp_time size mismatch");
 		}
 		if (Pflag) {
 			size = size_cp_times;
 			mysysctl("kern.cp_times", cur_cp_times, &size);
 			if (size != size_cp_times)
 				xo_errx(1, "cp_times mismatch");
 		}
 
 		tmp_dinfo = last.dinfo;
 		last.dinfo = cur.dinfo;
 		cur.dinfo = tmp_dinfo;
 		last.snap_time = cur.snap_time;
 
 		/*
 		 * Here what we want to do is refresh our device stats.
 		 * getdevs() returns 1 when the device list has changed.
 		 * If the device list has changed, we want to go through
 		 * the selection process again, in case a device that we
 		 * were previously displaying has gone away.
 		 */
 		switch (devstat_getdevs(NULL, &cur)) {
 		case -1:
 			xo_errx(1, "%s", devstat_errbuf);
 			break;
 		case 1:
 			num_devices = cur.dinfo->numdevs;
 			generation = cur.dinfo->generation;
 
 			retval = devstat_selectdevs(&dev_select, &num_selected,
 			    &num_selections, &select_generation,
 			    generation, cur.dinfo->devices,
 			    num_devices, matches, num_matches,
 			    specified_devices,
 			    num_devices_specified, select_mode,
 			    maxshowdevs, 0);
 			switch (retval) {
 			case -1:
 				xo_errx(1, "%s", devstat_errbuf);
 				break;
 			case 1:
 				printhdr(maxid, cpumask);
 				break;
 			default:
 				break;
 			}
 			break;
 		default:
 			break;
 		}
 
 		fill_vmmeter(&sum);
 		fill_vmtotal(&total);
 		xo_open_container("processes");
 		xo_emit("{:runnable/%1d} {:waiting/%ld} "
 		    "{:swapped-out/%ld}", total.t_rq - 1, total.t_dw +
 		    total.t_pw, total.t_sw);
 		xo_close_container("processes");
 		xo_open_container("memory");
 #define vmstat_pgtok(a) ((uintmax_t)(a) * (sum.v_page_size >> 10))
 #define	rate(x)	(((x) * rate_adj + halfuptime) / uptime)	/* round */
 		if (hflag) {
 			xo_emit("");
 			prthuman("available-memory",
 			    total.t_avm * (uint64_t)sum.v_page_size, 5);
 			xo_emit(" ");
 			prthuman("free-memory",
 			    total.t_free * (uint64_t)sum.v_page_size, 5);
 			xo_emit(" ");
 		} else {
 			xo_emit(" ");
 			xo_emit("{:available-memory/%7ju}",
 			    vmstat_pgtok(total.t_avm));
 			xo_emit(" ");
 			xo_emit("{:free-memory/%7ju}",
 			    vmstat_pgtok(total.t_free));
 			xo_emit(" ");
 		}
 		xo_emit("{:total-page-faults/%5lu} ",
 		    (unsigned long)rate(sum.v_vm_faults -
 		    osum.v_vm_faults));
 		xo_close_container("memory");
 
 		xo_open_container("paging-rates");
 		xo_emit("{:page-reactivated/%3lu} ",
 		    (unsigned long)rate(sum.v_reactivated -
 		    osum.v_reactivated));
 		xo_emit("{:paged-in/%3lu} ",
 		    (unsigned long)rate(sum.v_swapin + sum.v_vnodein -
 		    (osum.v_swapin + osum.v_vnodein)));
 		xo_emit("{:paged-out/%3lu} ",
 		    (unsigned long)rate(sum.v_swapout + sum.v_vnodeout -
 		    (osum.v_swapout + osum.v_vnodeout)));
 		xo_emit("{:freed/%5lu} ",
 		    (unsigned long)rate(sum.v_tfree - osum.v_tfree));
 		xo_emit("{:scanned/%4lu} ",
 		    (unsigned long)rate(sum.v_pdpages - osum.v_pdpages));
 		xo_close_container("paging-rates");
 
 		devstats();
 		xo_open_container("fault-rates");
 		xo_emit("{:interrupts/%4lu} {:system-calls/%5lu} "
 		    "{:context-switches/%5lu}",
 		    (unsigned long)rate(sum.v_intr - osum.v_intr),
 		    (unsigned long)rate(sum.v_syscall - osum.v_syscall),
 		    (unsigned long)rate(sum.v_swtch - osum.v_swtch));
 		xo_close_container("fault-rates");
 		if (Pflag)
 			pcpustats(cpumask, maxid);
 		else
 			cpustats();
 		xo_emit("\n");
 		xo_flush();
 		if (reps >= 0 && --reps <= 0)
 			break;
 		osum = sum;
 		uptime = interval;
 		rate_adj = 1000;
 		/*
 		 * We round upward to avoid losing low-frequency events
 		 * (i.e., >= 1 per interval but < 1 per millisecond).
 		 */
 		if (interval != 1)
 			halfuptime = (uptime + 1) / 2;
 		else
 			halfuptime = 0;
 		(void)usleep(interval * 1000);
 	}
 }
 
 static void
 printhdr(int maxid, u_long cpumask)
 {
 	int i, num_shown;
 
 	num_shown = MIN(num_selected, maxshowdevs);
 	if (hflag)
 		xo_emit("{T:procs}  {T:memory}       {T:/page%*s}", 19, "");
 	else
 		xo_emit("{T:procs}     {T:memory}        {T:/page%*s}", 19, "");
 	if (num_shown > 1)
 		xo_emit(" {T:/disks %*s}", num_shown * 4 - 7, "");
 	else if (num_shown == 1)
 		xo_emit("   {T:disks}");
 	xo_emit("   {T:faults}      ");
 	if (Pflag) {
 		for (i = 0; i <= maxid; i++) {
 			if (cpumask & (1ul << i))
 				xo_emit("  {T:/cpu%d}   ", i);
 		}
 		xo_emit("\n");
 	} else
 		xo_emit("   {T:cpu}\n");
 	if (hflag) {
 		xo_emit("{T:r} {T:b} {T:w}  {T:avm}   {T:fre}   {T:flt}  {T:re}"
 		    "  {T:pi}  {T:po}    {T:fr}   {T:sr} ");
 	} else {
 		xo_emit("{T:r} {T:b} {T:w}     {T:avm}     {T:fre}  {T:flt}  "
 		    "{T:re}  {T:pi}  {T:po}    {T:fr}   {T:sr} ");
 	}
 	for (i = 0; i < num_devices; i++)
 		if ((dev_select[i].selected) &&
 		    (dev_select[i].selected <= maxshowdevs))
 			xo_emit("{T:/%c%c%d} ", dev_select[i].device_name[0],
 			    dev_select[i].device_name[1],
 			    dev_select[i].unit_number);
 	xo_emit("  {T:in}    {T:sy}    {T:cs}");
 	if (Pflag) {
 		for (i = 0; i <= maxid; i++) {
 			if (cpumask & (1ul << i))
 				xo_emit(" {T:us} {T:sy} {T:id}");
 		}
 		xo_emit("\n");
 	} else
 		xo_emit(" {T:us} {T:sy} {T:id}\n");
 	if (wresized != 0)
 		doresize();
 	hdrcnt = winlines;
 }
 
 /*
  * Force a header to be prepended to the next output.
  */
 static void
 needhdr(int dummy __unused)
 {
 
 	hdrcnt = 1;
 }
 
 /*
  * When the terminal is resized, force an update of the maximum number of rows
  * printed between each header repetition.  Then force a new header to be
  * prepended to the next output.
  */
 void
 needresize(int signo __unused)
 {
 
 	wresized = 1;
 	hdrcnt = 1;
 }
 
 /*
  * Update the global `winlines' count of terminal rows.
  */
 void
 doresize(void)
 {
 	struct winsize w;
 	int status;
 
 	for (;;) {
 		status = ioctl(fileno(stdout), TIOCGWINSZ, &w);
 		if (status == -1 && errno == EINTR)
 			continue;
 		else if (status == -1)
 			xo_err(1, "ioctl");
 		if (w.ws_row > 3)
 			winlines = w.ws_row - 3;
 		else
 			winlines = VMSTAT_DEFAULT_LINES;
 		break;
 	}
 
 	/*
 	 * Inhibit doresize() calls until we are rescheduled by SIGWINCH.
 	 */
 	wresized = 0;
 }
 
 static long
 pct(long top, long bot)
 {
 	long ans;
 
 	if (bot == 0)
 		return(0);
 	ans = (quad_t)top * 100 / bot;
 	return (ans);
 }
 
 #define	PCT(top, bot) pct((long)(top), (long)(bot))
 
 static void
 dosum(void)
 {
 	struct nchstats lnchstats;
 	size_t size;
 	long nchtotal;
 
 	fill_vmmeter(&sum);
 	xo_open_container("summary-statistics");
 	xo_emit("{:context-switches/%9u} {N:cpu context switches}\n",
 	    sum.v_swtch);
 	xo_emit("{:interrupts/%9u} {N:device interrupts}\n",
 	    sum.v_intr);
 	xo_emit("{:software-interrupts/%9u} {N:software interrupts}\n",
 	    sum.v_soft);
 	xo_emit("{:traps/%9u} {N:traps}\n", sum.v_trap);
 	xo_emit("{:system-calls/%9u} {N:system calls}\n",
 	    sum.v_syscall);
 	xo_emit("{:kernel-threads/%9u} {N:kernel threads created}\n",
 	    sum.v_kthreads);
 	xo_emit("{:forks/%9u} {N: fork() calls}\n", sum.v_forks);
 	xo_emit("{:vforks/%9u} {N:vfork() calls}\n",
 	    sum.v_vforks);
 	xo_emit("{:rforks/%9u} {N:rfork() calls}\n",
 	    sum.v_rforks);
 	xo_emit("{:swap-ins/%9u} {N:swap pager pageins}\n",
 	    sum.v_swapin);
 	xo_emit("{:swap-in-pages/%9u} {N:swap pager pages paged in}\n",
 	    sum.v_swappgsin);
 	xo_emit("{:swap-outs/%9u} {N:swap pager pageouts}\n",
 	    sum.v_swapout);
 	xo_emit("{:swap-out-pages/%9u} {N:swap pager pages paged out}\n",
 	    sum.v_swappgsout);
 	xo_emit("{:vnode-page-ins/%9u} {N:vnode pager pageins}\n",
 	    sum.v_vnodein);
 	xo_emit("{:vnode-page-in-pages/%9u} {N:vnode pager pages paged in}\n",
 	    sum.v_vnodepgsin);
 	xo_emit("{:vnode-page-outs/%9u} {N:vnode pager pageouts}\n",
 	    sum.v_vnodeout);
 	xo_emit("{:vnode-page-out-pages/%9u} {N:vnode pager pages paged out}\n",
 	    sum.v_vnodepgsout);
 	xo_emit("{:page-daemon-wakeups/%9u} {N:page daemon wakeups}\n",
 	    sum.v_pdwakeups);
 	xo_emit("{:page-daemon-pages/%9u} {N:pages examined by the page "
 	    "daemon}\n", sum.v_pdpages);
 	xo_emit("{:page-reclamation-shortfalls/%9u} {N:clean page reclamation "
 	    "shortfalls}\n", sum.v_pdshortfalls);
 	xo_emit("{:reactivated/%9u} {N:pages reactivated by the page daemon}\n",
 	    sum.v_reactivated);
 	xo_emit("{:copy-on-write-faults/%9u} {N:copy-on-write faults}\n",
 	    sum.v_cow_faults);
 	xo_emit("{:copy-on-write-optimized-faults/%9u} {N:copy-on-write "
 	    "optimized faults}\n", sum.v_cow_optim);
 	xo_emit("{:zero-fill-pages/%9u} {N:zero fill pages zeroed}\n",
 	    sum.v_zfod);
 	xo_emit("{:zero-fill-prezeroed/%9u} {N:zero fill pages prezeroed}\n",
 	    sum.v_ozfod);
 	xo_emit("{:intransit-blocking/%9u} {N:intransit blocking page faults}\n",
 	    sum.v_intrans);
 	xo_emit("{:total-faults/%9u} {N:total VM faults taken}\n",
 	    sum.v_vm_faults);
 	xo_emit("{:faults-requiring-io/%9u} {N:page faults requiring I\\/O}\n",
 	    sum.v_io_faults);
 	xo_emit("{:faults-from-thread-creation/%9u} {N:pages affected by "
 	    "kernel thread creation}\n", sum.v_kthreadpages);
 	xo_emit("{:faults-from-fork/%9u} {N:pages affected by  fork}()\n",
 	    sum.v_forkpages);
 	xo_emit("{:faults-from-vfork/%9u} {N:pages affected by vfork}()\n",
 	    sum.v_vforkpages);
 	xo_emit("{:pages-rfork/%9u} {N:pages affected by rfork}()\n",
 	    sum.v_rforkpages);
 	xo_emit("{:pages-freed/%9u} {N:pages freed}\n",
 	    sum.v_tfree);
 	xo_emit("{:pages-freed-by-daemon/%9u} {N:pages freed by daemon}\n",
 	    sum.v_dfree);
 	xo_emit("{:pages-freed-on-exit/%9u} {N:pages freed by exiting processes}\n",
 	    sum.v_pfree);
 	xo_emit("{:active-pages/%9u} {N:pages active}\n",
 	    sum.v_active_count);
 	xo_emit("{:inactive-pages/%9u} {N:pages inactive}\n",
 	    sum.v_inactive_count);
 	xo_emit("{:laundry-pages/%9u} {N:pages in the laundry queue}\n",
 	    sum.v_laundry_count);
 	xo_emit("{:wired-pages/%9u} {N:pages wired down}\n",
 	    sum.v_wire_count);
 	xo_emit("{:virtual-user-wired-pages/%9lu} {N:virtual user pages wired "
 	    "down}\n", sum.v_user_wire_count);
 	xo_emit("{:free-pages/%9u} {N:pages free}\n",
 	    sum.v_free_count);
 	xo_emit("{:bytes-per-page/%9u} {N:bytes per page}\n", sum.v_page_size);
 	if (kd != NULL) {
 		kread(X_NCHSTATS, &lnchstats, sizeof(lnchstats));
 	} else {
 		size = sizeof(lnchstats);
 		mysysctl("vfs.cache.nchstats", &lnchstats, &size);
 		if (size != sizeof(lnchstats))
 			xo_errx(1, "vfs.cache.nchstats size mismatch");
 	}
 	nchtotal = lnchstats.ncs_goodhits + lnchstats.ncs_neghits +
 	    lnchstats.ncs_badhits + lnchstats.ncs_falsehits +
 	    lnchstats.ncs_miss + lnchstats.ncs_long;
 	xo_emit("{:total-name-lookups/%9ld} {N:total name lookups}\n",
 	    nchtotal);
 	xo_emit("{P:/%9s} {N:cache hits} "
 	    "({:positive-cache-hits/%ld}% pos + "
 	    "{:negative-cache-hits/%ld}% {N:neg}) "
 	    "system {:cache-hit-percent/%ld}% per-directory\n",
 	    "", PCT(lnchstats.ncs_goodhits, nchtotal),
 	    PCT(lnchstats.ncs_neghits, nchtotal),
 	    PCT(lnchstats.ncs_pass2, nchtotal));
 	xo_emit("{P:/%9s} {L:deletions} {:deletions/%ld}%, "
 	    "{L:falsehits} {:false-hits/%ld}%, "
 	    "{L:toolong} {:too-long/%ld}%\n", "",
 	    PCT(lnchstats.ncs_badhits, nchtotal),
 	    PCT(lnchstats.ncs_falsehits, nchtotal),
 	    PCT(lnchstats.ncs_long, nchtotal));
 	xo_close_container("summary-statistics");
 }
 
 static void
 doforkst(void)
 {
 
 	fill_vmmeter(&sum);
 	xo_open_container("fork-statistics");
 	xo_emit("{:fork/%u} {N:forks}, {:fork-pages/%u} {N:pages}, "
 	    "{L:average} {:fork-average/%.2f}\n",
 	    sum.v_forks, sum.v_forkpages,
 	    sum.v_forks == 0 ? 0.0 :
 	    (double)sum.v_forkpages / sum.v_forks);
 	xo_emit("{:vfork/%u} {N:vforks}, {:vfork-pages/%u} {N:pages}, "
 	    "{L:average} {:vfork-average/%.2f}\n",
 	    sum.v_vforks, sum.v_vforkpages,
 	    sum.v_vforks == 0 ? 0.0 :
 	    (double)sum.v_vforkpages / sum.v_vforks);
 	xo_emit("{:rfork/%u} {N:rforks}, {:rfork-pages/%u} {N:pages}, "
 	    "{L:average} {:rfork-average/%.2f}\n",
 	    sum.v_rforks, sum.v_rforkpages,
 	    sum.v_rforks == 0 ? 0.0 :
 	    (double)sum.v_rforkpages / sum.v_rforks);
 	xo_close_container("fork-statistics");
 }
 
 static void
 devstats(void)
 {
 	long double busy_seconds, transfers_per_second;
 	long tmp;
 	int di, dn, state;
 
 	for (state = 0; state < CPUSTATES; ++state) {
 		tmp = cur.cp_time[state];
 		cur.cp_time[state] -= last.cp_time[state];
 		last.cp_time[state] = tmp;
 	}
 
 	busy_seconds = cur.snap_time - last.snap_time;
 
 	xo_open_list("device");
 	for (dn = 0; dn < num_devices; dn++) {
 		if (dev_select[dn].selected == 0 ||
 		    dev_select[dn].selected > maxshowdevs)
 			continue;
 
 		di = dev_select[dn].position;
 
 		if (devstat_compute_statistics(&cur.dinfo->devices[di],
 		    &last.dinfo->devices[di], busy_seconds,
 		    DSM_TRANSFERS_PER_SECOND, &transfers_per_second,
 		    DSM_NONE) != 0)
 			xo_errx(1, "%s", devstat_errbuf);
 
 		xo_open_instance("device");
 		xo_emit("{ekq:name/%c%c%d}{:transfers/%3.0Lf} ",
 		    dev_select[dn].device_name[0],
 		    dev_select[dn].device_name[1],
 		    dev_select[dn].unit_number,
 		    transfers_per_second);
 		xo_close_instance("device");
 	}
 	xo_close_list("device");
 }
 
 static void
 percent(const char *name, double pctv, int *over)
 {
 	int l;
 	char buf[10];
 	char fmt[128];
 
 	snprintf(fmt, sizeof(fmt), " {:%s/%%*s}", name);
 	l = snprintf(buf, sizeof(buf), "%.0f", pctv);
 	if (l == 1 && *over) {
 		xo_emit(fmt, 1, buf);
 		(*over)--;
 	} else
 		xo_emit(fmt, 2, buf);
 	if (l > 2)
 		(*over)++;
 }
 
 static void
 cpustats(void)
 {
 	double lpct, total;
 	int state, over;
 
 	total = 0;
 	for (state = 0; state < CPUSTATES; ++state)
 		total += cur.cp_time[state];
 	if (total > 0)
 		lpct = 100.0 / total;
 	else
 		lpct = 0.0;
 	over = 0;
 	xo_open_container("cpu-statistics");
 	percent("user", (cur.cp_time[CP_USER] + cur.cp_time[CP_NICE]) * lpct,
 	    &over);
 	percent("system", (cur.cp_time[CP_SYS] + cur.cp_time[CP_INTR]) * lpct,
 	    &over);
 	percent("idle", cur.cp_time[CP_IDLE] * lpct, &over);
 	xo_close_container("cpu-statistics");
 }
 
 static void
 pcpustats(u_long cpumask, int maxid)
 {
 	double lpct, total;
 	long tmp;
 	int i, over, state;
 
 	/* devstats does this for cp_time */
 	for (i = 0; i <= maxid; i++) {
 		if ((cpumask & (1ul << i)) == 0)
 			continue;
 		for (state = 0; state < CPUSTATES; ++state) {
 			tmp = cur_cp_times[i * CPUSTATES + state];
 			cur_cp_times[i * CPUSTATES + state] -= last_cp_times[i *
 			    CPUSTATES + state];
 			last_cp_times[i * CPUSTATES + state] = tmp;
 		}
 	}
 
 	over = 0;
 	xo_open_list("cpu");
 	for (i = 0; i <= maxid; i++) {
 		if ((cpumask & (1ul << i)) == 0)
 			continue;
 		xo_open_instance("cpu");
 		xo_emit("{ke:name/%d}", i);
 		total = 0;
 		for (state = 0; state < CPUSTATES; ++state)
 			total += cur_cp_times[i * CPUSTATES + state];
 		if (total)
 			lpct = 100.0 / total;
 		else
 			lpct = 0.0;
 		percent("user", (cur_cp_times[i * CPUSTATES + CP_USER] +
 		    cur_cp_times[i * CPUSTATES + CP_NICE]) * lpct, &over);
 		percent("system", (cur_cp_times[i * CPUSTATES + CP_SYS] +
 		    cur_cp_times[i * CPUSTATES + CP_INTR]) * lpct, &over);
 		percent("idle", cur_cp_times[i * CPUSTATES + CP_IDLE] * lpct,
 		    &over);
 		xo_close_instance("cpu");
 	}
 	xo_close_list("cpu");
 }
 
 static unsigned int
 read_intrcnts(unsigned long **intrcnts)
 {
 	size_t intrcntlen;
 	uintptr_t kaddr;
 
 	if (kd != NULL) {
 		kread(X_SINTRCNT, &intrcntlen, sizeof(intrcntlen));
 		if ((*intrcnts = malloc(intrcntlen)) == NULL)
 			err(1, "malloc()");
 		if (namelist[X_NINTRCNT].n_type == 0)
 			kread(X_INTRCNT, *intrcnts, intrcntlen);
 		else {
 			kread(X_INTRCNT, &kaddr, sizeof(kaddr));
 			kreadptr(kaddr, *intrcnts, intrcntlen);
 		}
 	} else {
 		for (*intrcnts = NULL, intrcntlen = 1024; ; intrcntlen *= 2) {
 			*intrcnts = reallocf(*intrcnts, intrcntlen);
 			if (*intrcnts == NULL)
 				err(1, "reallocf()");
 			if (mysysctl("hw.intrcnt", *intrcnts, &intrcntlen) == 0)
 				break;
 		}
 	}
 
 	return (intrcntlen / sizeof(unsigned long));
 }
 
 static void
 print_intrcnts(unsigned long *intrcnts, unsigned long *old_intrcnts,
     char *intrnames, unsigned int nintr, size_t istrnamlen, long long period_ms)
 {
 	unsigned long *intrcnt, *old_intrcnt;
 	char *intrname;
 	uint64_t inttotal, old_inttotal, total_count, total_rate;
 	unsigned long count, rate;
 	unsigned int i;
 
 	inttotal = 0;
 	old_inttotal = 0;
 	intrname = intrnames;
 	xo_open_list("interrupt");
 	for (i = 0, intrcnt=intrcnts, old_intrcnt=old_intrcnts; i < nintr; i++) {
 		if (intrname[0] != '\0' && (*intrcnt != 0 || aflag)) {
 			count = *intrcnt - *old_intrcnt;
 			rate = ((uint64_t)count * 1000 + period_ms / 2) / period_ms;
 			xo_open_instance("interrupt");
 			xo_emit("{d:name/%-*s}{ket:name/%s} "
 			    "{:total/%20lu} {:rate/%10lu}\n",
 			    (int)istrnamlen, intrname, intrname, count, rate);
 			xo_close_instance("interrupt");
 		}
 		intrname += strlen(intrname) + 1;
 		inttotal += *intrcnt++;
 		old_inttotal += *old_intrcnt++;
 	}
 	total_count = inttotal - old_inttotal;
 	total_rate = (total_count * 1000 + period_ms / 2) / period_ms;
 	xo_close_list("interrupt");
 	xo_emit("{L:/%-*s} {:total-interrupts/%20ju} "
 	    "{:total-rate/%10ju}\n", (int)istrnamlen,
 	    "Total", (uintmax_t)total_count, (uintmax_t)total_rate);
 }
 
 static void
 dointr(unsigned int interval, int reps)
 {
 	unsigned long *intrcnts, *old_intrcnts;
 	char *intrname, *intrnames;
 	long long period_ms, old_uptime, uptime;
 	size_t clen, inamlen, istrnamlen;
 	uintptr_t kaddr;
 	unsigned int nintr;
 
 	old_intrcnts = NULL;
 	uptime = getuptime();
 
 	/* Get the names of each interrupt source */
 	if (kd != NULL) {
 		kread(X_SINTRNAMES, &inamlen, sizeof(inamlen));
 		if ((intrnames = malloc(inamlen)) == NULL)
 			xo_err(1, "malloc()");
 		if (namelist[X_NINTRCNT].n_type == 0)
 			kread(X_INTRNAMES, intrnames, inamlen);
 		else {
 			kread(X_INTRNAMES, &kaddr, sizeof(kaddr));
 			kreadptr(kaddr, intrnames, inamlen);
 		}
 	} else {
 		for (intrnames = NULL, inamlen = 1024; ; inamlen *= 2) {
 			if ((intrnames = reallocf(intrnames, inamlen)) == NULL)
 				xo_err(1, "reallocf()");
 			if (mysysctl("hw.intrnames", intrnames, &inamlen) == 0)
 				break;
 		}
 	}
 
 	/* Determine the length of the longest interrupt name */
 	intrname = intrnames;
 	istrnamlen = strlen("interrupt");
 	while(*intrname != '\0') {
 		clen = strlen(intrname);
 		if (clen > istrnamlen)
 			istrnamlen = clen;
 		intrname += strlen(intrname) + 1;
 	}
 	xo_emit("{T:/%-*s} {T:/%20s} {T:/%10s}\n",
 	    (int)istrnamlen, "interrupt", "total", "rate");
 
 	/* 
 	 * Loop reps times printing differential interrupt counts.  If reps is
 	 * zero, then run just once, printing total counts
 	 */
 	xo_open_container("interrupt-statistics");
 
 	period_ms = uptime / 1000000;
 	while(1) {
 		nintr = read_intrcnts(&intrcnts);
 		/* 
 		 * Initialize old_intrcnts to 0 for the first pass, so
 		 * print_intrcnts will print total interrupts since boot
 		 */
 		if (old_intrcnts == NULL) {
 			old_intrcnts = calloc(nintr, sizeof(unsigned long));
 			if (old_intrcnts == NULL)
 				xo_err(1, "calloc()");
 		}
 
 		print_intrcnts(intrcnts, old_intrcnts, intrnames, nintr,
 		    istrnamlen, period_ms);
 		xo_flush();
 
 		free(old_intrcnts);
 		old_intrcnts = intrcnts;
 		if (reps >= 0 && --reps <= 0)
 			break;
 		usleep(interval * 1000);
 		old_uptime = uptime;
 		uptime = getuptime();
 		period_ms = (uptime - old_uptime) / 1000000;
 	}
 
 	xo_close_container("interrupt-statistics");
 }
 
 static void
 domemstat_malloc(void)
 {
 	struct memory_type_list *mtlp;
 	struct memory_type *mtp;
 	int error, first, i;
 
 	mtlp = memstat_mtl_alloc();
 	if (mtlp == NULL) {
 		xo_warn("memstat_mtl_alloc");
 		return;
 	}
 	if (kd == NULL) {
 		if (memstat_sysctl_malloc(mtlp, 0) < 0) {
 			xo_warnx("memstat_sysctl_malloc: %s",
 			    memstat_strerror(memstat_mtl_geterror(mtlp)));
 			return;
 		}
 	} else {
 		if (memstat_kvm_malloc(mtlp, kd) < 0) {
 			error = memstat_mtl_geterror(mtlp);
 			if (error == MEMSTAT_ERROR_KVM)
 				xo_warnx("memstat_kvm_malloc: %s",
 				    kvm_geterr(kd));
 			else
 				xo_warnx("memstat_kvm_malloc: %s",
 				    memstat_strerror(error));
 		}
 	}
 	xo_open_container("malloc-statistics");
 	xo_emit("{T:/%13s} {T:/%5s} {T:/%6s} {T:/%7s} {T:/%8s}  {T:Size(s)}\n",
 	    "Type", "InUse", "MemUse", "HighUse", "Requests");
 	xo_open_list("memory");
 	for (mtp = memstat_mtl_first(mtlp); mtp != NULL;
 	    mtp = memstat_mtl_next(mtp)) {
 		if (memstat_get_numallocs(mtp) == 0 &&
 		    memstat_get_count(mtp) == 0)
 			continue;
 		xo_open_instance("memory");
 		xo_emit("{k:type/%13s/%s} {:in-use/%5ju} "
 		    "{:memory-use/%5ju}{U:K} {:high-use/%7s} "
 		    "{:requests/%8ju}  ",
 		    memstat_get_name(mtp), (uintmax_t)memstat_get_count(mtp),
 		    ((uintmax_t)memstat_get_bytes(mtp) + 1023) / 1024, "-",
 		    (uintmax_t)memstat_get_numallocs(mtp));
 		first = 1;
 		xo_open_list("size");
 		for (i = 0; i < 32; i++) {
 			if (memstat_get_sizemask(mtp) & (1 << i)) {
 				if (!first)
 					xo_emit(",");
 				xo_emit("{l:size/%d}", 1 << (i + 4));
 				first = 0;
 			}
 		}
 		xo_close_list("size");
 		xo_close_instance("memory");
 		xo_emit("\n");
 	}
 	xo_close_list("memory");
 	xo_close_container("malloc-statistics");
 	memstat_mtl_free(mtlp);
 }
 
 static void
 domemstat_zone(void)
 {
 	struct memory_type_list *mtlp;
 	struct memory_type *mtp;
 	int error;
 	char name[MEMTYPE_MAXNAME + 1];
 
 	mtlp = memstat_mtl_alloc();
 	if (mtlp == NULL) {
 		xo_warn("memstat_mtl_alloc");
 		return;
 	}
 	if (kd == NULL) {
 		if (memstat_sysctl_uma(mtlp, 0) < 0) {
 			xo_warnx("memstat_sysctl_uma: %s",
 			    memstat_strerror(memstat_mtl_geterror(mtlp)));
 			return;
 		}
 	} else {
 		if (memstat_kvm_uma(mtlp, kd) < 0) {
 			error = memstat_mtl_geterror(mtlp);
 			if (error == MEMSTAT_ERROR_KVM)
 				xo_warnx("memstat_kvm_uma: %s",
 				    kvm_geterr(kd));
 			else
 				xo_warnx("memstat_kvm_uma: %s",
 				    memstat_strerror(error));
 		}
 	}
 	xo_open_container("memory-zone-statistics");
-	xo_emit("{T:/%-20s} {T:/%6s} {T:/%6s} {T:/%8s} {T:/%8s} {T:/%8s} "
+	xo_emit("{T:/%-20s} {T:/%6s} {T:/%6s} {T:/%8s} {T:/%8s} {T:/%8s} {T:/%8s}"
 	    "{T:/%4s} {T:/%4s}\n\n", "ITEM", "SIZE",
-	    "LIMIT", "USED", "FREE", "REQ", "FAIL", "SLEEP");
+	    "LIMIT", "USED", "FREE", "REQ", "FAIL", "SLEEP", "XDOMAIN");
 	xo_open_list("zone");
 	for (mtp = memstat_mtl_first(mtlp); mtp != NULL;
 	    mtp = memstat_mtl_next(mtp)) {
 		strlcpy(name, memstat_get_name(mtp), MEMTYPE_MAXNAME);
 		strcat(name, ":");
 		xo_open_instance("zone");
 		xo_emit("{d:name/%-20s}{ke:name/%s} {:size/%6ju}, "
 		    "{:limit/%6ju},{:used/%8ju},"
 		    "{:free/%8ju},{:requests/%8ju},"
-		    "{:fail/%4ju},{:sleep/%4ju}\n", name,
+		    "{:fail/%4ju},{:sleep/%4ju},{:xdomain/%4ju}\n", name,
 		    memstat_get_name(mtp),
 		    (uintmax_t)memstat_get_size(mtp),
 		    (uintmax_t)memstat_get_countlimit(mtp),
 		    (uintmax_t)memstat_get_count(mtp),
 		    (uintmax_t)memstat_get_free(mtp),
 		    (uintmax_t)memstat_get_numallocs(mtp),
 		    (uintmax_t)memstat_get_failures(mtp),
-		    (uintmax_t)memstat_get_sleeps(mtp));
+		    (uintmax_t)memstat_get_sleeps(mtp),
+		    (uintmax_t)memstat_get_xdomain(mtp));
 		xo_close_instance("zone");
 	}
 	memstat_mtl_free(mtlp);
 	xo_close_list("zone");
 	xo_close_container("memory-zone-statistics");
 	xo_emit("\n");
 }
 
 static void
 display_object(struct kinfo_vmobject *kvo)
 {
 	const char *str;
 
 	xo_open_instance("object");
 	xo_emit("{:resident/%5ju} ", (uintmax_t)kvo->kvo_resident);
 	xo_emit("{:active/%5ju} ", (uintmax_t)kvo->kvo_active);
 	xo_emit("{:inactive/%5ju} ", (uintmax_t)kvo->kvo_inactive);
 	xo_emit("{:refcount/%3d} ", kvo->kvo_ref_count);
 	xo_emit("{:shadowcount/%3d} ", kvo->kvo_shadow_count);
 	switch (kvo->kvo_memattr) {
 #ifdef VM_MEMATTR_UNCACHEABLE
 	case VM_MEMATTR_UNCACHEABLE:
 		str = "UC";
 		break;
 #endif
 #ifdef VM_MEMATTR_WRITE_COMBINING
 	case VM_MEMATTR_WRITE_COMBINING:
 		str = "WC";
 		break;
 #endif
 #ifdef VM_MEMATTR_WRITE_THROUGH
 	case VM_MEMATTR_WRITE_THROUGH:
 		str = "WT";
 		break;
 #endif
 #ifdef VM_MEMATTR_WRITE_PROTECTED
 	case VM_MEMATTR_WRITE_PROTECTED:
 		str = "WP";
 		break;
 #endif
 #ifdef VM_MEMATTR_WRITE_BACK
 	case VM_MEMATTR_WRITE_BACK:
 		str = "WB";
 		break;
 #endif
 #ifdef VM_MEMATTR_WEAK_UNCACHEABLE
 	case VM_MEMATTR_WEAK_UNCACHEABLE:
 		str = "UC-";
 		break;
 #endif
 #ifdef VM_MEMATTR_WB_WA
 	case VM_MEMATTR_WB_WA:
 		str = "WB";
 		break;
 #endif
 #ifdef VM_MEMATTR_NOCACHE
 	case VM_MEMATTR_NOCACHE:
 		str = "NC";
 		break;
 #endif
 #ifdef VM_MEMATTR_DEVICE
 	case VM_MEMATTR_DEVICE:
 		str = "DEV";
 		break;
 #endif
 #ifdef VM_MEMATTR_CACHEABLE
 	case VM_MEMATTR_CACHEABLE:
 		str = "C";
 		break;
 #endif
 #ifdef VM_MEMATTR_PREFETCHABLE
 	case VM_MEMATTR_PREFETCHABLE:
 		str = "PRE";
 		break;
 #endif
 	default:
 		str = "??";
 		break;
 	}
 	xo_emit("{:attribute/%-3s} ", str);
 	switch (kvo->kvo_type) {
 	case KVME_TYPE_NONE:
 		str = "--";
 		break;
 	case KVME_TYPE_DEFAULT:
 		str = "df";
 		break;
 	case KVME_TYPE_VNODE:
 		str = "vn";
 		break;
 	case KVME_TYPE_SWAP:
 		str = "sw";
 		break;
 	case KVME_TYPE_DEVICE:
 		str = "dv";
 		break;
 	case KVME_TYPE_PHYS:
 		str = "ph";
 		break;
 	case KVME_TYPE_DEAD:
 		str = "dd";
 		break;
 	case KVME_TYPE_SG:
 		str = "sg";
 		break;
 	case KVME_TYPE_MGTDEVICE:
 		str = "md";
 		break;
 	case KVME_TYPE_UNKNOWN:
 	default:
 		str = "??";
 		break;
 	}
 	xo_emit("{:type/%-2s} ", str);
 	xo_emit("{:path/%-s}\n", kvo->kvo_path);
 	xo_close_instance("object");
 }
 
 static void
 doobjstat(void)
 {
 	struct kinfo_vmobject *kvo;
 	int cnt, i;
 
 	kvo = kinfo_getvmobject(&cnt);
 	if (kvo == NULL) {
 		xo_warn("Failed to fetch VM object list");
 		return;
 	}
 	xo_emit("{T:RES/%5s} {T:ACT/%5s} {T:INACT/%5s} {T:REF/%3s} {T:SHD/%3s} "
 	    "{T:CM/%3s} {T:TP/%2s} {T:PATH/%s}\n");
 	xo_open_list("object");
 	for (i = 0; i < cnt; i++)
 		display_object(&kvo[i]);
 	free(kvo);
 	xo_close_list("object");
 }
 
 /*
  * kread reads something from the kernel, given its nlist index.
  */
 static void
 kreado(int nlx, void *addr, size_t size, size_t offset)
 {
 	const char *sym;
 
 	if (namelist[nlx].n_type == 0 || namelist[nlx].n_value == 0) {
 		sym = namelist[nlx].n_name;
 		if (*sym == '_')
 			++sym;
 		xo_errx(1, "symbol %s not defined", sym);
 	}
 	if ((size_t)kvm_read(kd, namelist[nlx].n_value + offset, addr,
 	    size) != size) {
 		sym = namelist[nlx].n_name;
 		if (*sym == '_')
 			++sym;
 		xo_errx(1, "%s: %s", sym, kvm_geterr(kd));
 	}
 }
 
 static void
 kread(int nlx, void *addr, size_t size)
 {
 
 	kreado(nlx, addr, size, 0);
 }
 
 static void
 kreadptr(uintptr_t addr, void *buf, size_t size)
 {
 
 	if ((size_t)kvm_read(kd, addr, buf, size) != size)
 		xo_errx(1, "%s", kvm_geterr(kd));
 }
 
 static void __dead2
 usage(void)
 {
 	xo_error("%s%s",
 	    "usage: vmstat [-afHhimoPsz] [-M core [-N system]] [-c count] [-n devs]\n",
 	    "              [-p type,if,pass] [-w wait] [disks] [wait [count]]\n");
 	xo_finish();
 	exit(1);
 }