Index: head/share/man/man9/malloc.9
===================================================================
--- head/share/man/man9/malloc.9	(revision 335067)
+++ head/share/man/man9/malloc.9	(revision 335068)
@@ -1,311 +1,316 @@
 .\"
 .\" Copyright (c) 1996 The NetBSD Foundation, Inc.
 .\" All rights reserved.
 .\"
 .\" This code is derived from software contributed to The NetBSD Foundation
 .\" by Paul Kranenburg.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 .\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 .\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 .\" PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE
 .\" LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 .\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 .\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 .\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 .\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 .\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 .\" POSSIBILITY OF SUCH DAMAGE.
 .\"
 .\" $NetBSD: malloc.9,v 1.3 1996/11/11 00:05:11 lukem Exp $
 .\" $FreeBSD$
 .\"
-.Dd January 24, 2018
+.Dd June 13, 2018
 .Dt MALLOC 9
 .Os
 .Sh NAME
 .Nm malloc ,
 .Nm free ,
 .Nm realloc ,
 .Nm reallocf ,
 .Nm MALLOC_DEFINE ,
 .Nm MALLOC_DECLARE
 .Nd kernel memory management routines
 .Sh SYNOPSIS
 .In sys/types.h
 .In sys/malloc.h
 .Ft void *
 .Fn malloc "size_t size" "struct malloc_type *type" "int flags"
 .Ft void *
 .Fn malloc_domain "size_t size" "struct malloc_type *type" "int domain" "int flags"
 .Ft void *
 .Fn mallocarray "size_t nmemb" "size_t size" "struct malloc_type *type" "int flags"
 .Ft void
 .Fn free "void *addr" "struct malloc_type *type"
 .Ft void
 .Fn free_domain "void *addr" "struct malloc_type *type"
 .Ft void *
 .Fn realloc "void *addr" "size_t size" "struct malloc_type *type" "int flags"
 .Ft void *
 .Fn reallocf "void *addr" "size_t size" "struct malloc_type *type" "int flags"
 .Fn MALLOC_DECLARE type
 .In sys/param.h
 .In sys/malloc.h
 .In sys/kernel.h
 .Fn MALLOC_DEFINE type shortdesc longdesc
 .Sh DESCRIPTION
 The
 .Fn malloc
 function allocates uninitialized memory in kernel address space for an
 object whose size is specified by
 .Fa size .
 .Pp
 The
 .Fn malloc_domain
 variant allocates the object from the specified memory domain.  Memory allocated
 with this function should be returned with
 .Fn free_domain .
 See
 .Xr numa 9 for more details.
 .Pp
 The
 .Fn mallocarray
 function allocates uninitialized memory in kernel address space for an
 array of
 .Fa nmemb
 entries whose size is specified by
 .Fa size .
 .Pp
 The
 .Fn free
 function releases memory at address
 .Fa addr
 that was previously allocated by
 .Fn malloc
 for re-use.
 The memory is not zeroed.
 If
 .Fa addr
 is
 .Dv NULL ,
 then
 .Fn free
 does nothing.
 .Pp
 The
 .Fn realloc
 function changes the size of the previously allocated memory referenced by
 .Fa addr
 to
 .Fa size
 bytes.
 The contents of the memory are unchanged up to the lesser of the new and
 old sizes.
 Note that the returned value may differ from
 .Fa addr .
 If the requested memory cannot be allocated,
 .Dv NULL
 is returned and the memory referenced by
 .Fa addr
 is valid and unchanged.
 If
 .Fa addr
 is
 .Dv NULL ,
 the
 .Fn realloc
 function behaves identically to
 .Fn malloc
 for the specified size.
 .Pp
 The
 .Fn reallocf
 function is identical to
 .Fn realloc
 except that it
 will free the passed pointer when the requested memory cannot be allocated.
 .Pp
 Unlike its standard C library counterpart
 .Pq Xr malloc 3 ,
 the kernel version takes two more arguments.
 The
 .Fa flags
 argument further qualifies
 .Fn malloc Ns 's
 operational characteristics as follows:
 .Bl -tag -width indent
 .It Dv M_ZERO
 Causes the allocated memory to be set to all zeros.
 .It Dv M_NODUMP
 For allocations greater than page size, causes the allocated
 memory to be excluded from kernel core dumps.
 .It Dv M_NOWAIT
 Causes
 .Fn malloc ,
 .Fn realloc ,
 and
 .Fn reallocf
 to return
 .Dv NULL
 if the request cannot be immediately fulfilled due to resource shortage.
 Note that
 .Dv M_NOWAIT
 is required when running in an interrupt context.
 .It Dv M_WAITOK
 Indicates that it is OK to wait for resources.
 If the request cannot be immediately fulfilled, the current process is put
 to sleep to wait for resources to be released by other processes.
 The
 .Fn malloc ,
 .Fn mallocarray ,
 .Fn realloc ,
 and
 .Fn reallocf
 functions cannot return
 .Dv NULL
 if
 .Dv M_WAITOK
 is specified.
 If the multiplication of
 .Fa nmemb
 and
 .Fa size
 would cause an integer overflow, the
 .Fn mallocarray
 function induces a panic.
 .It Dv M_USE_RESERVE
 Indicates that the system can use its reserve of memory to satisfy the
 request.
 This option should only be used in combination with
 .Dv M_NOWAIT
 when an allocation failure cannot be tolerated by the caller without
 catastrophic effects on the system.
+.It Dv M_EXEC
+Indicates that the system should allocate executable memory.
+If this flag is not set, the system will not allocate executable memory.
+Not all platforms enforce a distinction between executable and
+non-executable memory.
 .El
 .Pp
 Exactly one of either
 .Dv M_WAITOK
 or
 .Dv M_NOWAIT
 must be specified.
 .Pp
 The
 .Fa type
 argument is used to perform statistics on memory usage, and for
 basic sanity checks.
 It can be used to identify multiple allocations.
 The statistics can be examined by
 .Sq vmstat -m .
 .Pp
 A
 .Fa type
 is defined using
 .Vt "struct malloc_type"
 via the
 .Fn MALLOC_DECLARE
 and
 .Fn MALLOC_DEFINE
 macros.
 .Bd -literal -offset indent
 /* sys/something/foo_extern.h */
 
 MALLOC_DECLARE(M_FOOBUF);
 
 /* sys/something/foo_main.c */
 
 MALLOC_DEFINE(M_FOOBUF, "foobuffers", "Buffers to foo data into the ether");
 
 /* sys/something/foo_subr.c */
 
 \&...
 buf = malloc(sizeof(*buf), M_FOOBUF, M_NOWAIT);
 
 .Ed
 .Pp
 In order to use
 .Fn MALLOC_DEFINE ,
 one must include
 .In sys/param.h
 (instead of
 .In sys/types.h )
 and
 .In sys/kernel.h .
 .Sh CONTEXT
 .Fn malloc ,
 .Fn realloc
 and
 .Fn reallocf
 may not be called from fast interrupts handlers.
 When called from threaded interrupts,
 .Fa flags
 must contain
 .Dv M_NOWAIT .
 .Pp
 .Fn malloc ,
 .Fn realloc
 and
 .Fn reallocf
 may sleep when called with
 .Dv M_WAITOK .
 .Fn free
 never sleeps.
 However,
 .Fn malloc ,
 .Fn realloc ,
 .Fn reallocf
 and
 .Fn free
 may not be called in a critical section or while holding a spin lock.
 .Pp
 Any calls to
 .Fn malloc
 (even with
 .Dv M_NOWAIT )
 or
 .Fn free
 when holding a
 .Xr vnode 9
 interlock, will cause a LOR (Lock Order Reversal) due to the
 intertwining of VM Objects and Vnodes.
 .Sh IMPLEMENTATION NOTES
 The memory allocator allocates memory in chunks that have size a power
 of two for requests up to the size of a page of memory.
 For larger requests, one or more pages is allocated.
 While it should not be relied upon, this information may be useful for
 optimizing the efficiency of memory use.
 .Sh RETURN VALUES
 The
 .Fn malloc ,
 .Fn realloc ,
 and
 .Fn reallocf
 functions return a kernel virtual address that is suitably aligned for
 storage of any type of object, or
 .Dv NULL
 if the request could not be satisfied (implying that
 .Dv M_NOWAIT
 was set).
 .Sh DIAGNOSTICS
 A kernel compiled with the
 .Dv INVARIANTS
 configuration option attempts to detect memory corruption caused by
 such things as writing outside the allocated area and imbalanced calls to the
 .Fn malloc
 and
 .Fn free
 functions.
 Failing consistency checks will cause a panic or a system console
 message.
 .Sh SEE ALSO
 .Xr vmstat 8 ,
 .Xr contigmalloc 9 ,
 .Xr memguard 9 ,
 .Xr vnode 9
Index: head/share/man/man9/zone.9
===================================================================
--- head/share/man/man9/zone.9	(revision 335067)
+++ head/share/man/man9/zone.9	(revision 335068)
@@ -1,397 +1,406 @@
 .\"-
 .\" Copyright (c) 2001 Dag-Erling Coïdan Smørgrav
 .\" All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
-.Dd April 26, 2017
+.Dd June 13, 2018
 .Dt ZONE 9
 .Os
 .Sh NAME
 .Nm uma_zcreate ,
 .Nm uma_zalloc ,
 .Nm uma_zalloc_arg ,
 .Nm uma_zalloc_domain ,
 .Nm uma_zfree ,
 .Nm uma_zfree_arg ,
 .Nm uma_zfree_domain ,
 .Nm uma_zdestroy ,
 .Nm uma_zone_set_max ,
 .Nm uma_zone_get_max ,
 .Nm uma_zone_get_cur ,
 .Nm uma_zone_set_warning ,
 .Nm uma_zone_set_maxaction
 .Nd zone allocator
 .Sh SYNOPSIS
 .In sys/param.h
 .In sys/queue.h
 .In vm/uma.h
 .Ft uma_zone_t
 .Fo uma_zcreate
 .Fa "char *name" "int size"
 .Fa "uma_ctor ctor" "uma_dtor dtor" "uma_init uminit" "uma_fini fini"
 .Fa "int align" "uint16_t flags"
 .Fc
 .Ft "void *"
 .Fn uma_zalloc "uma_zone_t zone" "int flags"
 .Ft "void *"
 .Fn uma_zalloc_arg "uma_zone_t zone" "void *arg" "int flags"
 .Ft "void *"
 .Fn uma_zalloc_domain "uma_zone_t zone" "void *arg" "int domain" "int flags"
 .Ft void
 .Fn uma_zfree "uma_zone_t zone" "void *item"
 .Ft void
 .Fn uma_zfree_arg "uma_zone_t zone" "void *item" "void *arg"
 .Ft void
 .Fn uma_zfree_domain "uma_zone_t zone" "void *item" "void *arg"
 .Ft void
 .Fn uma_zdestroy "uma_zone_t zone"
 .Ft int
 .Fn uma_zone_set_max "uma_zone_t zone" "int nitems"
 .Ft int
 .Fn uma_zone_get_max "uma_zone_t zone"
 .Ft int
 .Fn uma_zone_get_cur "uma_zone_t zone"
 .Ft void
 .Fn uma_zone_set_warning "uma_zone_t zone" "const char *warning"
 .Ft void
 .Fn uma_zone_set_maxaction "uma_zone_t zone" "void (*maxaction)(uma_zone_t)"
 .In sys/sysctl.h
 .Fn SYSCTL_UMA_MAX parent nbr name access zone descr
 .Fn SYSCTL_ADD_UMA_MAX ctx parent nbr name access zone descr
 .Fn SYSCTL_UMA_CUR parent nbr name access zone descr
 .Fn SYSCTL_ADD_UMA_CUR ctx parent nbr name access zone descr
 .Sh DESCRIPTION
 The zone allocator provides an efficient interface for managing
 dynamically-sized collections of items of identical size.
 The zone allocator can work with preallocated zones as well as with
 runtime-allocated ones, and is therefore available much earlier in the
 boot process than other memory management routines.  The zone allocator
 provides per-cpu allocation caches with linear scalability on SMP
 systems as well as round-robin and first-touch policies for NUMA
 systems.
 .Pp
 A zone is an extensible collection of items of identical size.
 The zone allocator keeps track of which items are in use and which
 are not, and provides functions for allocating items from the zone and
 for releasing them back (which makes them available for later use).
 .Pp
 After the first allocation of an item,
 it will have been cleared to zeroes, however subsequent allocations
 will retain the contents as of the last free.
 .Pp
 The
 .Fn uma_zcreate
 function creates a new zone from which items may then be allocated from.
 The
 .Fa name
 argument is a text name of the zone for debugging and stats; this memory
 should not be freed until the zone has been deallocated.
 .Pp
 The
 .Fa ctor
 and
 .Fa dtor
 arguments are callback functions that are called by
 the uma subsystem at the time of the call to
 .Fn uma_zalloc
 and
 .Fn uma_zfree
 respectively.
 Their purpose is to provide hooks for initializing or
 destroying things that need to be done at the time of the allocation
 or release of a resource.
 A good usage for the
 .Fa ctor
 and
 .Fa dtor
 callbacks
 might be to adjust a global count of the number of objects allocated.
 .Pp
 The
 .Fa uminit
 and
 .Fa fini
 arguments are used to optimize the allocation of
 objects from the zone.
 They are called by the uma subsystem whenever
 it needs to allocate or free several items to satisfy requests or memory
 pressure.
 A good use for the
 .Fa uminit
 and
 .Fa fini
 callbacks might be to
 initialize and destroy mutexes contained within the object.
 This would
 allow one to re-use already initialized mutexes when an object is returned
 from the uma subsystem's object cache.
 They are not called on each call to
 .Fn uma_zalloc
 and
 .Fn uma_zfree
 but rather in a batch mode on several objects.
 .Pp
 The
 .Fa flags
 argument of the
 .Fn uma_zcreate
 is a subset of the following flags:
 .Bl -tag -width "foo"
 .It Dv UMA_ZONE_NOFREE
 Slabs of the zone are never returned back to VM.
 .It Dv UMA_ZONE_NODUMP
 Pages belonging to the zone will not be included into mini-dumps.
 .It Dv UMA_ZONE_PCPU
 An allocation from zone would have
 .Va mp_ncpu
 shadow copies, that are privately assigned to CPUs.
 A CPU can address its private copy using base allocation address plus
 multiple of current CPU id and
 .Fn sizeof "struct pcpu" :
 .Bd -literal -offset indent
 foo_zone = uma_zcreate(..., UMA_ZONE_PCPU);
  ...
 foo_base = uma_zalloc(foo_zone, ...);
  ...
 critical_enter();
 foo_pcpu = (foo_t *)zpcpu_get(foo_base);
 /* do something with foo_pcpu */
 critical_exit();
 .Ed
 .It Dv UMA_ZONE_OFFPAGE
 By default book-keeping of items within a slab is done in the slab page itself.
 This flag explicitly tells subsystem that book-keeping structure should be
 allocated separately from special internal zone.
 This flag requires either
 .Dv UMA_ZONE_VTOSLAB
 or
 .Dv UMA_ZONE_HASH ,
 since subsystem requires a mechanism to find a book-keeping structure
 to an item being freed.
 The subsystem may choose to prefer offpage book-keeping for certain zones
 implicitly.
 .It Dv UMA_ZONE_ZINIT
 The zone will have its
 .Ft uma_init
 method set to internal method that initializes a new allocated slab
 to all zeros.
 Do not mistake
 .Ft uma_init
 method with
 .Ft uma_ctor .
 A zone with
 .Dv UMA_ZONE_ZINIT
 flag would not return zeroed memory on every
 .Fn uma_zalloc .
 .It Dv UMA_ZONE_HASH
 The zone should use an internal hash table to find slab book-keeping
 structure where an allocation being freed belongs to.
 .It Dv UMA_ZONE_VTOSLAB
 The zone should use special field of
 .Vt vm_page_t
 to find slab book-keeping structure where an allocation being freed belongs to.
 .It Dv UMA_ZONE_MALLOC
 The zone is for the
 .Xr malloc 9
 subsystem.
 .It Dv UMA_ZONE_VM
 The zone is for the VM subsystem.
 .It Dv UMA_ZONE_NUMA
 The zone should use a first-touch NUMA policy rather than the round-robin
 default. Callers that do not free memory on the same domain it is allocated
 from will cause mixing in per-cpu caches.  See
 .Xr numa 9 for more details.
 .El
 .Pp
 To allocate an item from a zone, simply call
 .Fn uma_zalloc
 with a pointer to that zone
 and set the
 .Fa flags
 argument to selected flags as documented in
 .Xr malloc 9 .
 It will return a pointer to an item if successful,
 or
 .Dv NULL
 in the rare case where all items in the zone are in use and the
 allocator is unable to grow the zone
 and
 .Dv M_NOWAIT
 is specified.
 .Pp
 Items are released back to the zone from which they were allocated by
 calling
 .Fn uma_zfree
 with a pointer to the zone and a pointer to the item.
 If
 .Fa item
 is
 .Dv NULL ,
 then
 .Fn uma_zfree
 does nothing.
 .Pp
 The variations
 .Fn uma_zalloc_arg
 and
 .Fn uma_zfree_arg
 allow callers to
 specify an argument for the
 .Dv ctor
 and
 .Dv dtor
 functions, respectively.
 The 
 .Fn uma_zalloc_domain
 function allows callers to specify a fixed
 .Xr numa 9 domain to allocate from.  This uses a guaranteed but slow path in
 the allocator which reduces concurrency.  The 
 .Fn uma_zfree_domain
 function should be used to return memory allocated in this fashion.  This
 function infers the domain from the pointer and does not require it as an
 argument.
 .Pp
 Created zones,
 which are empty,
 can be destroyed using
 .Fn uma_zdestroy ,
 freeing all memory that was allocated for the zone.
 All items allocated from the zone with
 .Fn uma_zalloc
 must have been freed with
 .Fn uma_zfree
 before.
 .Pp
 The
 .Fn uma_zone_set_max
 function limits the number of items
 .Pq and therefore memory
 that can be allocated to
 .Fa zone .
 The
 .Fa nitems
 argument specifies the requested upper limit number of items.
 The effective limit is returned to the caller, as it may end up being higher
 than requested due to the implementation rounding up to ensure all memory pages
 allocated to the zone are utilised to capacity.
 The limit applies to the total number of items in the zone, which includes
 allocated items, free items and free items in the per-cpu caches.
 On systems with more than one CPU it may not be possible to allocate
 the specified number of items even when there is no shortage of memory,
 because all of the remaining free items may be in the caches of the
 other CPUs when the limit is hit.
 .Pp
 The
 .Fn uma_zone_get_max
 function returns the effective upper limit number of items for a zone.
 .Pp
 The
 .Fn uma_zone_get_cur
 function returns the approximate current occupancy of the zone.
 The returned value is approximate because appropriate synchronisation to
 determine an exact value is not performed by the implementation.
 This ensures low overhead at the expense of potentially stale data being used
 in the calculation.
 .Pp
 The
 .Fn uma_zone_set_warning
 function sets a warning that will be printed on the system console when the
 given zone becomes full and fails to allocate an item.
 The warning will be printed no more often than every five minutes.
 Warnings can be turned off globally by setting the
 .Va vm.zone_warnings
 sysctl tunable to
 .Va 0 .
 .Pp
 The
 .Fn uma_zone_set_maxaction
 function sets a function that will be called when the given zone becomes full
 and fails to allocate an item.
 The function will be called with the zone locked.
 Also, the function
 that called the allocation function may have held additional locks.
 Therefore,
 this function should do very little work (similar to a signal handler).
 .Pp
 The
 .Fn SYSCTL_UMA_MAX parent nbr name access zone descr
 macro declares a static
 .Xr sysctl
 oid that exports the effective upper limit number of items for a zone.
 The
 .Fa zone
 argument should be a pointer to
 .Vt uma_zone_t .
 A read of the oid returns value obtained through
 .Fn uma_zone_get_max .
 A write to the oid sets new value via
 .Fn uma_zone_set_max .
 The
 .Fn SYSCTL_ADD_UMA_MAX ctx parent nbr name access zone descr
 macro is provided to create this type of oid dynamically.
 .Pp
 The
 .Fn SYSCTL_UMA_CUR parent nbr name access zone descr
 macro declares a static read-only
 .Xr sysctl
 oid that exports the approximate current occupancy of the zone.
 The
 .Fa zone
 argument should be a pointer to
 .Vt uma_zone_t .
 A read of the oid returns value obtained through
 .Fn uma_zone_get_cur .
 The
 .Fn SYSCTL_ADD_UMA_CUR ctx parent nbr name zone descr
 macro is provided to create this type of oid dynamically.
 .Sh RETURN VALUES
 The
 .Fn uma_zalloc
 function returns a pointer to an item, or
 .Dv NULL
 if the zone ran out of unused items
 and
 .Dv M_NOWAIT
 was specified.
+.Sh IMPLEMENTATION NOTES
+The memory that these allocation calls return is not executable.
+The
+.Fn uma_zalloc
+function does not support the
+.Dv M_EXEC
+flag to allocate executable memory.
+Not all platforms enforce a distinction between executable and
+non-executable memory.
 .Sh SEE ALSO
 .Xr malloc 9
 .Sh HISTORY
 The zone allocator first appeared in
 .Fx 3.0 .
 It was radically changed in
 .Fx 5.0
 to function as a slab allocator.
 .Sh AUTHORS
 .An -nosplit
 The zone allocator was written by
 .An John S. Dyson .
 The zone allocator was rewritten in large parts by
 .An Jeff Roberson Aq Mt jeff@FreeBSD.org
 to function as a slab allocator.
 .Pp
 This manual page was written by
 .An Dag-Erling Sm\(/orgrav Aq Mt des@FreeBSD.org .
 Changes for UMA by
 .An Jeroen Ruigrok van der Werven Aq Mt asmodai@FreeBSD.org .
Index: head/sys/amd64/amd64/bpf_jit_machdep.c
===================================================================
--- head/sys/amd64/amd64/bpf_jit_machdep.c	(revision 335067)
+++ head/sys/amd64/amd64/bpf_jit_machdep.c	(revision 335068)
@@ -1,671 +1,653 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 2002-2003 NetGroup, Politecnico di Torino (Italy)
  * Copyright (C) 2005-2017 Jung-uk Kim <jkim@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  * notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  * notice, this list of conditions and the following disclaimer in the
  * documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the Politecnico di Torino nor the names of its
  * contributors may be used to endorse or promote products derived from
  * this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #ifdef _KERNEL
 #include "opt_bpf.h"
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 
 #include <net/if.h>
-#include <vm/vm.h>
-#include <vm/vm_extern.h>
-#include <vm/vm_kern.h>
 #else
 #include <stdlib.h>
 #include <string.h>
 #include <sys/mman.h>
 #include <sys/param.h>
 #endif
 
 #include <sys/types.h>
 
 #include <net/bpf.h>
 #include <net/bpf_jitter.h>
 
 #include <amd64/amd64/bpf_jit_machdep.h>
 
 /*
  * Emit routine to update the jump table.
  */
 static void
 emit_length(bpf_bin_stream *stream, __unused u_int value, u_int len)
 {
 
 	if (stream->refs != NULL)
 		(stream->refs)[stream->bpf_pc] += len;
 	stream->cur_ip += len;
 }
 
 /*
  * Emit routine to output the actual binary code.
  */
 static void
 emit_code(bpf_bin_stream *stream, u_int value, u_int len)
 {
 
 	switch (len) {
 	case 1:
 		stream->ibuf[stream->cur_ip] = (u_char)value;
 		stream->cur_ip++;
 		break;
 
 	case 2:
 		*((u_short *)(void *)(stream->ibuf + stream->cur_ip)) =
 		    (u_short)value;
 		stream->cur_ip += 2;
 		break;
 
 	case 4:
 		*((u_int *)(void *)(stream->ibuf + stream->cur_ip)) = value;
 		stream->cur_ip += 4;
 		break;
 	}
 
 	return;
 }
 
 /*
  * Scan the filter program and find possible optimization.
  */
 static int
 bpf_jit_optimize(struct bpf_insn *prog, u_int nins)
 {
 	int flags;
 	u_int i;
 
 	/* Do we return immediately? */
 	if (BPF_CLASS(prog[0].code) == BPF_RET)
 		return (BPF_JIT_FRET);
 
 	for (flags = 0, i = 0; i < nins; i++) {
 		switch (prog[i].code) {
 		case BPF_LD|BPF_W|BPF_ABS:
 		case BPF_LD|BPF_H|BPF_ABS:
 		case BPF_LD|BPF_B|BPF_ABS:
 		case BPF_LD|BPF_W|BPF_IND:
 		case BPF_LD|BPF_H|BPF_IND:
 		case BPF_LD|BPF_B|BPF_IND:
 		case BPF_LDX|BPF_MSH|BPF_B:
 			flags |= BPF_JIT_FPKT;
 			break;
 		case BPF_LD|BPF_MEM:
 		case BPF_LDX|BPF_MEM:
 		case BPF_ST:
 		case BPF_STX:
 			flags |= BPF_JIT_FMEM;
 			break;
 		case BPF_LD|BPF_W|BPF_LEN:
 		case BPF_LDX|BPF_W|BPF_LEN:
 			flags |= BPF_JIT_FLEN;
 			break;
 		case BPF_JMP|BPF_JA:
 		case BPF_JMP|BPF_JGT|BPF_K:
 		case BPF_JMP|BPF_JGE|BPF_K:
 		case BPF_JMP|BPF_JEQ|BPF_K:
 		case BPF_JMP|BPF_JSET|BPF_K:
 		case BPF_JMP|BPF_JGT|BPF_X:
 		case BPF_JMP|BPF_JGE|BPF_X:
 		case BPF_JMP|BPF_JEQ|BPF_X:
 		case BPF_JMP|BPF_JSET|BPF_X:
 			flags |= BPF_JIT_FJMP;
 			break;
 		}
 		if (flags == BPF_JIT_FLAG_ALL)
 			break;
 	}
 
 	return (flags);
 }
 
 /*
  * Function that does the real stuff.
  */
 bpf_filter_func
 bpf_jit_compile(struct bpf_insn *prog, u_int nins, size_t *size)
 {
 	bpf_bin_stream stream;
 	struct bpf_insn *ins;
 	int flags, fret, fpkt, fmem, fjmp, flen;
 	u_int i, pass;
 
 	/*
 	 * NOTE: Do not modify the name of this variable, as it's used by
 	 * the macros to emit code.
 	 */
 	emit_func emitm;
 
 	flags = bpf_jit_optimize(prog, nins);
 	fret = (flags & BPF_JIT_FRET) != 0;
 	fpkt = (flags & BPF_JIT_FPKT) != 0;
 	fmem = (flags & BPF_JIT_FMEM) != 0;
 	fjmp = (flags & BPF_JIT_FJMP) != 0;
 	flen = (flags & BPF_JIT_FLEN) != 0;
 
 	if (fret)
 		nins = 1;
 
 	memset(&stream, 0, sizeof(stream));
 
 	/* Allocate the reference table for the jumps. */
 	if (fjmp) {
 #ifdef _KERNEL
 		stream.refs = malloc((nins + 1) * sizeof(u_int), M_BPFJIT,
 		    M_NOWAIT | M_ZERO);
 #else
 		stream.refs = calloc(nins + 1, sizeof(u_int));
 #endif
 		if (stream.refs == NULL)
 			return (NULL);
 	}
 
 	/*
 	 * The first pass will emit the lengths of the instructions
 	 * to create the reference table.
 	 */
 	emitm = emit_length;
 
 	for (pass = 0; pass < 2; pass++) {
 		ins = prog;
 
 		/* Create the procedure header. */
 		if (fmem) {
 			PUSH(RBP);
 			MOVrq(RSP, RBP);
 			SUBib(BPF_MEMWORDS * sizeof(uint32_t), RSP);
 		}
 		if (flen)
 			MOVrd2(ESI, R9D);
 		if (fpkt) {
 			MOVrq2(RDI, R8);
 			MOVrd(EDX, EDI);
 		}
 
 		for (i = 0; i < nins; i++) {
 			stream.bpf_pc++;
 
 			switch (ins->code) {
 			default:
 #ifdef _KERNEL
 				return (NULL);
 #else
 				abort();
 #endif
 
 			case BPF_RET|BPF_K:
 				MOVid(ins->k, EAX);
 				if (fmem)
 					LEAVE();
 				RET();
 				break;
 
 			case BPF_RET|BPF_A:
 				if (fmem)
 					LEAVE();
 				RET();
 				break;
 
 			case BPF_LD|BPF_W|BPF_ABS:
 				MOVid(ins->k, ESI);
 				CMPrd(EDI, ESI);
 				JAb(12);
 				MOVrd(EDI, ECX);
 				SUBrd(ESI, ECX);
 				CMPid(sizeof(int32_t), ECX);
 				if (fmem) {
 					JAEb(4);
 					ZEROrd(EAX);
 					LEAVE();
 				} else {
 					JAEb(3);
 					ZEROrd(EAX);
 				}
 				RET();
 				MOVrq3(R8, RCX);
 				MOVobd(RCX, RSI, EAX);
 				BSWAP(EAX);
 				break;
 
 			case BPF_LD|BPF_H|BPF_ABS:
 				ZEROrd(EAX);
 				MOVid(ins->k, ESI);
 				CMPrd(EDI, ESI);
 				JAb(12);
 				MOVrd(EDI, ECX);
 				SUBrd(ESI, ECX);
 				CMPid(sizeof(int16_t), ECX);
 				if (fmem) {
 					JAEb(2);
 					LEAVE();
 				} else
 					JAEb(1);
 				RET();
 				MOVrq3(R8, RCX);
 				MOVobw(RCX, RSI, AX);
 				SWAP_AX();
 				break;
 
 			case BPF_LD|BPF_B|BPF_ABS:
 				ZEROrd(EAX);
 				MOVid(ins->k, ESI);
 				CMPrd(EDI, ESI);
 				if (fmem) {
 					JBb(2);
 					LEAVE();
 				} else
 					JBb(1);
 				RET();
 				MOVrq3(R8, RCX);
 				MOVobb(RCX, RSI, AL);
 				break;
 
 			case BPF_LD|BPF_W|BPF_LEN:
 				MOVrd3(R9D, EAX);
 				break;
 
 			case BPF_LDX|BPF_W|BPF_LEN:
 				MOVrd3(R9D, EDX);
 				break;
 
 			case BPF_LD|BPF_W|BPF_IND:
 				CMPrd(EDI, EDX);
 				JAb(27);
 				MOVid(ins->k, ESI);
 				MOVrd(EDI, ECX);
 				SUBrd(EDX, ECX);
 				CMPrd(ESI, ECX);
 				JBb(14);
 				ADDrd(EDX, ESI);
 				MOVrd(EDI, ECX);
 				SUBrd(ESI, ECX);
 				CMPid(sizeof(int32_t), ECX);
 				if (fmem) {
 					JAEb(4);
 					ZEROrd(EAX);
 					LEAVE();
 				} else {
 					JAEb(3);
 					ZEROrd(EAX);
 				}
 				RET();
 				MOVrq3(R8, RCX);
 				MOVobd(RCX, RSI, EAX);
 				BSWAP(EAX);
 				break;
 
 			case BPF_LD|BPF_H|BPF_IND:
 				ZEROrd(EAX);
 				CMPrd(EDI, EDX);
 				JAb(27);
 				MOVid(ins->k, ESI);
 				MOVrd(EDI, ECX);
 				SUBrd(EDX, ECX);
 				CMPrd(ESI, ECX);
 				JBb(14);
 				ADDrd(EDX, ESI);
 				MOVrd(EDI, ECX);
 				SUBrd(ESI, ECX);
 				CMPid(sizeof(int16_t), ECX);
 				if (fmem) {
 					JAEb(2);
 					LEAVE();
 				} else
 					JAEb(1);
 				RET();
 				MOVrq3(R8, RCX);
 				MOVobw(RCX, RSI, AX);
 				SWAP_AX();
 				break;
 
 			case BPF_LD|BPF_B|BPF_IND:
 				ZEROrd(EAX);
 				CMPrd(EDI, EDX);
 				JAEb(13);
 				MOVid(ins->k, ESI);
 				MOVrd(EDI, ECX);
 				SUBrd(EDX, ECX);
 				CMPrd(ESI, ECX);
 				if (fmem) {
 					JAb(2);
 					LEAVE();
 				} else
 					JAb(1);
 				RET();
 				MOVrq3(R8, RCX);
 				ADDrd(EDX, ESI);
 				MOVobb(RCX, RSI, AL);
 				break;
 
 			case BPF_LDX|BPF_MSH|BPF_B:
 				MOVid(ins->k, ESI);
 				CMPrd(EDI, ESI);
 				if (fmem) {
 					JBb(4);
 					ZEROrd(EAX);
 					LEAVE();
 				} else {
 					JBb(3);
 					ZEROrd(EAX);
 				}
 				RET();
 				ZEROrd(EDX);
 				MOVrq3(R8, RCX);
 				MOVobb(RCX, RSI, DL);
 				ANDib(0x0f, DL);
 				SHLib(2, EDX);
 				break;
 
 			case BPF_LD|BPF_IMM:
 				MOVid(ins->k, EAX);
 				break;
 
 			case BPF_LDX|BPF_IMM:
 				MOVid(ins->k, EDX);
 				break;
 
 			case BPF_LD|BPF_MEM:
 				MOVid(ins->k * sizeof(uint32_t), ESI);
 				MOVobd(RSP, RSI, EAX);
 				break;
 
 			case BPF_LDX|BPF_MEM:
 				MOVid(ins->k * sizeof(uint32_t), ESI);
 				MOVobd(RSP, RSI, EDX);
 				break;
 
 			case BPF_ST:
 				/*
 				 * XXX this command and the following could
 				 * be optimized if the previous instruction
 				 * was already of this type
 				 */
 				MOVid(ins->k * sizeof(uint32_t), ESI);
 				MOVomd(EAX, RSP, RSI);
 				break;
 
 			case BPF_STX:
 				MOVid(ins->k * sizeof(uint32_t), ESI);
 				MOVomd(EDX, RSP, RSI);
 				break;
 
 			case BPF_JMP|BPF_JA:
 				JUMP(ins->k);
 				break;
 
 			case BPF_JMP|BPF_JGT|BPF_K:
 			case BPF_JMP|BPF_JGE|BPF_K:
 			case BPF_JMP|BPF_JEQ|BPF_K:
 			case BPF_JMP|BPF_JSET|BPF_K:
 			case BPF_JMP|BPF_JGT|BPF_X:
 			case BPF_JMP|BPF_JGE|BPF_X:
 			case BPF_JMP|BPF_JEQ|BPF_X:
 			case BPF_JMP|BPF_JSET|BPF_X:
 				if (ins->jt == ins->jf) {
 					JUMP(ins->jt);
 					break;
 				}
 				switch (ins->code) {
 				case BPF_JMP|BPF_JGT|BPF_K:
 					CMPid(ins->k, EAX);
 					JCC(JA, JBE);
 					break;
 
 				case BPF_JMP|BPF_JGE|BPF_K:
 					CMPid(ins->k, EAX);
 					JCC(JAE, JB);
 					break;
 
 				case BPF_JMP|BPF_JEQ|BPF_K:
 					CMPid(ins->k, EAX);
 					JCC(JE, JNE);
 					break;
 
 				case BPF_JMP|BPF_JSET|BPF_K:
 					TESTid(ins->k, EAX);
 					JCC(JNE, JE);
 					break;
 
 				case BPF_JMP|BPF_JGT|BPF_X:
 					CMPrd(EDX, EAX);
 					JCC(JA, JBE);
 					break;
 
 				case BPF_JMP|BPF_JGE|BPF_X:
 					CMPrd(EDX, EAX);
 					JCC(JAE, JB);
 					break;
 
 				case BPF_JMP|BPF_JEQ|BPF_X:
 					CMPrd(EDX, EAX);
 					JCC(JE, JNE);
 					break;
 
 				case BPF_JMP|BPF_JSET|BPF_X:
 					TESTrd(EDX, EAX);
 					JCC(JNE, JE);
 					break;
 				}
 				break;
 
 			case BPF_ALU|BPF_ADD|BPF_X:
 				ADDrd(EDX, EAX);
 				break;
 
 			case BPF_ALU|BPF_SUB|BPF_X:
 				SUBrd(EDX, EAX);
 				break;
 
 			case BPF_ALU|BPF_MUL|BPF_X:
 				MOVrd(EDX, ECX);
 				MULrd(EDX);
 				MOVrd(ECX, EDX);
 				break;
 
 			case BPF_ALU|BPF_DIV|BPF_X:
 			case BPF_ALU|BPF_MOD|BPF_X:
 				TESTrd(EDX, EDX);
 				if (fmem) {
 					JNEb(4);
 					ZEROrd(EAX);
 					LEAVE();
 				} else {
 					JNEb(3);
 					ZEROrd(EAX);
 				}
 				RET();
 				MOVrd(EDX, ECX);
 				ZEROrd(EDX);
 				DIVrd(ECX);
 				if (BPF_OP(ins->code) == BPF_MOD)
 					MOVrd(EDX, EAX);
 				MOVrd(ECX, EDX);
 				break;
 
 			case BPF_ALU|BPF_AND|BPF_X:
 				ANDrd(EDX, EAX);
 				break;
 
 			case BPF_ALU|BPF_OR|BPF_X:
 				ORrd(EDX, EAX);
 				break;
 
 			case BPF_ALU|BPF_XOR|BPF_X:
 				XORrd(EDX, EAX);
 				break;
 
 			case BPF_ALU|BPF_LSH|BPF_X:
 				MOVrd(EDX, ECX);
 				SHL_CLrb(EAX);
 				break;
 
 			case BPF_ALU|BPF_RSH|BPF_X:
 				MOVrd(EDX, ECX);
 				SHR_CLrb(EAX);
 				break;
 
 			case BPF_ALU|BPF_ADD|BPF_K:
 				ADD_EAXi(ins->k);
 				break;
 
 			case BPF_ALU|BPF_SUB|BPF_K:
 				SUB_EAXi(ins->k);
 				break;
 
 			case BPF_ALU|BPF_MUL|BPF_K:
 				MOVrd(EDX, ECX);
 				MOVid(ins->k, EDX);
 				MULrd(EDX);
 				MOVrd(ECX, EDX);
 				break;
 
 			case BPF_ALU|BPF_DIV|BPF_K:
 			case BPF_ALU|BPF_MOD|BPF_K:
 				MOVrd(EDX, ECX);
 				ZEROrd(EDX);
 				MOVid(ins->k, ESI);
 				DIVrd(ESI);
 				if (BPF_OP(ins->code) == BPF_MOD)
 					MOVrd(EDX, EAX);
 				MOVrd(ECX, EDX);
 				break;
 
 			case BPF_ALU|BPF_AND|BPF_K:
 				ANDid(ins->k, EAX);
 				break;
 
 			case BPF_ALU|BPF_OR|BPF_K:
 				ORid(ins->k, EAX);
 				break;
 
 			case BPF_ALU|BPF_XOR|BPF_K:
 				XORid(ins->k, EAX);
 				break;
 
 			case BPF_ALU|BPF_LSH|BPF_K:
 				SHLib((ins->k) & 0xff, EAX);
 				break;
 
 			case BPF_ALU|BPF_RSH|BPF_K:
 				SHRib((ins->k) & 0xff, EAX);
 				break;
 
 			case BPF_ALU|BPF_NEG:
 				NEGd(EAX);
 				break;
 
 			case BPF_MISC|BPF_TAX:
 				MOVrd(EAX, EDX);
 				break;
 
 			case BPF_MISC|BPF_TXA:
 				MOVrd(EDX, EAX);
 				break;
 			}
 			ins++;
 		}
 
 		if (pass > 0)
 			continue;
 
 		*size = stream.cur_ip;
 #ifdef _KERNEL
-		/*
-		 * We cannot use malloc(9) because DMAP is mapped as NX.
-		 */
-		stream.ibuf = (void *)kmem_malloc(kernel_arena, *size,
-		    M_NOWAIT);
+		stream.ibuf = malloc(*size, M_BPFJIT, M_EXEC | M_NOWAIT);
 		if (stream.ibuf == NULL)
 			break;
 #else
 		stream.ibuf = mmap(NULL, *size, PROT_READ | PROT_WRITE,
 		    MAP_ANON, -1, 0);
 		if (stream.ibuf == MAP_FAILED) {
 			stream.ibuf = NULL;
 			break;
 		}
 #endif
 
 		/*
 		 * Modify the reference table to contain the offsets and
 		 * not the lengths of the instructions.
 		 */
 		if (fjmp)
 			for (i = 1; i < nins + 1; i++)
 				stream.refs[i] += stream.refs[i - 1];
 
 		/* Reset the counters. */
 		stream.cur_ip = 0;
 		stream.bpf_pc = 0;
 
 		/* The second pass creates the actual code. */
 		emitm = emit_code;
 	}
 
 	/*
 	 * The reference table is needed only during compilation,
 	 * now we can free it.
 	 */
 	if (fjmp)
 #ifdef _KERNEL
 		free(stream.refs, M_BPFJIT);
 #else
 		free(stream.refs);
 #endif
 
 #ifndef _KERNEL
 	if (stream.ibuf != NULL &&
 	    mprotect(stream.ibuf, *size, PROT_READ | PROT_EXEC) != 0) {
 		munmap(stream.ibuf, *size);
 		stream.ibuf = NULL;
 	}
 #endif
 
 	return ((bpf_filter_func)(void *)stream.ibuf);
-}
-
-void
-bpf_jit_free(void *func, size_t size)
-{
-
-#ifdef _KERNEL
-	kmem_free(kernel_arena, (vm_offset_t)func, size);
-#else
-	munmap(func, size);
-#endif
 }
Index: head/sys/i386/i386/bpf_jit_machdep.c
===================================================================
--- head/sys/i386/i386/bpf_jit_machdep.c	(revision 335067)
+++ head/sys/i386/i386/bpf_jit_machdep.c	(revision 335068)
@@ -1,694 +1,683 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 2002-2003 NetGroup, Politecnico di Torino (Italy)
  * Copyright (C) 2005-2017 Jung-uk Kim <jkim@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  * notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  * notice, this list of conditions and the following disclaimer in the
  * documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the Politecnico di Torino nor the names of its
  * contributors may be used to endorse or promote products derived from
  * this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #ifdef _KERNEL
 #include "opt_bpf.h"
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 
 #include <net/if.h>
 #else
 #include <stdlib.h>
 #include <string.h>
 #include <sys/mman.h>
 #include <sys/param.h>
 #endif
 
 #include <sys/types.h>
 
 #include <net/bpf.h>
 #include <net/bpf_jitter.h>
 
 #include <i386/i386/bpf_jit_machdep.h>
 
 /*
  * First-pass emitter: nothing is written, only sizes are accumulated.
  * The running code size (cur_ip) grows by len, and when a reference
  * table is present the length is also added to the slot of the BPF
  * instruction currently being translated.  The value argument is unused.
  */
 static void
 emit_length(bpf_bin_stream *stream, __unused u_int value, u_int len)
 {
 
 	stream->cur_ip += len;
 	if (stream->refs == NULL)
 		return;
 	stream->refs[stream->bpf_pc] += len;
 }
 
 /*
  * Second-pass emitter: store the low len bytes of value at the current
  * position of the instruction buffer and advance cur_ip by len.
  * Only widths of 1, 2 and 4 bytes are handled; any other width is
  * ignored and leaves the stream untouched.
  */
 static void
 emit_code(bpf_bin_stream *stream, u_int value, u_int len)
 {
 	void *dst;
 
 	if (len != 1 && len != 2 && len != 4)
 		return;
 
 	dst = stream->ibuf + stream->cur_ip;
 	if (len == 1)
 		*(u_char *)dst = (u_char)value;
 	else if (len == 2)
 		*(u_short *)dst = (u_short)value;
 	else
 		*(u_int *)dst = value;
 	stream->cur_ip += len;
 }
 
 /*
  * Scan the filter program and report which features it uses, as a
  * bitwise OR of BPF_JIT_F* flags:
  *   BPF_JIT_FRET - the very first instruction is a return;
  *   BPF_JIT_FPKT - packet data is loaded (absolute, indirect or MSH);
  *   BPF_JIT_FMEM - scratch memory is loaded or stored;
  *   BPF_JIT_FJMP - at least one jump is present;
  *   BPF_JIT_FADK - a division or modulo by a constant is present.
  * The scan stops early once every flag of BPF_JIT_FLAG_ALL is set.
  */
 static int
 bpf_jit_optimize(struct bpf_insn *prog, u_int nins)
 {
 	const struct bpf_insn *insn;
 	int found;
 	u_int n;
 
 	/* A program that starts with a return needs nothing else. */
 	if (BPF_CLASS(prog[0].code) == BPF_RET)
 		return (BPF_JIT_FRET);
 
 	found = 0;
 	for (n = 0, insn = prog; n < nins && found != BPF_JIT_FLAG_ALL;
 	    n++, insn++) {
 		switch (insn->code) {
 		/* Loads that read from the packet. */
 		case BPF_LD|BPF_W|BPF_ABS:
 		case BPF_LD|BPF_H|BPF_ABS:
 		case BPF_LD|BPF_B|BPF_ABS:
 		case BPF_LD|BPF_W|BPF_IND:
 		case BPF_LD|BPF_H|BPF_IND:
 		case BPF_LD|BPF_B|BPF_IND:
 		case BPF_LDX|BPF_MSH|BPF_B:
 			found |= BPF_JIT_FPKT;
 			break;
 
 		/* Accesses to the scratch memory words. */
 		case BPF_LD|BPF_MEM:
 		case BPF_LDX|BPF_MEM:
 		case BPF_ST:
 		case BPF_STX:
 			found |= BPF_JIT_FMEM;
 			break;
 
 		/* Unconditional and conditional jumps. */
 		case BPF_JMP|BPF_JA:
 		case BPF_JMP|BPF_JGT|BPF_K:
 		case BPF_JMP|BPF_JGE|BPF_K:
 		case BPF_JMP|BPF_JEQ|BPF_K:
 		case BPF_JMP|BPF_JSET|BPF_K:
 		case BPF_JMP|BPF_JGT|BPF_X:
 		case BPF_JMP|BPF_JGE|BPF_X:
 		case BPF_JMP|BPF_JEQ|BPF_X:
 		case BPF_JMP|BPF_JSET|BPF_X:
 			found |= BPF_JIT_FJMP;
 			break;
 
 		/* Division or modulo by an immediate. */
 		case BPF_ALU|BPF_DIV|BPF_K:
 		case BPF_ALU|BPF_MOD|BPF_K:
 			found |= BPF_JIT_FADK;
 			break;
 		}
 	}
 
 	return (found);
 }
 
 /*
  * Translate a BPF filter program into native i386 code.
  *
  * prog/nins describe the filter program.  On return *size holds the
  * length in bytes of the generated code.  The return value is the entry
  * point of the generated function, or NULL when a required allocation
  * fails or (kernel only) an unknown opcode is met.
  *
  * The translation runs twice over the program.  The first pass uses
  * emit_length() to measure the code and fill the jump reference table;
  * the second pass uses emit_code() to write the instructions into the
  * freshly allocated buffer.  In the generated code the BPF accumulator
  * lives in EAX and the BPF index register in EDX.
  *
  * In the kernel the code buffer comes from malloc(9) with M_BPFJIT; in
  * userland it is mmap()ed writable and switched to read/execute once the
  * code has been emitted.
  */
 bpf_filter_func
 bpf_jit_compile(struct bpf_insn *prog, u_int nins, size_t *size)
 {
 	bpf_bin_stream stream;
 	struct bpf_insn *ins;
 	int flags, fret, fpkt, fmem, fjmp, fadk;
 	int save_esp;
 	u_int i, pass;
 
 	/*
 	 * NOTE: Do not modify the name of this variable, as it's used by
 	 * the macros to emit code.
 	 */
 	emit_func emitm;
 
 	flags = bpf_jit_optimize(prog, nins);
 	fret = (flags & BPF_JIT_FRET) != 0;
 	fpkt = (flags & BPF_JIT_FPKT) != 0;
 	fmem = (flags & BPF_JIT_FMEM) != 0;
 	fjmp = (flags & BPF_JIT_FJMP) != 0;
 	fadk = (flags & BPF_JIT_FADK) != 0;
 	save_esp = (fpkt || fmem || fadk);	/* Stack is used. */
 
 	/* A program that returns immediately needs only its first insn. */
 	if (fret)
 		nins = 1;
 
 	memset(&stream, 0, sizeof(stream));
 
 	/* Allocate the reference table for the jumps. */
 	if (fjmp) {
 #ifdef _KERNEL
 		stream.refs = malloc((nins + 1) * sizeof(u_int), M_BPFJIT,
 		    M_NOWAIT | M_ZERO);
 #else
 		stream.refs = calloc(nins + 1, sizeof(u_int));
 #endif
 		if (stream.refs == NULL)
 			return (NULL);
 	}
 
 	/*
 	 * The first pass will emit the lengths of the instructions
 	 * to create the reference table.
 	 */
 	emitm = emit_length;
 
 	for (pass = 0; pass < 2; pass++) {
 		ins = prog;
 
 		/* Create the procedure header. */
 		if (save_esp) {
 			PUSH(EBP);
 			MOVrd(ESP, EBP);
 		}
 		if (fmem)
 			SUBib(BPF_MEMWORDS * sizeof(uint32_t), ESP);
 		if (save_esp)
 			PUSH(ESI);
 		if (fpkt) {
 			PUSH(EDI);
 			PUSH(EBX);
 			MOVodd(8, EBP, EBX);
 			MOVodd(16, EBP, EDI);
 		}
 
 		for (i = 0; i < nins; i++) {
 			stream.bpf_pc++;
 
 			switch (ins->code) {
 			default:
 #ifdef _KERNEL
 				/*
 				 * Unknown opcode.  Release the reference
 				 * table and, on the second pass, the code
 				 * buffer before giving up; both pointers
 				 * are NULL when not allocated, which
 				 * free(9) accepts.
 				 */
 				free(stream.refs, M_BPFJIT);
 				free(stream.ibuf, M_BPFJIT);
 				return (NULL);
 #else
 				abort();
 #endif
 
 			case BPF_RET|BPF_K:
 				MOVid(ins->k, EAX);
 				if (save_esp) {
 					if (fpkt) {
 						POP(EBX);
 						POP(EDI);
 					}
 					POP(ESI);
 					LEAVE();
 				}
 				RET();
 				break;
 
 			case BPF_RET|BPF_A:
 				if (save_esp) {
 					if (fpkt) {
 						POP(EBX);
 						POP(EDI);
 					}
 					POP(ESI);
 					LEAVE();
 				}
 				RET();
 				break;
 
 			/*
 			 * Packet loads: the short jump displacements below
 			 * are byte counts of the instructions they skip, so
 			 * the sequences must not be altered independently.
 			 */
 			case BPF_LD|BPF_W|BPF_ABS:
 				MOVid(ins->k, ESI);
 				CMPrd(EDI, ESI);
 				JAb(12);
 				MOVrd(EDI, ECX);
 				SUBrd(ESI, ECX);
 				CMPid(sizeof(int32_t), ECX);
 				JAEb(7);
 				ZEROrd(EAX);
 				POP(EBX);
 				POP(EDI);
 				POP(ESI);
 				LEAVE();
 				RET();
 				MOVobd(EBX, ESI, EAX);
 				BSWAP(EAX);
 				break;
 
 			case BPF_LD|BPF_H|BPF_ABS:
 				ZEROrd(EAX);
 				MOVid(ins->k, ESI);
 				CMPrd(EDI, ESI);
 				JAb(12);
 				MOVrd(EDI, ECX);
 				SUBrd(ESI, ECX);
 				CMPid(sizeof(int16_t), ECX);
 				JAEb(5);
 				POP(EBX);
 				POP(EDI);
 				POP(ESI);
 				LEAVE();
 				RET();
 				MOVobw(EBX, ESI, AX);
 				SWAP_AX();
 				break;
 
 			case BPF_LD|BPF_B|BPF_ABS:
 				ZEROrd(EAX);
 				MOVid(ins->k, ESI);
 				CMPrd(EDI, ESI);
 				JBb(5);
 				POP(EBX);
 				POP(EDI);
 				POP(ESI);
 				LEAVE();
 				RET();
 				MOVobb(EBX, ESI, AL);
 				break;
 
 			case BPF_LD|BPF_W|BPF_LEN:
 				if (save_esp)
 					MOVodd(12, EBP, EAX);
 				else {
 					MOVrd(ESP, ECX);
 					MOVodd(12, ECX, EAX);
 				}
 				break;
 
 			case BPF_LDX|BPF_W|BPF_LEN:
 				if (save_esp)
 					MOVodd(12, EBP, EDX);
 				else {
 					MOVrd(ESP, ECX);
 					MOVodd(12, ECX, EDX);
 				}
 				break;
 
 			case BPF_LD|BPF_W|BPF_IND:
 				CMPrd(EDI, EDX);
 				JAb(27);
 				MOVid(ins->k, ESI);
 				MOVrd(EDI, ECX);
 				SUBrd(EDX, ECX);
 				CMPrd(ESI, ECX);
 				JBb(14);
 				ADDrd(EDX, ESI);
 				MOVrd(EDI, ECX);
 				SUBrd(ESI, ECX);
 				CMPid(sizeof(int32_t), ECX);
 				JAEb(7);
 				ZEROrd(EAX);
 				POP(EBX);
 				POP(EDI);
 				POP(ESI);
 				LEAVE();
 				RET();
 				MOVobd(EBX, ESI, EAX);
 				BSWAP(EAX);
 				break;
 
 			case BPF_LD|BPF_H|BPF_IND:
 				ZEROrd(EAX);
 				CMPrd(EDI, EDX);
 				JAb(27);
 				MOVid(ins->k, ESI);
 				MOVrd(EDI, ECX);
 				SUBrd(EDX, ECX);
 				CMPrd(ESI, ECX);
 				JBb(14);
 				ADDrd(EDX, ESI);
 				MOVrd(EDI, ECX);
 				SUBrd(ESI, ECX);
 				CMPid(sizeof(int16_t), ECX);
 				JAEb(5);
 				POP(EBX);
 				POP(EDI);
 				POP(ESI);
 				LEAVE();
 				RET();
 				MOVobw(EBX, ESI, AX);
 				SWAP_AX();
 				break;
 
 			case BPF_LD|BPF_B|BPF_IND:
 				ZEROrd(EAX);
 				CMPrd(EDI, EDX);
 				JAEb(13);
 				MOVid(ins->k, ESI);
 				MOVrd(EDI, ECX);
 				SUBrd(EDX, ECX);
 				CMPrd(ESI, ECX);
 				JAb(5);
 				POP(EBX);
 				POP(EDI);
 				POP(ESI);
 				LEAVE();
 				RET();
 				ADDrd(EDX, ESI);
 				MOVobb(EBX, ESI, AL);
 				break;
 
 			case BPF_LDX|BPF_MSH|BPF_B:
 				MOVid(ins->k, ESI);
 				CMPrd(EDI, ESI);
 				JBb(7);
 				ZEROrd(EAX);
 				POP(EBX);
 				POP(EDI);
 				POP(ESI);
 				LEAVE();
 				RET();
 				ZEROrd(EDX);
 				MOVobb(EBX, ESI, DL);
 				ANDib(0x0f, DL);
 				SHLib(2, EDX);
 				break;
 
 			case BPF_LD|BPF_IMM:
 				MOVid(ins->k, EAX);
 				break;
 
 			case BPF_LDX|BPF_IMM:
 				MOVid(ins->k, EDX);
 				break;
 
 			/*
 			 * Scratch memory lives just below the saved frame
 			 * pointer, hence the (k - BPF_MEMWORDS) offsets
 			 * relative to EBP.
 			 */
 			case BPF_LD|BPF_MEM:
 				MOVrd(EBP, ECX);
 				MOVid(((int)ins->k - BPF_MEMWORDS) *
 				    sizeof(uint32_t), ESI);
 				MOVobd(ECX, ESI, EAX);
 				break;
 
 			case BPF_LDX|BPF_MEM:
 				MOVrd(EBP, ECX);
 				MOVid(((int)ins->k - BPF_MEMWORDS) *
 				    sizeof(uint32_t), ESI);
 				MOVobd(ECX, ESI, EDX);
 				break;
 
 			case BPF_ST:
 				/*
 				 * XXX this command and the following could
 				 * be optimized if the previous instruction
 				 * was already of this type
 				 */
 				MOVrd(EBP, ECX);
 				MOVid(((int)ins->k - BPF_MEMWORDS) *
 				    sizeof(uint32_t), ESI);
 				MOVomd(EAX, ECX, ESI);
 				break;
 
 			case BPF_STX:
 				MOVrd(EBP, ECX);
 				MOVid(((int)ins->k - BPF_MEMWORDS) *
 				    sizeof(uint32_t), ESI);
 				MOVomd(EDX, ECX, ESI);
 				break;
 
 			case BPF_JMP|BPF_JA:
 				JUMP(ins->k);
 				break;
 
 			case BPF_JMP|BPF_JGT|BPF_K:
 			case BPF_JMP|BPF_JGE|BPF_K:
 			case BPF_JMP|BPF_JEQ|BPF_K:
 			case BPF_JMP|BPF_JSET|BPF_K:
 			case BPF_JMP|BPF_JGT|BPF_X:
 			case BPF_JMP|BPF_JGE|BPF_X:
 			case BPF_JMP|BPF_JEQ|BPF_X:
 			case BPF_JMP|BPF_JSET|BPF_X:
 				/* Both targets equal: no test is needed. */
 				if (ins->jt == ins->jf) {
 					JUMP(ins->jt);
 					break;
 				}
 				switch (ins->code) {
 				case BPF_JMP|BPF_JGT|BPF_K:
 					CMPid(ins->k, EAX);
 					JCC(JA, JBE);
 					break;
 
 				case BPF_JMP|BPF_JGE|BPF_K:
 					CMPid(ins->k, EAX);
 					JCC(JAE, JB);
 					break;
 
 				case BPF_JMP|BPF_JEQ|BPF_K:
 					CMPid(ins->k, EAX);
 					JCC(JE, JNE);
 					break;
 
 				case BPF_JMP|BPF_JSET|BPF_K:
 					TESTid(ins->k, EAX);
 					JCC(JNE, JE);
 					break;
 
 				case BPF_JMP|BPF_JGT|BPF_X:
 					CMPrd(EDX, EAX);
 					JCC(JA, JBE);
 					break;
 
 				case BPF_JMP|BPF_JGE|BPF_X:
 					CMPrd(EDX, EAX);
 					JCC(JAE, JB);
 					break;
 
 				case BPF_JMP|BPF_JEQ|BPF_X:
 					CMPrd(EDX, EAX);
 					JCC(JE, JNE);
 					break;
 
 				case BPF_JMP|BPF_JSET|BPF_X:
 					TESTrd(EDX, EAX);
 					JCC(JNE, JE);
 					break;
 				}
 				break;
 
 			case BPF_ALU|BPF_ADD|BPF_X:
 				ADDrd(EDX, EAX);
 				break;
 
 			case BPF_ALU|BPF_SUB|BPF_X:
 				SUBrd(EDX, EAX);
 				break;
 
 			case BPF_ALU|BPF_MUL|BPF_X:
 				MOVrd(EDX, ECX);
 				MULrd(EDX);
 				MOVrd(ECX, EDX);
 				break;
 
 			case BPF_ALU|BPF_DIV|BPF_X:
 			case BPF_ALU|BPF_MOD|BPF_X:
 				/* Division by zero makes the filter return 0. */
 				TESTrd(EDX, EDX);
 				if (save_esp) {
 					if (fpkt) {
 						JNEb(7);
 						ZEROrd(EAX);
 						POP(EBX);
 						POP(EDI);
 					} else {
 						JNEb(5);
 						ZEROrd(EAX);
 					}
 					POP(ESI);
 					LEAVE();
 				} else {
 					JNEb(3);
 					ZEROrd(EAX);
 				}
 				RET();
 				MOVrd(EDX, ECX);
 				ZEROrd(EDX);
 				DIVrd(ECX);
 				if (BPF_OP(ins->code) == BPF_MOD)
 					MOVrd(EDX, EAX);
 				MOVrd(ECX, EDX);
 				break;
 
 			case BPF_ALU|BPF_AND|BPF_X:
 				ANDrd(EDX, EAX);
 				break;
 
 			case BPF_ALU|BPF_OR|BPF_X:
 				ORrd(EDX, EAX);
 				break;
 
 			case BPF_ALU|BPF_XOR|BPF_X:
 				XORrd(EDX, EAX);
 				break;
 
 			case BPF_ALU|BPF_LSH|BPF_X:
 				MOVrd(EDX, ECX);
 				SHL_CLrb(EAX);
 				break;
 
 			case BPF_ALU|BPF_RSH|BPF_X:
 				MOVrd(EDX, ECX);
 				SHR_CLrb(EAX);
 				break;
 
 			case BPF_ALU|BPF_ADD|BPF_K:
 				ADD_EAXi(ins->k);
 				break;
 
 			case BPF_ALU|BPF_SUB|BPF_K:
 				SUB_EAXi(ins->k);
 				break;
 
 			case BPF_ALU|BPF_MUL|BPF_K:
 				MOVrd(EDX, ECX);
 				MOVid(ins->k, EDX);
 				MULrd(EDX);
 				MOVrd(ECX, EDX);
 				break;
 
 			case BPF_ALU|BPF_DIV|BPF_K:
 			case BPF_ALU|BPF_MOD|BPF_K:
 				MOVrd(EDX, ECX);
 				ZEROrd(EDX);
 				MOVid(ins->k, ESI);
 				DIVrd(ESI);
 				if (BPF_OP(ins->code) == BPF_MOD)
 					MOVrd(EDX, EAX);
 				MOVrd(ECX, EDX);
 				break;
 
 			case BPF_ALU|BPF_AND|BPF_K:
 				ANDid(ins->k, EAX);
 				break;
 
 			case BPF_ALU|BPF_OR|BPF_K:
 				ORid(ins->k, EAX);
 				break;
 
 			case BPF_ALU|BPF_XOR|BPF_K:
 				XORid(ins->k, EAX);
 				break;
 
 			case BPF_ALU|BPF_LSH|BPF_K:
 				SHLib((ins->k) & 0xff, EAX);
 				break;
 
 			case BPF_ALU|BPF_RSH|BPF_K:
 				SHRib((ins->k) & 0xff, EAX);
 				break;
 
 			case BPF_ALU|BPF_NEG:
 				NEGd(EAX);
 				break;
 
 			case BPF_MISC|BPF_TAX:
 				MOVrd(EAX, EDX);
 				break;
 
 			case BPF_MISC|BPF_TXA:
 				MOVrd(EDX, EAX);
 				break;
 			}
 			ins++;
 		}
 
 		/* The second pass is done once the code has been written. */
 		if (pass > 0)
 			continue;
 
 		/* End of the first pass: the code size is now known. */
 		*size = stream.cur_ip;
 #ifdef _KERNEL
-		stream.ibuf = malloc(*size, M_BPFJIT, M_NOWAIT);
+		stream.ibuf = malloc(*size, M_BPFJIT, M_EXEC | M_NOWAIT);
 		if (stream.ibuf == NULL)
 			break;
 #else
 		stream.ibuf = mmap(NULL, *size, PROT_READ | PROT_WRITE,
 		    MAP_ANON, -1, 0);
 		if (stream.ibuf == MAP_FAILED) {
 			stream.ibuf = NULL;
 			break;
 		}
 #endif
 
 		/*
 		 * Modify the reference table to contain the offsets and
 		 * not the lengths of the instructions.
 		 */
 		if (fjmp)
 			for (i = 1; i < nins + 1; i++)
 				stream.refs[i] += stream.refs[i - 1];
 
 		/* Reset the counters. */
 		stream.cur_ip = 0;
 		stream.bpf_pc = 0;
 
 		/* The second pass creates the actual code. */
 		emitm = emit_code;
 	}
 
 	/*
 	 * The reference table is needed only during compilation,
 	 * now we can free it.
 	 */
 	if (fjmp)
 #ifdef _KERNEL
 		free(stream.refs, M_BPFJIT);
 #else
 		free(stream.refs);
 #endif
 
 #ifndef _KERNEL
 	/* Drop write access and make the generated code executable. */
 	if (stream.ibuf != NULL &&
 	    mprotect(stream.ibuf, *size, PROT_READ | PROT_EXEC) != 0) {
 		munmap(stream.ibuf, *size);
 		stream.ibuf = NULL;
 	}
 #endif
 
 	return ((bpf_filter_func)(void *)stream.ibuf);
-}
-
-void
-bpf_jit_free(void *func, size_t size)
-{
-
-#ifdef _KERNEL
-	free(func, M_BPFJIT);
-#else
-	munmap(func, size);
-#endif
 }
Index: head/sys/kern/kern_malloc.c
===================================================================
--- head/sys/kern/kern_malloc.c	(revision 335067)
+++ head/sys/kern/kern_malloc.c	(revision 335068)
@@ -1,1278 +1,1278 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1987, 1991, 1993
  *	The Regents of the University of California.
  * Copyright (c) 2005-2009 Robert N. M. Watson
  * Copyright (c) 2008 Otto Moerbeek <otto@drijf.net> (mallocarray)
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_malloc.c	8.3 (Berkeley) 1/4/94
  */
 
 /*
  * Kernel malloc(9) implementation -- general purpose kernel memory allocator
  * based on memory types.  Back end is implemented using the UMA(9) zone
  * allocator.  A set of fixed-size buckets are used for smaller allocations,
  * and a special UMA allocation interface is used for larger allocations.
  * Callers declare memory types, and statistics are maintained independently
  * for each memory type.  Statistics are maintained per-CPU for performance
  * reasons.  See malloc(9) and comments in malloc.h for a detailed
  * description.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/vmmeter.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/time.h>
 #include <sys/vmem.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/uma.h>
 #include <vm/uma_int.h>
 #include <vm/uma_dbg.h>
 
 #ifdef DEBUG_MEMGUARD
 #include <vm/memguard.h>
 #endif
 #ifdef DEBUG_REDZONE
 #include <vm/redzone.h>
 #endif
 
 #if defined(INVARIANTS) && defined(__i386__)
 #include <machine/cpu.h>
 #endif
 
 #include <ddb/ddb.h>
 
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
 
 bool	__read_frequently			dtrace_malloc_enabled;
 dtrace_malloc_probe_func_t __read_mostly	dtrace_malloc_probe;
 #endif
 
 #if defined(INVARIANTS) || defined(MALLOC_MAKE_FAILURES) ||		\
     defined(DEBUG_MEMGUARD) || defined(DEBUG_REDZONE)
 #define	MALLOC_DEBUG	1
 #endif
 
 /*
  * When realloc() is called, if the new size is sufficiently smaller than
  * the old size, realloc() will allocate a new, smaller block to avoid
  * wasting memory. 'Sufficiently smaller' is defined as: newsize <=
  * oldsize / 2^n, where REALLOC_FRACTION defines the value of 'n'.
  */
 #ifndef REALLOC_FRACTION
 #define	REALLOC_FRACTION	1	/* new block if <= half the size */
 #endif
 
 /*
  * Centrally define some common malloc types.
  */
 MALLOC_DEFINE(M_CACHE, "cache", "Various Dynamically allocated caches");
 MALLOC_DEFINE(M_DEVBUF, "devbuf", "device driver memory");
 MALLOC_DEFINE(M_TEMP, "temp", "misc temporary data buffers");
 
 static struct malloc_type *kmemstatistics;
 static int kmemcount;
 
 #define KMEM_ZSHIFT	4
 #define KMEM_ZBASE	16
 #define KMEM_ZMASK	(KMEM_ZBASE - 1)
 
 #define KMEM_ZMAX	65536
 #define KMEM_ZSIZE	(KMEM_ZMAX >> KMEM_ZSHIFT)
 static uint8_t kmemsize[KMEM_ZSIZE + 1];
 
 #ifndef MALLOC_DEBUG_MAXZONES
 #define	MALLOC_DEBUG_MAXZONES	1
 #endif
 static int numzones = MALLOC_DEBUG_MAXZONES;
 
 /*
  * Small malloc(9) memory allocations are allocated from a set of UMA buckets
  * of various sizes.
  *
  * XXX: The comment here used to read "These won't be powers of two for
  * long."  It's possible that a significant amount of wasted memory could be
  * recovered by tuning the sizes of these buckets.
  */
 /*
  * Table of the malloc(9) size buckets, one entry per bucket size in
  * increasing order.  The table ends with a {0, NULL} sentinel entry.
  */
 struct {
 	int kz_size;		/* bucket size (presumably in bytes) */
 	char *kz_name;		/* bucket size spelled as a string */
 	/* One UMA zone per subzone; left zero-initialized here. */
 	uma_zone_t kz_zone[MALLOC_DEBUG_MAXZONES];
 } kmemzones[] = {
 	{16, "16", },
 	{32, "32", },
 	{64, "64", },
 	{128, "128", },
 	{256, "256", },
 	{512, "512", },
 	{1024, "1024", },
 	{2048, "2048", },
 	{4096, "4096", },
 	{8192, "8192", },
 	{16384, "16384", },
 	{32768, "32768", },
 	{65536, "65536", },
 	{0, NULL},	/* end-of-table sentinel */
 };
 
 /*
  * Zone to allocate malloc type descriptions from.  For ABI reasons, memory
  * types are described by a data structure passed by the declaring code, but
  * the malloc(9) implementation has its own data structure describing the
  * type and statistics.  This permits the malloc(9)-internal data structures
  * to be modified without breaking binary-compiled kernel modules that
  * declare malloc types.
  */
 static uma_zone_t mt_zone;
 
 u_long vm_kmem_size;
 SYSCTL_ULONG(_vm, OID_AUTO, kmem_size, CTLFLAG_RDTUN, &vm_kmem_size, 0,
     "Size of kernel memory");
 
 static u_long kmem_zmax = KMEM_ZMAX;
 SYSCTL_ULONG(_vm, OID_AUTO, kmem_zmax, CTLFLAG_RDTUN, &kmem_zmax, 0,
     "Maximum allocation size that malloc(9) would use UMA as backend");
 
 static u_long vm_kmem_size_min;
 SYSCTL_ULONG(_vm, OID_AUTO, kmem_size_min, CTLFLAG_RDTUN, &vm_kmem_size_min, 0,
     "Minimum size of kernel memory");
 
 static u_long vm_kmem_size_max;
 SYSCTL_ULONG(_vm, OID_AUTO, kmem_size_max, CTLFLAG_RDTUN, &vm_kmem_size_max, 0,
     "Maximum size of kernel memory");
 
 static u_int vm_kmem_size_scale;
 SYSCTL_UINT(_vm, OID_AUTO, kmem_size_scale, CTLFLAG_RDTUN, &vm_kmem_size_scale, 0,
     "Scale factor for kernel memory size");
 
 static int sysctl_kmem_map_size(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_vm, OID_AUTO, kmem_map_size,
     CTLFLAG_RD | CTLTYPE_ULONG | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_kmem_map_size, "LU", "Current kmem allocation size");
 
 static int sysctl_kmem_map_free(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_vm, OID_AUTO, kmem_map_free,
     CTLFLAG_RD | CTLTYPE_ULONG | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_kmem_map_free, "LU", "Free space in kmem");
 
 /*
  * The malloc_mtx protects the kmemstatistics linked list.
  */
 struct mtx malloc_mtx;
 
 #ifdef MALLOC_PROFILE
 uint64_t krequests[KMEM_ZSIZE + 1];
 
 static int sysctl_kern_mprof(SYSCTL_HANDLER_ARGS);
 #endif
 
 static int sysctl_kern_malloc_stats(SYSCTL_HANDLER_ARGS);
 
 /*
  * time_uptime of the last malloc(9) failure (induced or real).
  */
 static time_t t_malloc_fail;
 
 #if defined(MALLOC_MAKE_FAILURES) || (MALLOC_DEBUG_MAXZONES > 1)
 static SYSCTL_NODE(_debug, OID_AUTO, malloc, CTLFLAG_RD, 0,
     "Kernel malloc debugging options");
 #endif
 
 /*
  * malloc(9) fault injection -- cause malloc failures every (n) mallocs when
  * the caller specifies M_NOWAIT.  If set to 0, no failures are caused.
  */
 #ifdef MALLOC_MAKE_FAILURES
 static int malloc_failure_rate;
 static int malloc_nowait_count;
 static int malloc_failure_count;
 SYSCTL_INT(_debug_malloc, OID_AUTO, failure_rate, CTLFLAG_RWTUN,
     &malloc_failure_rate, 0, "Every (n) mallocs with M_NOWAIT will fail");
 SYSCTL_INT(_debug_malloc, OID_AUTO, failure_count, CTLFLAG_RD,
     &malloc_failure_count, 0, "Number of imposed M_NOWAIT malloc failures");
 #endif
 
 /*
  * Sysctl handler for vm.kmem_map_size: export the value currently
  * reported by uma_size().
  */
 static int
 sysctl_kmem_map_size(SYSCTL_HANDLER_ARGS)
 {
 	u_long in_use = uma_size();
 
 	return (sysctl_handle_long(oidp, &in_use, 0, req));
 }
 
 /*
  * Sysctl handler for vm.kmem_map_free: export uma_limit() minus
  * uma_size().  The exported value is unsigned, so it saturates at zero
  * when the size exceeds the limit.
  */
 static int
 sysctl_kmem_map_free(SYSCTL_HANDLER_ARGS)
 {
 	u_long used, cap, avail;
 
 	used = uma_size();
 	cap = uma_limit();
 	avail = (used > cap) ? 0 : cap - used;
 	return (sysctl_handle_long(oidp, &avail, 0, req));
 }
 
 /*
  * malloc(9) uma zone separation -- sub-page buffer overruns in one
  * malloc type will affect only a subset of other malloc types.
  */
 #if MALLOC_DEBUG_MAXZONES > 1
 /*
  * Fetch the debug.malloc.numzones tunable into numzones and clamp the
  * result to the range [1, MALLOC_DEBUG_MAXZONES].
  */
 static void
 tunable_set_numzones(void)
 {
 
 	TUNABLE_INT_FETCH("debug.malloc.numzones", &numzones);
 
 	/* Keep the number of malloc uma zones within sane bounds. */
 	if (numzones <= 0)
 		numzones = 1;
 	else if (numzones > MALLOC_DEBUG_MAXZONES)
 		numzones = MALLOC_DEBUG_MAXZONES;
 }
 SYSINIT(numzones, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_set_numzones, NULL);
 SYSCTL_INT(_debug_malloc, OID_AUTO, numzones, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
     &numzones, 0, "Number of malloc uma subzones");
 
 /*
  * Any number that changes regularly is an okay choice for the
  * offset.  Build numbers are pretty good if you have them.
  */
 static u_int zone_offset = __FreeBSD_version;
 TUNABLE_INT("debug.malloc.zone_offset", &zone_offset);
 SYSCTL_UINT(_debug_malloc, OID_AUTO, zone_offset, CTLFLAG_RDTUN,
     &zone_offset, 0, "Separate malloc types by examining the "
     "Nth character in the malloc type short description.");
 
 /*
  * Choose the subzone of a malloc type from one character of its short
  * description: the character at index (zone_offset modulo the length),
  * reduced modulo numzones.  A missing or empty description selects
  * subzone 0.
  */
 static void
 mtp_set_subzone(struct malloc_type *mtp)
 {
 	struct malloc_type_internal *internal;
 	const char *name;
 	size_t namelen;
 	u_int key;
 
 	key = 0;
 	name = mtp->ks_shortdesc;
 	if (name != NULL) {
 		namelen = strlen(name);
 		if (namelen != 0)
 			key = name[zone_offset % namelen];
 	}
 
 	internal = mtp->ks_handle;
 	internal->mti_zone = (key % numzones);
 }
 
 /*
  * Return the subzone recorded for a malloc type, asserting that it lies
  * below numzones.
  */
 static inline u_int
 mtp_get_subzone(struct malloc_type *mtp)
 {
 	struct malloc_type_internal *internal = mtp->ks_handle;
 
 	KASSERT(internal->mti_zone < numzones,
 	    ("mti_zone %u out of range %d",
 	    internal->mti_zone, numzones));
 
 	return (internal->mti_zone);
 }
 #elif MALLOC_DEBUG_MAXZONES == 0
 #error "MALLOC_DEBUG_MAXZONES must be positive."
 #else
 /*
  * Single-zone configuration: every malloc type uses subzone 0.
  */
 static void
 mtp_set_subzone(struct malloc_type *mtp)
 {
 	struct malloc_type_internal *internal = mtp->ks_handle;
 
 	internal->mti_zone = 0;
 }
 
 /*
  * Single-zone configuration: the subzone index is always 0, whatever
  * the malloc type.
  */
 static inline u_int
 mtp_get_subzone(struct malloc_type *mtp)
 {
 
 	return (0);
 }
 #endif /* MALLOC_DEBUG_MAXZONES > 1 */
 
 /*
  * Return the difference between the current time_uptime and the
  * time_uptime recorded at the last malloc(9) failure.
  */
 int
 malloc_last_fail(void)
 {
 	time_t elapsed;
 
 	elapsed = time_uptime - t_malloc_fail;
 	return (elapsed);
 }
 
 /*
  * Account for a successful allocation in the statistics of a malloc
  * type.  A non-zero size is added to the allocated byte count and bumps
  * the allocation counter; a zone index other than -1 is recorded in the
  * mts_size bitmask.  The update runs inside a critical section so that
  * the thread is neither preempted nor migrated while it touches the
  * per-CPU statistics.
  */
 static void
 malloc_type_zone_allocated(struct malloc_type *mtp, unsigned long size,
     int zindx)
 {
 	struct malloc_type_internal *internal;
 	struct malloc_type_stats *stats;
 #ifdef KDTRACE_HOOKS
 	uint32_t probe;
 #endif
 
 	critical_enter();
 	internal = mtp->ks_handle;
 	stats = &internal->mti_stats[curcpu];
 
 	if (zindx != -1)
 		stats->mts_size |= 1 << zindx;
 	if (size != 0) {
 		stats->mts_numallocs++;
 		stats->mts_memalloced += size;
 	}
 
 #ifdef KDTRACE_HOOKS
 	/* Fire the per-type malloc probe when one is registered. */
 	if (__predict_false(dtrace_malloc_enabled)) {
 		probe = internal->mti_probes[DTMALLOC_PROBE_MALLOC];
 		if (probe != 0)
 			(dtrace_malloc_probe)(probe,
 			    (uintptr_t) mtp, (uintptr_t) internal,
 			    (uintptr_t) stats, size, zindx);
 	}
 #endif
 
 	critical_exit();
 }
 
 /*
  * Record an allocation of size bytes against a malloc type without
  * associating it with a zone.  A zero size is ignored.
  */
 void
 malloc_type_allocated(struct malloc_type *mtp, unsigned long size)
 {
 
 	if (size == 0)
 		return;
 	malloc_type_zone_allocated(mtp, size, -1);
 }
 
 /*
  * Account for a free operation in the statistics of a malloc type: the
  * freed byte count grows by size and the free counter is incremented.
  * The update runs inside a critical section so that the thread is
  * neither preempted nor migrated while it touches the per-CPU
  * statistics.
  */
 void
 malloc_type_freed(struct malloc_type *mtp, unsigned long size)
 {
 	struct malloc_type_internal *internal;
 	struct malloc_type_stats *stats;
 #ifdef KDTRACE_HOOKS
 	uint32_t probe;
 #endif
 
 	critical_enter();
 	internal = mtp->ks_handle;
 	stats = &internal->mti_stats[curcpu];
 	stats->mts_numfrees++;
 	stats->mts_memfreed += size;
 
 #ifdef KDTRACE_HOOKS
 	/* Fire the per-type free probe when one is registered. */
 	if (__predict_false(dtrace_malloc_enabled)) {
 		probe = internal->mti_probes[DTMALLOC_PROBE_FREE];
 		if (probe != 0)
 			(dtrace_malloc_probe)(probe,
 			    (uintptr_t) mtp, (uintptr_t) internal,
 			    (uintptr_t) stats, size, 0);
 	}
 #endif
 
 	critical_exit();
 }
 
 /*
  *	contigmalloc:
  *
  *	Allocate a block of physically contiguous memory from the kernel
  *	arena and charge the page-rounded size to the given malloc type.
  *
  *	If M_NOWAIT is set, this routine will not block and returns NULL
  *	if the allocation fails.
  */
 void *
 contigmalloc(unsigned long size, struct malloc_type *type, int flags,
     vm_paddr_t low, vm_paddr_t high, unsigned long alignment,
     vm_paddr_t boundary)
 {
 	void *mem;
 
 	mem = (void *)kmem_alloc_contig(kernel_arena, size, flags, low, high,
 	    alignment, boundary, VM_MEMATTR_DEFAULT);
 	if (mem == NULL)
 		return (NULL);
 
 	malloc_type_allocated(type, round_page(size));
 	return (mem);
 }
 
 /*
  * Same as contigmalloc(), but the memory is requested from the given
  * 'domain' through kmem_alloc_contig_domain().
  */
 void *
 contigmalloc_domain(unsigned long size, struct malloc_type *type,
     int domain, int flags, vm_paddr_t low, vm_paddr_t high,
     unsigned long alignment, vm_paddr_t boundary)
 {
 	void *mem;
 
 	mem = (void *)kmem_alloc_contig_domain(domain, size, flags, low, high,
 	    alignment, boundary, VM_MEMATTR_DEFAULT);
 	if (mem == NULL)
 		return (NULL);
 
 	/* Only successful allocations are accounted. */
 	malloc_type_allocated(type, round_page(size));
 	return (mem);
 }
 
 /*
  *	contigfree:
  *
  *	Release memory obtained from contigmalloc() back to the kernel
  *	arena and credit the page-rounded size to the malloc type.
  *
  *	This routine may not block.
  */
 void
 contigfree(void *addr, unsigned long size, struct malloc_type *type)
 {
 	unsigned long accounted;
 
 	/* Statistics use the same page rounding as the allocation side. */
 	accounted = round_page(size);
 	kmem_free(kernel_arena, (vm_offset_t)addr, size);
 	malloc_type_freed(type, accounted);
 }
 
 #ifdef MALLOC_DEBUG
 /*
  * Debugging front end run by the malloc entry points before the real
  * allocation.  Depending on the kernel options compiled in, it validates
  * the malloc type and flags, injects artificial M_NOWAIT failures, lets
  * MemGuard service the request, and grows the size for RedZone.
  *
  * Returns EJUSTRETURN when the caller must return *vap as-is (NULL for an
  * injected failure, or a MemGuard allocation).  Returns 0 when the caller
  * should proceed with the normal allocation using the possibly adjusted
  * *sizep.
  */
 static int
 malloc_dbg(caddr_t *vap, size_t *sizep, struct malloc_type *mtp,
     int flags)
 {
 #ifdef INVARIANTS
 	int indx;
 
 	KASSERT(mtp->ks_magic == M_MAGIC, ("malloc: bad malloc type magic"));
 	/*
 	 * Check that exactly one of M_WAITOK or M_NOWAIT is specified.
 	 */
 	indx = flags & (M_WAITOK | M_NOWAIT);
 	if (indx != M_NOWAIT && indx != M_WAITOK) {
 		static	struct timeval lasterr;
 		static	int curerr, once;
 		/* Complain (with a backtrace) only once. */
 		if (once == 0 && ppsratecheck(&lasterr, &curerr, 1)) {
 			printf("Bad malloc flags: %x\n", indx);
 			kdb_backtrace();
 			/* Affects only the checks below; flags is a local copy. */
 			flags |= M_WAITOK;
 			once++;
 		}
 	}
 #endif
 #ifdef MALLOC_MAKE_FAILURES
 	/* Fail every malloc_failure_rate-th M_NOWAIT request on purpose. */
 	if ((flags & M_NOWAIT) && (malloc_failure_rate != 0)) {
 		atomic_add_int(&malloc_nowait_count, 1);
 		if ((malloc_nowait_count % malloc_failure_rate) == 0) {
 			atomic_add_int(&malloc_failure_count, 1);
 			t_malloc_fail = time_uptime;
 			*vap = NULL;
 			return (EJUSTRETURN);
 		}
 	}
 #endif
 	if (flags & M_WAITOK) {
 		KASSERT(curthread->td_intr_nesting_level == 0,
 		   ("malloc(M_WAITOK) in interrupt context"));
 		KASSERT(curthread->td_epochnest == 0,
 			("malloc(M_WAITOK) in epoch context"));		
 	}
 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
 	    ("malloc: called with spinlock or critical section held"));
 
 #ifdef DEBUG_MEMGUARD
 	/* Let MemGuard take the request if this type/size is being guarded. */
 	if (memguard_cmp_mtp(mtp, *sizep)) {
 		*vap = memguard_alloc(*sizep, flags);
 		if (*vap != NULL)
 			return (EJUSTRETURN);
 		/* This is unfortunate but should not be fatal. */
 	}
 #endif
 
 #ifdef DEBUG_REDZONE
 	/* Replace the requested size with the one RedZone computes for it. */
 	*sizep = redzone_size_ntor(*sizep);
 #endif
 
 	return (0);
 }
 #endif
 
 /*
  *	malloc:
  *
  *	Allocate a block of memory.
  *
  *	If M_NOWAIT is set, this routine will not block and return NULL if
  *	the allocation fails.
  *
  *	'size' is the number of bytes requested, 'mtp' the malloc type the
  *	allocation is accounted to, and 'flags' the M_* allocation flags.
  */
 void *
 (malloc)(size_t size, struct malloc_type *mtp, int flags)
 {
 	int indx;
 	caddr_t va;
 	uma_zone_t zone;
 #if defined(DEBUG_REDZONE)
 	/* Caller's original size, needed by redzone_setup() below. */
 	unsigned long osize = size;
 #endif
 
 #ifdef MALLOC_DEBUG
 	va = NULL;
 	/* A non-zero return means the debug layer already decided the result. */
 	if (malloc_dbg(&va, &size, mtp, flags) != 0)
 		return (va);
 #endif
 
-	if (size <= kmem_zmax) {
+	if (size <= kmem_zmax && (flags & M_EXEC) == 0) {
 		/* Round up to a multiple of KMEM_ZBASE and look up the zone. */
 		if (size & KMEM_ZMASK)
 			size = (size & ~KMEM_ZMASK) + KMEM_ZBASE;
 		indx = kmemsize[size >> KMEM_ZSHIFT];
 		zone = kmemzones[indx].kz_zone[mtp_get_subzone(mtp)];
 #ifdef MALLOC_PROFILE
 		krequests[size >> KMEM_ZSHIFT]++;
 #endif
 		va = uma_zalloc(zone, flags);
 		/* Account for the zone's item size, not the requested size. */
 		if (va != NULL)
 			size = zone->uz_size;
 		malloc_type_zone_allocated(mtp, va == NULL ? 0 : size, indx);
 	} else {
 		/* Otherwise allocate whole pages through uma_large_malloc(). */
 		size = roundup(size, PAGE_SIZE);
 		zone = NULL;
 		va = uma_large_malloc(size, flags);
 		malloc_type_allocated(mtp, va == NULL ? 0 : size);
 	}
 	if (flags & M_WAITOK)
 		KASSERT(va != NULL, ("malloc(M_WAITOK) returned NULL"));
 	else if (va == NULL)
 		/* Remember when the most recent allocation failure happened. */
 		t_malloc_fail = time_uptime;
 #ifdef DEBUG_REDZONE
 	if (va != NULL)
 		va = redzone_setup(va, osize);
 #endif
 	return ((void *) va);
 }
 
 /*
  * Variant of malloc() that passes 'domain' to the domain-aware UMA
  * allocation routines.  The structure otherwise mirrors malloc(): the same
  * debug hooks, size rounding, statistics and failure timestamp handling.
  */
 void *
 malloc_domain(size_t size, struct malloc_type *mtp, int domain,
     int flags)
 {
 	int indx;
 	caddr_t va;
 	uma_zone_t zone;
 #if defined(DEBUG_REDZONE)
 	/* Caller's original size, needed by redzone_setup() below. */
 	unsigned long osize = size;
 #endif
 
 #ifdef MALLOC_DEBUG
 	va = NULL;
 	/* A non-zero return means the debug layer already decided the result. */
 	if (malloc_dbg(&va, &size, mtp, flags) != 0)
 		return (va);
 #endif
-	if (size <= kmem_zmax) {
+	if (size <= kmem_zmax && (flags & M_EXEC) == 0) {
 		/* Round up to a multiple of KMEM_ZBASE and look up the zone. */
 		if (size & KMEM_ZMASK)
 			size = (size & ~KMEM_ZMASK) + KMEM_ZBASE;
 		indx = kmemsize[size >> KMEM_ZSHIFT];
 		zone = kmemzones[indx].kz_zone[mtp_get_subzone(mtp)];
 #ifdef MALLOC_PROFILE
 		krequests[size >> KMEM_ZSHIFT]++;
 #endif
 		va = uma_zalloc_domain(zone, NULL, domain, flags);
 		/* Account for the zone's item size, not the requested size. */
 		if (va != NULL)
 			size = zone->uz_size;
 		malloc_type_zone_allocated(mtp, va == NULL ? 0 : size, indx);
 	} else {
 		/* Otherwise allocate whole pages from the requested domain. */
 		size = roundup(size, PAGE_SIZE);
 		zone = NULL;
 		va = uma_large_malloc_domain(size, domain, flags);
 		malloc_type_allocated(mtp, va == NULL ? 0 : size);
 	}
 	if (flags & M_WAITOK)
 		KASSERT(va != NULL, ("malloc(M_WAITOK) returned NULL"));
 	else if (va == NULL)
 		/* Remember when the most recent allocation failure happened. */
 		t_malloc_fail = time_uptime;
 #ifdef DEBUG_REDZONE
 	if (va != NULL)
 		va = redzone_setup(va, osize);
 #endif
 	return ((void *) va);
 }
 
 /*
  * Allocate room for an array of 'nmemb' elements of 'size' bytes each.
  * Panics instead of allocating if the total byte count would overflow.
  */
 void *
 mallocarray(size_t nmemb, size_t size, struct malloc_type *type, int flags)
 {
 	size_t total;
 
 	if (WOULD_OVERFLOW(nmemb, size))
 		panic("mallocarray: %zu * %zu overflowed", nmemb, size);
 
 	total = size * nmemb;
 	return (malloc(total, type, flags));
 }
 
 #ifdef INVARIANTS
 /*
  * Stamp the block being freed with the malloc type that freed it.
  *
  * 'addr' is the start of the block, 'mtp' the type to record and 'size'
  * the block size in bytes.  The type pointer is written into the last
  * pointer-sized slot of the block.
  */
 static void
 free_save_type(void *addr, struct malloc_type *mtp, u_long size)
 {
 	struct malloc_type **mtpp = addr;
 
 	/*
 	 * Cache a pointer to the malloc_type that most recently freed
 	 * this memory here.  This way we know who is most likely to
 	 * have stepped on it later.
 	 *
 	 * This code assumes that size is a multiple of 8 bytes for
 	 * 64 bit machines
 	 */
 	/* Align the base down using the UMA_ALIGN_PTR mask. */
 	mtpp = (struct malloc_type **) ((unsigned long)mtpp & ~UMA_ALIGN_PTR);
 	/* Advance, in pointer-sized units, to the last slot in the block. */
 	mtpp += (size - sizeof(struct malloc_type *)) /
 	    sizeof(struct malloc_type *);
 	*mtpp = mtp;
 }
 #endif
 
 #ifdef MALLOC_DEBUG
 /*
  * Debugging front end run by the free entry points.
  *
  * Returns EJUSTRETURN when the caller has nothing left to do (NULL
  * pointer, or the block belonged to MemGuard and was released here).
  * Returns 0 when the caller should free *addrp normally; with
  * DEBUG_REDZONE, *addrp is replaced by the address that
  * redzone_addr_ntor() returns for it.
  */
 static int
 free_dbg(void **addrp, struct malloc_type *mtp)
 {
 	void *addr;
 
 	addr = *addrp;
 	KASSERT(mtp->ks_magic == M_MAGIC, ("free: bad malloc type magic"));
 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
 	    ("free: called with spinlock or critical section held"));
 
 	/* free(NULL, ...) does nothing */
 	if (addr == NULL)
 		return (EJUSTRETURN);
 
 #ifdef DEBUG_MEMGUARD
 	/* MemGuard-owned blocks are released by MemGuard itself. */
 	if (is_memguard_addr(addr)) {
 		memguard_free(addr);
 		return (EJUSTRETURN);
 	}
 #endif
 
 #ifdef DEBUG_REDZONE
 	/* Run the RedZone check, then hand back the translated address. */
 	redzone_check(addr);
 	*addrp = redzone_addr_ntor(addr);
 #endif
 
 	return (0);
 }
 #endif
 
 /*
  *	free:
  *
  *	Free a block of memory allocated by malloc.
  *
  *	'addr' may be NULL, in which case nothing happens.  'mtp' is the
  *	malloc type whose statistics are credited with the freed size.
  *
  *	This routine may not block.
  */
 void
 free(void *addr, struct malloc_type *mtp)
 {
 	uma_slab_t slab;
 	u_long size;
 
 #ifdef MALLOC_DEBUG
 	/* The debug layer may fully handle the free or adjust addr. */
 	if (free_dbg(&addr, mtp) != 0)
 		return;
 #endif
 	/* free(NULL, ...) does nothing */
 	if (addr == NULL)
 		return;
 
 	/* Look up the slab covering this address. */
 	slab = vtoslab((vm_offset_t)addr & (~UMA_SLAB_MASK));
 	if (slab == NULL)
 		panic("free: address %p(%p) has not been allocated.\n",
 		    addr, (void *)((u_long)addr & (~UMA_SLAB_MASK)));
 
 	if (!(slab->us_flags & UMA_SLAB_MALLOC)) {
 		/* Zone-backed item: its size is the keg's item size. */
 		size = slab->us_keg->uk_size;
 #ifdef INVARIANTS
 		free_save_type(addr, mtp, size);
 #endif
 		uma_zfree_arg(LIST_FIRST(&slab->us_keg->uk_zones), addr, slab);
 	} else {
 		/* Large allocation: the slab itself records the size. */
 		size = slab->us_size;
 		uma_large_free(slab);
 	}
 	malloc_type_freed(mtp, size);
 }
 
 /*
  * Variant of free() that returns zone-backed items through
  * uma_zfree_domain().  The remaining logic matches free(): NULL is
  * ignored, an unknown address panics, and the freed size is credited to
  * 'mtp'.
  */
 void
 free_domain(void *addr, struct malloc_type *mtp)
 {
 	uma_slab_t slab;
 	u_long size;
 
 #ifdef MALLOC_DEBUG
 	/* The debug layer may fully handle the free or adjust addr. */
 	if (free_dbg(&addr, mtp) != 0)
 		return;
 #endif
 
 	/* free(NULL, ...) does nothing */
 	if (addr == NULL)
 		return;
 
 	/* Look up the slab covering this address. */
 	slab = vtoslab((vm_offset_t)addr & (~UMA_SLAB_MASK));
 	if (slab == NULL)
 		panic("free_domain: address %p(%p) has not been allocated.\n",
 		    addr, (void *)((u_long)addr & (~UMA_SLAB_MASK)));
 
 	if (!(slab->us_flags & UMA_SLAB_MALLOC)) {
 		/* Zone-backed item: its size is the keg's item size. */
 		size = slab->us_keg->uk_size;
 #ifdef INVARIANTS
 		free_save_type(addr, mtp, size);
 #endif
 		uma_zfree_domain(LIST_FIRST(&slab->us_keg->uk_zones),
 		    addr, slab);
 	} else {
 		/* Large allocation: the slab itself records the size. */
 		size = slab->us_size;
 		uma_large_free(slab);
 	}
 	malloc_type_freed(mtp, size);
 }
 
 /*
  *	realloc: change the size of a memory block
  *
  *	Returns 'addr' itself when the existing block can be reused,
  *	otherwise a newly allocated block holding a copy of the old
  *	contents (the old block is then freed).  Returns NULL if the new
  *	allocation fails; the original block is left untouched in that case.
  */
 void *
 realloc(void *addr, size_t size, struct malloc_type *mtp, int flags)
 {
 	uma_slab_t slab;
 	unsigned long alloc;
 	void *newaddr;
 
 	KASSERT(mtp->ks_magic == M_MAGIC,
 	    ("realloc: bad malloc type magic"));
 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
 	    ("realloc: called with spinlock or critical section held"));
 
 	/* realloc(NULL, ...) is equivalent to malloc(...) */
 	if (addr == NULL)
 		return (malloc(size, mtp, flags));
 
 	/*
 	 * XXX: Should report free of old memory and alloc of new memory to
 	 * per-CPU stats.
 	 */
 
 #ifdef DEBUG_MEMGUARD
 	/* MemGuard-owned blocks are resized by MemGuard itself. */
 	if (is_memguard_addr(addr))
 		return (memguard_realloc(addr, size, mtp, flags));
 #endif
 
 #ifdef DEBUG_REDZONE
 	/* With RedZone the block is never reused in place; only get its size. */
 	slab = NULL;
 	alloc = redzone_get_size(addr);
 #else
 	slab = vtoslab((vm_offset_t)addr & ~(UMA_SLAB_MASK));
 
 	/* Sanity check */
 	KASSERT(slab != NULL,
 	    ("realloc: address %p out of range", (void *)addr));
 
 	/* Get the size of the original block */
 	if (!(slab->us_flags & UMA_SLAB_MALLOC))
 		alloc = slab->us_keg->uk_size;
 	else
 		alloc = slab->us_size;
 
 	/*
 	 * Reuse the original block if appropriate: the new size must fit,
 	 * and either exceed alloc >> REALLOC_FRACTION or the block must
 	 * already be of size MINALLOCSIZE.
 	 */
 	if (size <= alloc
 	    && (size > (alloc >> REALLOC_FRACTION) || alloc == MINALLOCSIZE))
 		return (addr);
 #endif /* !DEBUG_REDZONE */
 
 	/* Allocate a new, bigger (or smaller) block */
 	if ((newaddr = malloc(size, mtp, flags)) == NULL)
 		return (NULL);
 
 	/* Copy over original contents */
 	bcopy(addr, newaddr, min(size, alloc));
 	free(addr, mtp);
 	return (newaddr);
 }
 
 /*
  *	reallocf: like realloc(), except that the original block is
  *	released when the reallocation fails, so a NULL return never
  *	leaves the caller owning 'addr'.
  */
 void *
 reallocf(void *addr, size_t size, struct malloc_type *mtp, int flags)
 {
 	void *newmem;
 
 	newmem = realloc(addr, size, mtp, flags);
 	if (newmem == NULL)
 		free(addr, mtp);
 	return (newmem);
 }
 
 #ifndef __sparc64__
 CTASSERT(VM_KMEM_SIZE_SCALE >= 1);
 #endif
 
 /*
  * Initialize the kernel memory (kmem) arena.
  *
  * Picks a value for vm_kmem_size (from compile-time defaults, values that
  * were already set, or the physical page count), clamps it, and hands the
  * result to uma_set_limit().  May also lower kmem_zmax on small-memory
  * machines.
  */
 void
 kmeminit(void)
 {
 	u_long mem_size;
 	u_long tmp;
 
 	/* Apply compile-time defaults to any tunable that is still zero. */
 #ifdef VM_KMEM_SIZE
 	if (vm_kmem_size == 0)
 		vm_kmem_size = VM_KMEM_SIZE;
 #endif
 #ifdef VM_KMEM_SIZE_MIN
 	if (vm_kmem_size_min == 0)
 		vm_kmem_size_min = VM_KMEM_SIZE_MIN;
 #endif
 #ifdef VM_KMEM_SIZE_MAX
 	if (vm_kmem_size_max == 0)
 		vm_kmem_size_max = VM_KMEM_SIZE_MAX;
 #endif
 	/*
 	 * Calculate the amount of kernel virtual address (KVA) space that is
 	 * preallocated to the kmem arena.  In order to support a wide range
 	 * of machines, it is a function of the physical memory size,
 	 * specifically,
 	 *
 	 *	min(max(physical memory size / VM_KMEM_SIZE_SCALE,
 	 *	    VM_KMEM_SIZE_MIN), VM_KMEM_SIZE_MAX)
 	 *
 	 * Every architecture must define an integral value for
 	 * VM_KMEM_SIZE_SCALE.  However, the definitions of VM_KMEM_SIZE_MIN
 	 * and VM_KMEM_SIZE_MAX, which represent respectively the floor and
 	 * ceiling on this preallocation, are optional.  Typically,
 	 * VM_KMEM_SIZE_MAX is itself a function of the available KVA space on
 	 * a given architecture.
 	 */
 	/* mem_size is measured in pages, not bytes. */
 	mem_size = vm_cnt.v_page_count;
 	if (mem_size <= 32768) /* delphij XXX 128MB */
 		kmem_zmax = PAGE_SIZE;
 
 	if (vm_kmem_size_scale < 1)
 		vm_kmem_size_scale = VM_KMEM_SIZE_SCALE;
 
 	/*
 	 * Check if we should use defaults for the "vm_kmem_size"
 	 * variable:
 	 */
 	if (vm_kmem_size == 0) {
 		vm_kmem_size = (mem_size / vm_kmem_size_scale) * PAGE_SIZE;
 
 		if (vm_kmem_size_min > 0 && vm_kmem_size < vm_kmem_size_min)
 			vm_kmem_size = vm_kmem_size_min;
 		if (vm_kmem_size_max > 0 && vm_kmem_size >= vm_kmem_size_max)
 			vm_kmem_size = vm_kmem_size_max;
 	}
 
 	/*
 	 * The amount of KVA space that is preallocated to the
 	 * kmem arena can be set statically at compile-time or manually
 	 * through the kernel environment.  However, it is still limited to
 	 * twice the physical memory size, which has been sufficient to handle
 	 * the most severe cases of external fragmentation in the kmem arena. 
 	 */
 	if (vm_kmem_size / 2 / PAGE_SIZE > mem_size)
 		vm_kmem_size = 2 * mem_size * PAGE_SIZE;
 
 	vm_kmem_size = round_page(vm_kmem_size);
 	/* With MemGuard, memguard_fudge() supplies the limit instead. */
 #ifdef DEBUG_MEMGUARD
 	tmp = memguard_fudge(vm_kmem_size, kernel_map);
 #else
 	tmp = vm_kmem_size;
 #endif
 	uma_set_limit(tmp);
 
 #ifdef DEBUG_MEMGUARD
 	/*
 	 * Initialize MemGuard if support compiled in.  MemGuard is a
 	 * replacement allocator used for detecting tamper-after-free
 	 * scenarios as they occur.  It is only used for debugging.
 	 */
 	memguard_init(kernel_arena);
 #endif
 }
 
 /*
  * Initialize the kernel memory allocator
  *
  * Sets up the malloc mutex, sizes the kmem arena via kmeminit(), creates
  * the zone used for per-type bookkeeping (mt_zone) and one UMA zone per
  * kmemzones[] entry and subzone, and fills the kmemsize[] lookup table
  * that maps a rounded request size to its kmemzones[] index.
  */
 /* ARGSUSED*/
 static void
 mallocinit(void *dummy)
 {
 	int i;
 	uint8_t indx;
 
 	mtx_init(&malloc_mtx, "malloc", NULL, MTX_DEF);
 
 	kmeminit();
 
 	/* Fall back to KMEM_ZMAX if kmem_zmax is outside the valid range. */
 	if (kmem_zmax < PAGE_SIZE || kmem_zmax > KMEM_ZMAX)
 		kmem_zmax = KMEM_ZMAX;
 
 	mt_zone = uma_zcreate("mt_zone", sizeof(struct malloc_type_internal),
 #ifdef INVARIANTS
 	    mtrash_ctor, mtrash_dtor, mtrash_init, mtrash_fini,
 #else
 	    NULL, NULL, NULL, NULL,
 #endif
 	    UMA_ALIGN_PTR, UMA_ZONE_MALLOC);
 	/* The kmemzones[] table is terminated by an entry of size 0. */
 	for (i = 0, indx = 0; kmemzones[indx].kz_size != 0; indx++) {
 		int size = kmemzones[indx].kz_size;
 		char *name = kmemzones[indx].kz_name;
 		int subzone;
 
 		for (subzone = 0; subzone < numzones; subzone++) {
 			kmemzones[indx].kz_zone[subzone] =
 			    uma_zcreate(name, size,
 #ifdef INVARIANTS
 			    mtrash_ctor, mtrash_dtor, mtrash_init, mtrash_fini,
 #else
 			    NULL, NULL, NULL, NULL,
 #endif
 			    UMA_ALIGN_PTR, UMA_ZONE_MALLOC);
 		}		    
 		/*
 		 * 'i' carries over between iterations, so each zone claims
 		 * the KMEM_ZBASE-sized steps above the previous zone's size
 		 * up to and including its own size.
 		 */
 		for (;i <= size; i+= KMEM_ZBASE)
 			kmemsize[i >> KMEM_ZSHIFT] = indx;
 
 	}
 }
 SYSINIT(kmem, SI_SUB_KMEM, SI_ORDER_SECOND, mallocinit, NULL);
 
 /*
  * Register a malloc type.  'data' points to the struct malloc_type to
  * register.  Allocates its zeroed internal bookkeeping structure, assigns
  * its subzone, and links it at the head of the global kmemstatistics list
  * under malloc_mtx.  Panics if the type's magic number is wrong.
  */
 void
 malloc_init(void *data)
 {
 	struct malloc_type_internal *mtip;
 	struct malloc_type *mtp;
 
 	KASSERT(vm_cnt.v_page_count != 0, ("malloc_register before vm_init"));
 
 	mtp = data;
 	if (mtp->ks_magic != M_MAGIC)
 		panic("malloc_init: bad malloc type magic");
 
 	/* M_ZERO: all per-type statistics start out at zero. */
 	mtip = uma_zalloc(mt_zone, M_WAITOK | M_ZERO);
 	mtp->ks_handle = mtip;
 	mtp_set_subzone(mtp);
 
 	mtx_lock(&malloc_mtx);
 	mtp->ks_next = kmemstatistics;
 	kmemstatistics = mtp;
 	kmemcount++;
 	mtx_unlock(&malloc_mtx);
 }
 
 /*
  * Unregister a malloc type.  'data' points to the struct malloc_type to
  * remove.  Unlinks it from the kmemstatistics list under malloc_mtx, warns
  * on the console if its statistics show outstanding allocations, and
  * releases its internal bookkeeping structure back to mt_zone.
  */
 void
 malloc_uninit(void *data)
 {
 	struct malloc_type_internal *mtip;
 	struct malloc_type_stats *mtsp;
 	struct malloc_type *mtp, *temp;
 	uma_slab_t slab;
 	long temp_allocs, temp_bytes;
 	int i;
 
 	mtp = data;
 	KASSERT(mtp->ks_magic == M_MAGIC,
 	    ("malloc_uninit: bad malloc type magic"));
 	KASSERT(mtp->ks_handle != NULL, ("malloc_deregister: cookie NULL"));
 
 	mtx_lock(&malloc_mtx);
 	mtip = mtp->ks_handle;
 	mtp->ks_handle = NULL;
 	if (mtp != kmemstatistics) {
 		/* Find the predecessor in the singly linked list and unlink. */
 		for (temp = kmemstatistics; temp != NULL;
 		    temp = temp->ks_next) {
 			if (temp->ks_next == mtp) {
 				temp->ks_next = mtp->ks_next;
 				break;
 			}
 		}
 		/* temp is NULL here only if the type was never on the list. */
 		KASSERT(temp,
 		    ("malloc_uninit: type '%s' not found", mtp->ks_shortdesc));
 	} else
 		kmemstatistics = mtp->ks_next;
 	kmemcount--;
 	mtx_unlock(&malloc_mtx);
 
 	/*
 	 * Look for memory leaks.
 	 */
 	/* Sum allocations minus frees (counts and bytes) across all CPUs. */
 	temp_allocs = temp_bytes = 0;
 	for (i = 0; i < MAXCPU; i++) {
 		mtsp = &mtip->mti_stats[i];
 		temp_allocs += mtsp->mts_numallocs;
 		temp_allocs -= mtsp->mts_numfrees;
 		temp_bytes += mtsp->mts_memalloced;
 		temp_bytes -= mtsp->mts_memfreed;
 	}
 	if (temp_allocs > 0 || temp_bytes > 0) {
 		printf("Warning: memory type %s leaked memory on destroy "
 		    "(%ld allocations, %ld bytes leaked).\n", mtp->ks_shortdesc,
 		    temp_allocs, temp_bytes);
 	}
 
 	slab = vtoslab((vm_offset_t) mtip & (~UMA_SLAB_MASK));
 	uma_zfree_arg(mt_zone, mtip, slab);
 }
 
 /*
  * Find a registered malloc type by its short description.  The caller
  * must hold malloc_mtx.  Returns the first type whose ks_shortdesc equals
  * 'desc', or NULL when there is none.
  */
 struct malloc_type *
 malloc_desc2type(const char *desc)
 {
 	struct malloc_type *cur;
 
 	mtx_assert(&malloc_mtx, MA_OWNED);
 	cur = kmemstatistics;
 	while (cur != NULL && strcmp(cur->ks_shortdesc, desc) != 0)
 		cur = cur->ks_next;
 	return (cur);
 }
 
 /*
  * Sysctl handler that exports malloc type statistics as a binary stream:
  * one stream header, then for every registered type a type header
  * followed by MAXCPU per-CPU statistics records.  The type list is walked
  * with malloc_mtx held.  Returns 0 or an error from wiring the user buffer
  * or finishing the sbuf.
  */
 static int
 sysctl_kern_malloc_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct malloc_type_stream_header mtsh;
 	struct malloc_type_internal *mtip;
 	struct malloc_type_header mth;
 	struct malloc_type *mtp;
 	int error, i;
 	struct sbuf sbuf;
 
 	/* Wire the output buffer before taking the mutex. */
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 	/* Binary output: clear the SBUF_INCLUDENUL flag. */
 	sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL);
 	mtx_lock(&malloc_mtx);
 
 	/*
 	 * Insert stream header.
 	 */
 	bzero(&mtsh, sizeof(mtsh));
 	mtsh.mtsh_version = MALLOC_TYPE_STREAM_VERSION;
 	mtsh.mtsh_maxcpus = MAXCPU;
 	mtsh.mtsh_count = kmemcount;
 	(void)sbuf_bcat(&sbuf, &mtsh, sizeof(mtsh));
 
 	/*
 	 * Insert alternating sequence of type headers and type statistics.
 	 */
 	for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) {
 		mtip = (struct malloc_type_internal *)mtp->ks_handle;
 
 		/*
 		 * Insert type header.
 		 */
 		bzero(&mth, sizeof(mth));
 		strlcpy(mth.mth_name, mtp->ks_shortdesc, MALLOC_MAX_NAME);
 		(void)sbuf_bcat(&sbuf, &mth, sizeof(mth));
 
 		/*
 		 * Insert type statistics for each CPU.
 		 */
 		for (i = 0; i < MAXCPU; i++) {
 			(void)sbuf_bcat(&sbuf, &mtip->mti_stats[i],
 			    sizeof(mtip->mti_stats[i]));
 		}
 	}
 	mtx_unlock(&malloc_mtx);
 	/* Individual sbuf_bcat() results are ignored; errors surface here. */
 	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
 	return (error);
 }
 
 SYSCTL_PROC(_kern, OID_AUTO, malloc_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
     0, 0, sysctl_kern_malloc_stats, "s,malloc_type_ustats",
     "Return malloc types");
 
 SYSCTL_INT(_kern, OID_AUTO, malloc_count, CTLFLAG_RD, &kmemcount, 0,
     "Count of kernel malloc types");
 
 /*
  * Invoke 'func' once for every registered malloc type, passing 'arg'
  * through unchanged.
  *
  * The callbacks run without malloc_mtx held, so the list is first
  * snapshotted into a temporary array.  The array must be allocated with
  * the mutex dropped (M_WAITOK may sleep); if more types were registered in
  * the meantime the snapshot is retried with a larger array.
  */
 void
 malloc_type_list(malloc_type_list_func_t *func, void *arg)
 {
 	struct malloc_type *mtp, **bufmtp;
 	int count, i, nfilled;
 	size_t buflen;
 
 	mtx_lock(&malloc_mtx);
 restart:
 	mtx_assert(&malloc_mtx, MA_OWNED);
 	count = kmemcount;
 	mtx_unlock(&malloc_mtx);
 
 	buflen = sizeof(struct malloc_type *) * count;
 	bufmtp = malloc(buflen, M_TEMP, M_WAITOK);
 
 	mtx_lock(&malloc_mtx);
 
 	/* The list grew while unlocked: the array is too small, try again. */
 	if (count < kmemcount) {
 		free(bufmtp, M_TEMP);
 		goto restart;
 	}
 
 	for (mtp = kmemstatistics, i = 0; mtp != NULL && i < count;
 	    mtp = mtp->ks_next, i++)
 		bufmtp[i] = mtp;
 
 	/*
 	 * The list may have shrunk while unlocked, in which case fewer than
 	 * 'count' slots were written.  Only the filled slots may be handed to
 	 * the callback; the rest of the array is uninitialized.
 	 */
 	nfilled = i;
 
 	mtx_unlock(&malloc_mtx);
 
 	for (i = 0; i < nfilled; i++)
 		(func)(bufmtp[i], arg);
 
 	free(bufmtp, M_TEMP);
 }
 
 #ifdef DDB
 /*
  * DDB "show malloc" command: print one line per registered malloc type
  * with the number of allocations in use, the memory in use rounded up to
  * kilobytes, and the total number of allocation requests, each summed
  * over all CPUs.  The list is walked without taking malloc_mtx.
  */
 DB_SHOW_COMMAND(malloc, db_show_malloc)
 {
 	struct malloc_type_internal *mtip;
 	struct malloc_type *mtp;
 	uint64_t allocs, frees;
 	uint64_t alloced, freed;
 	int i;
 
 	db_printf("%18s %12s  %12s %12s\n", "Type", "InUse", "MemUse",
 	    "Requests");
 	for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) {
 		mtip = (struct malloc_type_internal *)mtp->ks_handle;
 		allocs = 0;
 		frees = 0;
 		alloced = 0;
 		freed = 0;
 		/* Fold the per-CPU counters into totals for this type. */
 		for (i = 0; i < MAXCPU; i++) {
 			allocs += mtip->mti_stats[i].mts_numallocs;
 			frees += mtip->mti_stats[i].mts_numfrees;
 			alloced += mtip->mti_stats[i].mts_memalloced;
 			freed += mtip->mti_stats[i].mts_memfreed;
 		}
 		db_printf("%18s %12ju %12juK %12ju\n",
 		    mtp->ks_shortdesc, allocs - frees,
 		    (alloced - freed + 1023) / 1024, allocs);
 		/* Stop early if the user quit the pager. */
 		if (db_pager_quit)
 			break;
 	}
 }
 
 #if MALLOC_DEBUG_MAXZONES > 1
 /*
  * DDB "show multizone_matches <addr>" command: 'addr' is the address of a
  * struct malloc_type.  Prints the short description of every registered
  * malloc type whose mti_zone equals that of the given type.
  */
 DB_SHOW_COMMAND(multizone_matches, db_show_multizone_matches)
 {
 	struct malloc_type_internal *mtip;
 	struct malloc_type *mtp;
 	u_int subzone;
 
 	if (!have_addr) {
 		db_printf("Usage: show multizone_matches <malloc type/addr>\n");
 		return;
 	}
 	mtp = (void *)addr;
 	/* Reject addresses that do not point at a malloc type. */
 	if (mtp->ks_magic != M_MAGIC) {
 		db_printf("Magic %lx does not match expected %x\n",
 		    mtp->ks_magic, M_MAGIC);
 		return;
 	}
 
 	mtip = mtp->ks_handle;
 	subzone = mtip->mti_zone;
 
 	/* mtp is reused as the list cursor from here on. */
 	for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) {
 		mtip = mtp->ks_handle;
 		if (mtip->mti_zone != subzone)
 			continue;
 		db_printf("%s\n", mtp->ks_shortdesc);
 		/* Stop early if the user quit the pager. */
 		if (db_pager_quit)
 			break;
 	}
 }
 #endif /* MALLOC_DEBUG_MAXZONES > 1 */
 #endif /* DDB */
 
 #ifdef MALLOC_PROFILE
 
 /*
  * Sysctl handler producing a text report of malloc request profiling.
  * For each kmemsize[] slot it prints the request size for that slot, the
  * number of requests counted in krequests[], and the size of the zone
  * serving it, followed by totals for memory used and memory wasted
  * (zone size minus request size, times the request count).
  */
 static int
 sysctl_kern_mprof(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sbuf;
 	uint64_t count;
 	uint64_t waste;
 	uint64_t mem;
 	int error;
 	int rsize;
 	int size;
 	int i;
 
 	waste = 0;
 	mem = 0;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 	sbuf_printf(&sbuf, 
 	    "\n  Size                    Requests  Real Size\n");
 	for (i = 0; i < KMEM_ZSIZE; i++) {
 		/* size: request size for slot i; rsize: serving zone's size. */
 		size = i << KMEM_ZSHIFT;
 		rsize = kmemzones[kmemsize[i]].kz_size;
 		count = (long long unsigned)krequests[i];
 
 		sbuf_printf(&sbuf, "%6d%28llu%11d\n", size,
 		    (unsigned long long)count, rsize);
 
 		/* Only count waste when the zone is larger than the request. */
 		if ((rsize * count) > (size * count))
 			waste += (rsize * count) - (size * count);
 		mem += (rsize * count);
 	}
 	sbuf_printf(&sbuf,
 	    "\nTotal memory used:\t%30llu\nTotal Memory wasted:\t%30llu\n",
 	    (unsigned long long)mem, (unsigned long long)waste);
 	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
 	return (error);
 }
 
 SYSCTL_OID(_kern, OID_AUTO, mprof, CTLTYPE_STRING|CTLFLAG_RD,
     NULL, 0, sysctl_kern_mprof, "A", "Malloc Profiling");
 #endif /* MALLOC_PROFILE */
Index: head/sys/kern/subr_vmem.c
===================================================================
--- head/sys/kern/subr_vmem.c	(revision 335067)
+++ head/sys/kern/subr_vmem.c	(revision 335068)
@@ -1,1625 +1,1628 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c)2006,2007,2008,2009 YAMAMOTO Takashi,
  * Copyright (c) 2013 EMC Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * From:
  *	$NetBSD: vmem_impl.h,v 1.2 2013/01/29 21:26:24 para Exp $
  *	$NetBSD: subr_vmem.c,v 1.83 2013/03/06 11:20:10 yamt Exp $
  */
 
 /*
  * reference:
  * -	Magazines and Vmem: Extending the Slab Allocator
  *	to Many CPUs and Arbitrary Resources
  *	http://www.usenix.org/event/usenix01/bonwick.html
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/queue.h>
 #include <sys/callout.h>
 #include <sys/hash.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/smp.h>
 #include <sys/condvar.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <sys/vmem.h>
 #include <sys/vmmeter.h>
 
 #include "opt_vm.h"
 
 #include <vm/uma.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_param.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_pagequeue.h>
 #include <vm/uma_int.h>
 
 int	vmem_startup_count(void);
 
 #define	VMEM_OPTORDER		5
 #define	VMEM_OPTVALUE		(1 << VMEM_OPTORDER)
 #define	VMEM_MAXORDER						\
     (VMEM_OPTVALUE - 1 + sizeof(vmem_size_t) * NBBY - VMEM_OPTORDER)
 
 #define	VMEM_HASHSIZE_MIN	16
 #define	VMEM_HASHSIZE_MAX	131072
 
 #define	VMEM_QCACHE_IDX_MAX	16
 
 #define	VMEM_FITMASK	(M_BESTFIT | M_FIRSTFIT)
 
 #define	VMEM_FLAGS						\
     (M_NOWAIT | M_WAITOK | M_USE_RESERVE | M_NOVM | M_BESTFIT | M_FIRSTFIT)
 
 #define	BT_FLAGS	(M_NOWAIT | M_WAITOK | M_USE_RESERVE | M_NOVM)
 
 #define	QC_NAME_MAX	16
 
 /*
  * Data structures private to vmem.
  */
 MALLOC_DEFINE(M_VMEM, "vmem", "vmem internal structures");
 
 typedef struct vmem_btag bt_t;
 
 TAILQ_HEAD(vmem_seglist, vmem_btag);
 LIST_HEAD(vmem_freelist, vmem_btag);
 LIST_HEAD(vmem_hashlist, vmem_btag);
 
 /*
  * Per-arena quantum cache entry; struct vmem holds an array of
  * VMEM_QCACHE_IDX_MAX of these.
  */
 struct qcache {
 	uma_zone_t	qc_cache;	/* UMA zone for this cache */
 	vmem_t 		*qc_vmem;	/* presumably the owning arena */
 	vmem_size_t	qc_size;	/* presumably the cached item size */
 	char		qc_name[QC_NAME_MAX];	/* name buffer */
 };
 typedef struct qcache qcache_t;
 #define	QC_POOL_TO_QCACHE(pool)	((qcache_t *)(pool->pr_qcache))
 
 #define	VMEM_NAME_MAX	16
 
 /* vmem arena */
 struct vmem {
 	/* Mutex behind VMEM_LOCK()/VMEM_UNLOCK(). */
 	struct mtx_padalign	vm_lock;
 	/* Condition variable behind the VMEM_CONDVAR_*() macros. */
 	struct cv		vm_cv;
 	char			vm_name[VMEM_NAME_MAX+1];
 	/* Linkage on a list of arenas. */
 	LIST_ENTRY(vmem)	vm_alllist;
 	/* Built-in hash table storage of the minimum size. */
 	struct vmem_hashlist	vm_hash0[VMEM_HASHSIZE_MIN];
 	/* Free tags bucketed by SIZE2ORDER() of their size in quanta. */
 	struct vmem_freelist	vm_freelist[VMEM_MAXORDER];
 	/* Boundary tags linked through bt_seglist. */
 	struct vmem_seglist	vm_seglist;
 	/* Busy-tag hash table, indexed by hash of address % vm_hashsize. */
 	struct vmem_hashlist	*vm_hashlist;
 	vmem_size_t		vm_hashsize;
 
 	/* Constant after init */
 	vmem_size_t		vm_qcache_max;
 	vmem_size_t		vm_quantum_mask;
 	vmem_size_t		vm_import_quantum;
 	int			vm_quantum_shift;
 
 	/* Written on alloc/free */
 	/* Cache of unused boundary tags and its length. */
 	LIST_HEAD(, vmem_btag)	vm_freetags;
 	int			vm_nfreetags;
 	/* Number of busy tags and the sum of their sizes. */
 	int			vm_nbusytag;
 	vmem_size_t		vm_inuse;
 	vmem_size_t		vm_size;
 	vmem_size_t		vm_limit;
 
 	/* Used on import. */
 	vmem_import_t		*vm_importfn;
 	vmem_release_t		*vm_releasefn;
 	void			*vm_arg;
 
 	/* Space exhaustion callback. */
 	vmem_reclaim_t		*vm_reclaimfn;
 
 	/* quantum cache */
 	qcache_t		vm_qcache[VMEM_QCACHE_IDX_MAX];
 };
 
 /*
  * boundary tag
  *
  * Describes one address range [bt_start, bt_start + bt_size - 1] of an
  * arena; bt_type holds one of the BT_TYPE_* values.
  */
 struct vmem_btag {
 	/* Linkage on the arena's vm_seglist. */
 	TAILQ_ENTRY(vmem_btag) bt_seglist;
 	/* A tag is on a free list or in the busy hash, never both. */
 	union {
 		LIST_ENTRY(vmem_btag) u_freelist; /* BT_TYPE_FREE */
 		LIST_ENTRY(vmem_btag) u_hashlist; /* BT_TYPE_BUSY */
 	} bt_u;
 #define	bt_hashlist	bt_u.u_hashlist
 #define	bt_freelist	bt_u.u_freelist
 	vmem_addr_t	bt_start;	/* first address of the range */
 	vmem_size_t	bt_size;	/* length of the range */
 	int		bt_type;	/* BT_TYPE_* */
 };
 
 #define	BT_TYPE_SPAN		1	/* Allocated from importfn */
 #define	BT_TYPE_SPAN_STATIC	2	/* vmem_add() or create. */
 #define	BT_TYPE_FREE		3	/* Available space. */
 #define	BT_TYPE_BUSY		4	/* Used space. */
 #define	BT_ISSPAN_P(bt)	((bt)->bt_type <= BT_TYPE_SPAN_STATIC)
 
 #define	BT_END(bt)	((bt)->bt_start + (bt)->bt_size - 1)
 
 #if defined(DIAGNOSTIC)
 static int enable_vmem_check = 1;
 SYSCTL_INT(_debug, OID_AUTO, vmem_check, CTLFLAG_RWTUN,
     &enable_vmem_check, 0, "Enable vmem check");
 static void vmem_check(vmem_t *);
 #endif
 
 static struct callout	vmem_periodic_ch;
 static int		vmem_periodic_interval;
 static struct task	vmem_periodic_wk;
 
 static struct mtx_padalign __exclusive_cache_line vmem_list_lock;
 static LIST_HEAD(, vmem) vmem_list = LIST_HEAD_INITIALIZER(vmem_list);
 static uma_zone_t vmem_zone;
 
 /* ---- misc */
 #define	VMEM_CONDVAR_INIT(vm, wchan)	cv_init(&vm->vm_cv, wchan)
 #define	VMEM_CONDVAR_DESTROY(vm)	cv_destroy(&vm->vm_cv)
 #define	VMEM_CONDVAR_WAIT(vm)		cv_wait(&vm->vm_cv, &vm->vm_lock)
 #define	VMEM_CONDVAR_BROADCAST(vm)	cv_broadcast(&vm->vm_cv)
 
 
 #define	VMEM_LOCK(vm)		mtx_lock(&vm->vm_lock)
 #define	VMEM_TRYLOCK(vm)	mtx_trylock(&vm->vm_lock)
 #define	VMEM_UNLOCK(vm)		mtx_unlock(&vm->vm_lock)
 #define	VMEM_LOCK_INIT(vm, name) mtx_init(&vm->vm_lock, (name), NULL, MTX_DEF)
 #define	VMEM_LOCK_DESTROY(vm)	mtx_destroy(&vm->vm_lock)
 #define	VMEM_ASSERT_LOCKED(vm)	mtx_assert(&vm->vm_lock, MA_OWNED);
 
 #define	VMEM_ALIGNUP(addr, align)	(-(-(addr) & -(align)))
 
 #define	VMEM_CROSS_P(addr1, addr2, boundary) \
 	((((addr1) ^ (addr2)) & -(boundary)) != 0)
 
 #define	ORDER2SIZE(order)	((order) < VMEM_OPTVALUE ? ((order) + 1) : \
     (vmem_size_t)1 << ((order) - (VMEM_OPTVALUE - VMEM_OPTORDER - 1)))
 #define	SIZE2ORDER(size)	((size) <= VMEM_OPTVALUE ? ((size) - 1) : \
     (flsl(size) + (VMEM_OPTVALUE - VMEM_OPTORDER - 2)))
 
 /*
  * Maximum number of boundary tags that may be required to satisfy an
  * allocation.  Two may be required to import.  Another two may be
  * required to clip edges.
  */
 #define	BT_MAXALLOC	4
 
 /*
  * Max free limits the number of locally cached boundary tags.  We
  * just want to avoid hitting the zone allocator for every call.
  */
 #define BT_MAXFREE	(BT_MAXALLOC * 8)
 
 /* Allocator for boundary tags. */
 static uma_zone_t vmem_bt_zone;
 
 /* boot time arena storage. */
 static struct vmem kernel_arena_storage;
 static struct vmem buffer_arena_storage;
 static struct vmem transient_arena_storage;
 /* kernel and kmem arenas are aliased for backwards KPI compat. */
 vmem_t *kernel_arena = &kernel_arena_storage;
+#if VM_NRESERVLEVEL > 0
+vmem_t *kernel_rwx_arena = NULL;
+#endif
 vmem_t *kmem_arena = &kernel_arena_storage;
 vmem_t *buffer_arena = &buffer_arena_storage;
 vmem_t *transient_arena = &transient_arena_storage;
 
 #ifdef DEBUG_MEMGUARD
 static struct vmem memguard_arena_storage;
 vmem_t *memguard_arena = &memguard_arena_storage;
 #endif
 
 /*
  * Fill the vmem's boundary tag cache.  We guarantee that boundary tag
  * allocation will not fail once bt_fill() passes.  To do so we cache
  * at least the maximum possible tag allocations in the arena.
  *
  * Must be called with the arena lock held.  The lock may be dropped and
  * reacquired while allocating.  Returns 0 once at least BT_MAXALLOC tags
  * are cached, ENOMEM otherwise.
  */
 static int
 bt_fill(vmem_t *vm, int flags)
 {
 	bt_t *bt;
 
 	VMEM_ASSERT_LOCKED(vm);
 
 	/*
 	 * Only allow the kernel arena and arenas derived from kernel arena to
 	 * dip into reserve tags.  They are where new tags come from.
 	 */
 	flags &= BT_FLAGS;
 	if (vm != kernel_arena && vm->vm_arg != kernel_arena)
 		flags &= ~M_USE_RESERVE;
 
 	/*
 	 * Loop until we meet the reserve.  To minimize the lock shuffle
 	 * and prevent simultaneous fills we first try a NOWAIT regardless
 	 * of the caller's flags.  Specify M_NOVM so we don't recurse while
 	 * holding a vmem lock.
 	 */
 	while (vm->vm_nfreetags < BT_MAXALLOC) {
 		bt = uma_zalloc(vmem_bt_zone,
 		    (flags & M_USE_RESERVE) | M_NOWAIT | M_NOVM);
 		if (bt == NULL) {
 			/* Retry with the caller's flags, lock dropped. */
 			VMEM_UNLOCK(vm);
 			bt = uma_zalloc(vmem_bt_zone, flags);
 			VMEM_LOCK(vm);
 			/*
 			 * NOTE(review): a NULL tag is only handled for
 			 * M_NOWAIT; presumably the allocation cannot fail
 			 * otherwise -- confirm.
 			 */
 			if (bt == NULL && (flags & M_NOWAIT) != 0)
 				break;
 		}
 		LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist);
 		vm->vm_nfreetags++;
 	}
 
 	if (vm->vm_nfreetags < BT_MAXALLOC)
 		return ENOMEM;
 
 	return 0;
 }
 
 /*
  * Take one boundary tag from the arena's cache of free tags.  The arena
  * lock must be held and the cache must not be empty.
  */
 static bt_t *
 bt_alloc(vmem_t *vm)
 {
 	bt_t *tag;
 
 	VMEM_ASSERT_LOCKED(vm);
 	tag = LIST_FIRST(&vm->vm_freetags);
 	MPASS(tag != NULL);
 	vm->vm_nfreetags--;
 	LIST_REMOVE(tag, bt_freelist);
 
 	return (tag);
 }
 
 /*
  * Trim the per-vmem free list.  Returns with the lock released to
  * avoid allocator recursions.
  *
  * Must be called with the arena lock held.  Cached tags in excess of
  * 'freelimit' are moved to a private list under the lock and handed back
  * to the tag zone only after the lock has been dropped.
  */
 static void
 bt_freetrim(vmem_t *vm, int freelimit)
 {
 	LIST_HEAD(, vmem_btag) freetags;
 	bt_t *bt;
 
 	LIST_INIT(&freetags);
 	VMEM_ASSERT_LOCKED(vm);
 	/* Detach the surplus tags while still holding the lock. */
 	while (vm->vm_nfreetags > freelimit) {
 		bt = LIST_FIRST(&vm->vm_freetags);
 		LIST_REMOVE(bt, bt_freelist);
 		vm->vm_nfreetags--;
 		LIST_INSERT_HEAD(&freetags, bt, bt_freelist);
 	}
 	VMEM_UNLOCK(vm);
 	/* Free the detached tags without the arena lock. */
 	while ((bt = LIST_FIRST(&freetags)) != NULL) {
 		LIST_REMOVE(bt, bt_freelist);
 		uma_zfree(vmem_bt_zone, bt);
 	}
 }
 
 /*
  * Return a boundary tag to the arena's cache of free tags.  The arena
  * lock must be held.  The tag is not handed back to the tag zone here.
  */
 static inline void
 bt_free(vmem_t *vm, bt_t *bt)
 {
 
 	VMEM_ASSERT_LOCKED(vm);
 	/* Catch the same tag being pushed twice in a row. */
 	MPASS(LIST_FIRST(&vm->vm_freetags) != bt);
 	LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist);
 	vm->vm_nfreetags++;
 }
 
 /*
  * freelist[0] ... [1, 1]
  * freelist[1] ... [2, 2]
  *  :
  * freelist[29] ... [30, 30]
  * freelist[30] ... [31, 31]
  * freelist[31] ... [32, 63]
  * freelist[32] ... [64, 127]
  *  :
  * freelist[n] ... [(1 << (n - 26)), (1 << (n - 25)) - 1]
  *  :
  */
 
 /*
  * bt_freehead_tofree: return the freelist on which a free segment of the
  * given size belongs.  'size' is in bytes and must be a non-zero multiple
  * of the arena quantum; the list is chosen by SIZE2ORDER() of the size
  * expressed in quanta.
  */
 static struct vmem_freelist *
 bt_freehead_tofree(vmem_t *vm, vmem_size_t size)
 {
 	/* Size in units of the arena quantum. */
 	const vmem_size_t qsize = size >> vm->vm_quantum_shift;
 	const int idx = SIZE2ORDER(qsize);
 
 	MPASS(size != 0 && qsize != 0);
 	MPASS((size & vm->vm_quantum_mask) == 0);
 	MPASS(idx >= 0);
 	MPASS(idx < VMEM_MAXORDER);
 
 	return &vm->vm_freelist[idx];
 }
 
 /*
  * bt_freehead_toalloc: choose the freelist at which a search for 'size'
  * bytes should begin, given the allocation strategy.
  *
  * With M_FIRSTFIT the returned list is one on which every block is big
  * enough for the request.  With any other strategy the returned list is
  * the one that may contain blocks big enough, so entries on it still
  * have to be checked individually.
  */
 static struct vmem_freelist *
 bt_freehead_toalloc(vmem_t *vm, vmem_size_t size, int strat)
 {
 	const vmem_size_t qsize = size >> vm->vm_quantum_shift;
 	int idx = SIZE2ORDER(qsize);
 
 	MPASS(size != 0 && qsize != 0);
 	MPASS((size & vm->vm_quantum_mask) == 0);
 
 	/*
 	 * If the request is not exactly the size this order stands for,
 	 * first-fit moves up one list.
 	 * XXX an overly large request is not checked for here.
 	 */
 	if (strat == M_FIRSTFIT && ORDER2SIZE(idx) != qsize)
 		idx += 1;
 
 	MPASS(idx >= 0);
 	MPASS(idx < VMEM_MAXORDER);
 
 	return (vm->vm_freelist + idx);
 }
 
 /* ---- boundary tag hash */
 
 /*
  * Map an address to its bucket in the arena's busy-tag hash table.
  */
 static struct vmem_hashlist *
 bt_hashhead(vmem_t *vm, vmem_addr_t addr)
 {
 	unsigned int hash;
 
 	/* Hash the raw bytes of the address, then fold into the table. */
 	hash = hash32_buf(&addr, sizeof(addr), 0);
 
 	return (&vm->vm_hashlist[hash % vm->vm_hashsize]);
 }
 
 /*
  * Find the busy tag whose segment starts at 'addr'.  Returns NULL when
  * the hash bucket holds no such tag.  The arena lock must be held.
  */
 static bt_t *
 bt_lookupbusy(vmem_t *vm, vmem_addr_t addr)
 {
 	struct vmem_hashlist *bucket;
 	bt_t *bt;
 
 	VMEM_ASSERT_LOCKED(vm);
 	bucket = bt_hashhead(vm, addr);
 	for (bt = LIST_FIRST(bucket); bt != NULL;
 	    bt = LIST_NEXT(bt, bt_hashlist)) {
 		if (bt->bt_start == addr)
 			return (bt);
 	}
 
 	return (NULL);
 }
 
 /*
  * Unhash a busy tag and subtract it from the arena's in-use accounting.
  * The arena lock must be held.
  */
 static void
 bt_rembusy(vmem_t *vm, bt_t *bt)
 {
 
 	VMEM_ASSERT_LOCKED(vm);
 	MPASS(vm->vm_nbusytag > 0);
 	LIST_REMOVE(bt, bt_hashlist);
 	vm->vm_nbusytag--;
 	vm->vm_inuse -= bt->bt_size;
 }
 
 /*
  * Hash a busy tag by its start address and add it to the arena's
  * in-use accounting.  The arena lock must be held.
  */
 static void
 bt_insbusy(vmem_t *vm, bt_t *bt)
 {
 	struct vmem_hashlist *bucket;
 
 	VMEM_ASSERT_LOCKED(vm);
 	MPASS(bt->bt_type == BT_TYPE_BUSY);
 
 	vm->vm_inuse += bt->bt_size;
 	vm->vm_nbusytag++;
 	bucket = bt_hashhead(vm, bt->bt_start);
 	LIST_INSERT_HEAD(bucket, bt, bt_hashlist);
 }
 
 /* ---- boundary tag list */
 
 /*
  * Unlink a tag from the arena's segment list and return it to the
  * stock of free tags.
  */
 static void
 bt_remseg(vmem_t *vm, bt_t *bt)
 {
 
 	TAILQ_REMOVE(&vm->vm_seglist, bt, bt_seglist);
 	/* The tag is no longer referenced; recycle it. */
 	bt_free(vm, bt);
 }
 
 /*
  * Link 'bt' into the arena's segment list directly after 'prev'.
  */
 static void
 bt_insseg(vmem_t *vm, bt_t *bt, bt_t *prev)
 {
 
 	TAILQ_INSERT_AFTER(&vm->vm_seglist, prev, bt, bt_seglist);
 }
 
 /*
  * Append 'bt' to the end of the arena's segment list.
  */
 static void
 bt_insseg_tail(vmem_t *vm, bt_t *bt)
 {
 
 	TAILQ_INSERT_TAIL(&vm->vm_seglist, bt, bt_seglist);
 }
 
 /*
  * Take a free tag off whichever size-class freelist it is on.
  */
 static void
 bt_remfree(vmem_t *vm, bt_t *bt)
 {
 
 	MPASS(bt->bt_type == BT_TYPE_FREE);
 	LIST_REMOVE(bt, bt_freelist);
 }
 
 /*
  * File a free tag on the freelist that matches its current size.
  */
 static void
 bt_insfree(vmem_t *vm, bt_t *bt)
 {
 	struct vmem_freelist *head;
 
 	head = bt_freehead_tofree(vm, bt->bt_size);
 	LIST_INSERT_HEAD(head, bt, bt_freelist);
 }
 
 /* ---- vmem internal functions */
 
 /*
  * UMA import callback for a quantum cache: pull up to 'cnt' items of
  * the cache's size out of the backing arena and record their addresses
  * in 'store'.  Returns the number of items actually obtained.
  */
 static int
 qc_import(void *arg, void **store, int cnt, int domain, int flags)
 {
 	qcache_t *qc = arg;
 	vmem_addr_t addr;
 	int nalloc;
 
 	/* Fall back to best-fit when the caller named no strategy. */
 	if ((flags & VMEM_FITMASK) == 0)
 		flags |= M_BESTFIT;
 
 	nalloc = 0;
 	while (nalloc < cnt) {
 		if (vmem_xalloc(qc->qc_vmem, qc->qc_size, 0, 0, 0,
 		    VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags, &addr) != 0)
 			break;
 		store[nalloc++] = (void *)addr;
 		/* Only the first item is guaranteed; the rest must not sleep. */
 		flags = (flags & ~M_WAITOK) | M_NOWAIT;
 	}
 
 	return (nalloc);
 }
 
 /*
  * Release memory from the UMA cache to the arena.
  */
 static void
 qc_release(void *arg, void **store, int cnt)
 {
 	qcache_t *qc;
 	int i;
 
 	qc = arg;
 	for (i = 0; i < cnt; i++)
 		vmem_xfree(qc->qc_vmem, (vmem_addr_t)store[i], qc->qc_size);
 }
 
 /*
  * Create the arena's quantum caches: one UMA cache zone for every
  * multiple of the quantum up to 'qcache_max', capped at
  * VMEM_QCACHE_IDX_MAX caches.  'qcache_max' must be quantum aligned.
  */
 static void
 qc_init(vmem_t *vm, vmem_size_t qcache_max)
 {
 	qcache_t *qc;
 	vmem_size_t size;
 	int qcache_idx_max;
 	int i;
 
 	MPASS((qcache_max & vm->vm_quantum_mask) == 0);
 
 	qcache_idx_max = MIN(qcache_max >> vm->vm_quantum_shift,
 	    VMEM_QCACHE_IDX_MAX);
 	vm->vm_qcache_max = qcache_idx_max << vm->vm_quantum_shift;
 
 	i = 0;
 	while (i < qcache_idx_max) {
 		/* Cache i serves requests of (i + 1) quanta. */
 		size = (i + 1) << vm->vm_quantum_shift;
 		qc = &vm->vm_qcache[i];
 		qc->qc_vmem = vm;
 		qc->qc_size = size;
 		snprintf(qc->qc_name, sizeof(qc->qc_name), "%s-%zu",
 		    vm->vm_name, size);
 		qc->qc_cache = uma_zcache_create(qc->qc_name, size,
 		    NULL, NULL, NULL, NULL, qc_import, qc_release, qc,
 		    UMA_ZONE_VM);
 		MPASS(qc->qc_cache);
 		i++;
 	}
 }
 
 /*
  * Destroy every quantum cache zone that qc_init() created.
  */
 static void
 qc_destroy(vmem_t *vm)
 {
 	int i, ncaches;
 
 	ncaches = vm->vm_qcache_max >> vm->vm_quantum_shift;
 	for (i = 0; i < ncaches; i++) {
 		uma_zdestroy(vm->vm_qcache[i].qc_cache);
 	}
 }
 
 /*
  * Drain every quantum cache zone of the arena.
  */
 static void
 qc_drain(vmem_t *vm)
 {
 	int i, ncaches;
 
 	ncaches = vm->vm_qcache_max >> vm->vm_quantum_shift;
 	for (i = 0; i < ncaches; i++) {
 		zone_drain(vm->vm_qcache[i].qc_cache);
 	}
 }
 
 #ifndef UMA_MD_SMALL_ALLOC
 
 static struct mtx_padalign __exclusive_cache_line vmem_bt_lock;
 
 /*
  * vmem_bt_alloc:  Allocate a new page of boundary tags.
  *
  * On architectures with uma_small_alloc there is no recursion; no address
  * space need be allocated to allocate boundary tags.  For the others, we
  * must handle recursion.  Boundary tags are necessary to allocate new
  * boundary tags.
  *
  * UMA guarantees that enough tags are held in reserve to allocate a new
  * page of kva.  We dip into this reserve by specifying M_USE_RESERVE only
  * when allocating the page to hold new boundary tags.  In this way the
  * reserve is automatically filled by the allocation that uses the reserve.
  * 
  * We still have to guarantee that the new tags are allocated atomically since
  * many threads may try concurrently.  The bt_lock provides this guarantee.
  * We convert WAITOK allocations to NOWAIT and then handle the blocking here
  * on failure.  It's ok to return NULL for a WAITOK allocation as UMA will
  * loop again after checking to see if we lost the race to allocate.
  *
  * There is a small race between vmem_bt_alloc() returning the page and the
  * zone lock being acquired to add the page to the zone.  For WAITOK
  * allocations we just pause briefly.  NOWAIT may experience a transient
  * failure.  To alleviate this we permit a small number of simultaneous
  * fills to proceed concurrently so NOWAIT is less likely to fail unless
  * we are really out of KVA.
  */
 static void *
 vmem_bt_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
     int wait)
 {
 	vmem_addr_t addr;
 	int backed;
 
 	*pflag = UMA_SLAB_KERNEL;
 
 	/*
 	 * Serialize fills so that reserving the address range and
 	 * backing it with memory look like one step to other threads.
 	 */
 	mtx_lock(&vmem_bt_lock);
 	if (vmem_xalloc(vm_dom[domain].vmd_kernel_arena, bytes, 0, 0, 0,
 	    VMEM_ADDR_MIN, VMEM_ADDR_MAX,
 	    M_NOWAIT | M_NOVM | M_USE_RESERVE | M_BESTFIT, &addr) != 0) {
 		mtx_unlock(&vmem_bt_lock);
 		/*
 		 * We're either out of address space or lost a fill race.
 		 */
 		if (wait & M_WAITOK)
 			pause("btalloc", 1);
 		return (NULL);
 	}
 
 	backed = kmem_back_domain(domain, kernel_object, addr, bytes,
 	    M_NOWAIT | M_USE_RESERVE);
 	if (backed != 0) {
 		/* Give the address range back before dropping the lock. */
 		vmem_xfree(vm_dom[domain].vmd_kernel_arena, addr, bytes);
 	}
 	mtx_unlock(&vmem_bt_lock);
 	if (backed == 0)
 		return ((void *)addr);
 
 	/*
 	 * Out of memory, not address space.  This may not even be
 	 * possible due to M_USE_RESERVE page allocation.
 	 */
 	if (wait & M_WAITOK)
 		vm_wait_domain(domain);
 
 	return (NULL);
 }
 
 /*
  * How many pages do we need to startup_alloc.
  */
 int
 vmem_startup_count(void)
 {
 
 	return (howmany(BT_MAXALLOC,
 	    UMA_SLAB_SPACE / sizeof(struct vmem_btag)));
 }
 #endif
 
 /*
  * One-time initialisation of the vmem subsystem: the arena list lock
  * and the UMA zones for arena headers and boundary tags.
  */
 void
 vmem_startup(void)
 {
 
 	mtx_init(&vmem_list_lock, "vmem list lock", NULL, MTX_DEF);
 
 	/* Zone for dynamically created arena headers. */
 	vmem_zone = uma_zcreate("vmem", sizeof(struct vmem),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM);
 
 	/* Zone for boundary tags; its slabs are never freed. */
 	vmem_bt_zone = uma_zcreate("vmem btag", sizeof(struct vmem_btag),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
 	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
 #ifndef UMA_MD_SMALL_ALLOC
 	mtx_init(&vmem_bt_lock, "btag lock", NULL, MTX_DEF);
 	uma_prealloc(vmem_bt_zone, BT_MAXALLOC);
 	/*
 	 * Keep enough tags in reserve to be able to allocate more tags.
 	 * Several CPUs may try to refill at once, so the reserve scales
 	 * with the CPU count to limit false restarts in UMA.
 	 */
 	uma_zone_reserve(vmem_bt_zone, BT_MAXALLOC * (mp_ncpus + 1) / 2);
 	uma_zone_set_allocf(vmem_bt_zone, vmem_bt_alloc);
 #endif
 }
 
 /* ---- rehash */
 
 /*
  * Replace the arena's busy-tag hash table with one of 'newhashsize'
  * buckets and move every busy tag across.  Returns 0 on success or
  * ENOMEM if the new table cannot be allocated without sleeping.
  */
 static int
 vmem_rehash(vmem_t *vm, vmem_size_t newhashsize)
 {
 	struct vmem_hashlist *newhashlist, *oldhashlist;
 	vmem_size_t oldhashsize;
 	bt_t *bt;
 	int i;
 
 	MPASS(newhashsize > 0);
 
 	/* Build the empty table before taking the arena lock. */
 	newhashlist = malloc(sizeof(struct vmem_hashlist) * newhashsize,
 	    M_VMEM, M_NOWAIT);
 	if (newhashlist == NULL)
 		return (ENOMEM);
 	for (i = 0; i < newhashsize; i++)
 		LIST_INIT(&newhashlist[i]);
 
 	VMEM_LOCK(vm);
 	oldhashsize = vm->vm_hashsize;
 	oldhashlist = vm->vm_hashlist;
 	vm->vm_hashsize = newhashsize;
 	vm->vm_hashlist = newhashlist;
 	if (oldhashlist != NULL) {
 		/* Re-insert each tag; bt_insbusy() now targets the new table. */
 		for (i = 0; i < oldhashsize; i++) {
 			for (;;) {
 				bt = LIST_FIRST(&oldhashlist[i]);
 				if (bt == NULL)
 					break;
 				bt_rembusy(vm, bt);
 				bt_insbusy(vm, bt);
 			}
 		}
 	}
 	VMEM_UNLOCK(vm);
 
 	/* The embedded initial table is part of the arena; never free it. */
 	if (oldhashlist != NULL && oldhashlist != vm->vm_hash0)
 		free(oldhashlist, M_VMEM);
 
 	return (0);
 }
 
 /*
  * Callout handler: queue the periodic maintenance task on the thread
  * taskqueue.
  */
 static void
 vmem_periodic_kick(void *dummy)
 {
 
 	taskqueue_enqueue(taskqueue_thread, &vmem_periodic_wk);
 }
 
 /*
  * Periodic maintenance task, run from the thread taskqueue.  For every
  * registered arena it optionally verifies the arena (DIAGNOSTIC),
  * resizes the busy-tag hash table to track the number of busy tags,
  * and wakes any threads sleeping on the arena.  It then re-arms the
  * callout that will queue the next run.
  */
 static void
 vmem_periodic(void *unused, int pending)
 {
 	vmem_t *vm;
 	vmem_size_t desired;
 	vmem_size_t current;
 
 	mtx_lock(&vmem_list_lock);
 	LIST_FOREACH(vm, &vmem_list, vm_alllist) {
 #ifdef DIAGNOSTIC
 		/* Convenient time to verify vmem state. */
 		if (enable_vmem_check == 1) {
 			VMEM_LOCK(vm);
 			vmem_check(vm);
 			VMEM_UNLOCK(vm);
 		}
 #endif
 		/*
 		 * Next power of two above the busy tag count.  The shift
 		 * is done in vmem_size_t: shifting a plain int 1 would
 		 * overflow once flsl() returns 31.
 		 */
 		desired = (vmem_size_t)1 << flsl(vm->vm_nbusytag);
 		desired = MIN(MAX(desired, VMEM_HASHSIZE_MIN),
 		    VMEM_HASHSIZE_MAX);
 		current = vm->vm_hashsize;
 
 		/*
 		 * Grow in powers of two.  Shrink less aggressively.
 		 * A failed rehash is harmless; the old table stays in use.
 		 */
 		if (desired >= current * 2 || desired * 4 <= current)
 			vmem_rehash(vm, desired);
 
 		/*
 		 * Periodically wake up threads waiting for resources,
 		 * so they could ask for reclamation again.
 		 */
 		VMEM_CONDVAR_BROADCAST(vm);
 	}
 	mtx_unlock(&vmem_list_lock);
 
 	callout_reset(&vmem_periodic_ch, vmem_periodic_interval,
 	    vmem_periodic_kick, NULL);
 }
 
 /*
  * SYSINIT hook: set up the periodic maintenance task and arm the
  * callout that queues it every ten seconds' worth of ticks.
  */
 static void
 vmem_start_callout(void *unused)
 {
 
 	vmem_periodic_interval = hz * 10;
 	TASK_INIT(&vmem_periodic_wk, 0, vmem_periodic, NULL);
 	callout_init(&vmem_periodic_ch, 1);
 	callout_reset(&vmem_periodic_ch, vmem_periodic_interval,
 	    vmem_periodic_kick, NULL);
 }
 SYSINIT(vfs, SI_SUB_CONFIGURE, SI_ORDER_ANY, vmem_start_callout, NULL);
 
 /*
  * Add the span [addr, addr + size) to the arena.  Two tags are used:
  * a span tag of the given type, and a free tag that covers the whole
  * span and is made available for allocation.  The caller must have
  * made sure that enough free tags are in stock.
  */
 static void
 vmem_add1(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, int type)
 {
 	bt_t *btspan, *btfree;
 
 	MPASS(type == BT_TYPE_SPAN || type == BT_TYPE_SPAN_STATIC);
 	MPASS((size & vm->vm_quantum_mask) == 0);
 
 	btspan = bt_alloc(vm);
 	btfree = bt_alloc(vm);
 
 	/* Tag describing the span itself. */
 	btspan->bt_start = addr;
 	btspan->bt_size = size;
 	btspan->bt_type = type;
 
 	/* Tag describing its (entirely free) contents. */
 	btfree->bt_start = addr;
 	btfree->bt_size = size;
 	btfree->bt_type = BT_TYPE_FREE;
 
 	bt_insseg_tail(vm, btspan);
 	bt_insseg(vm, btfree, btspan);
 	bt_insfree(vm, btfree);
 
 	vm->vm_size += size;
 }
 
 /*
  * Tear an arena down: destroy its quantum caches, release every
  * segment tag, free a dynamically allocated hash table, return all
  * free tags to UMA, destroy the synchronisation objects, and finally
  * hand the arena header back to the vmem zone.
  */
 static void
 vmem_destroy1(vmem_t *vm)
 {
 	bt_t *bt;
 
 	/* The quantum caches go first. */
 	qc_destroy(vm);
 
 	/* From here on the arena must hold no busy segments. */
 	VMEM_LOCK(vm);
 	MPASS(vm->vm_nbusytag == 0);
 
 	for (;;) {
 		bt = TAILQ_FIRST(&vm->vm_seglist);
 		if (bt == NULL)
 			break;
 		bt_remseg(vm, bt);
 	}
 
 	/* The embedded initial hash table is not separately allocated. */
 	if (vm->vm_hashlist != NULL && vm->vm_hashlist != vm->vm_hash0)
 		free(vm->vm_hashlist, M_VMEM);
 
 	/* Frees every stocked tag and drops the arena lock. */
 	bt_freetrim(vm, 0);
 
 	VMEM_CONDVAR_DESTROY(vm);
 	VMEM_LOCK_DESTROY(vm);
 	uma_zfree(vmem_zone, vm);
 }
 
 /*
  * Ask the arena's import function for a new span big enough for a
  * 'size' byte allocation with alignment 'align', and add it to the
  * arena.  Called with the arena lock held; the lock is dropped around
  * the import callback.  Returns 0 on success, EINVAL if the arena has
  * no import function, or ENOMEM if the size limit would be exceeded
  * or the import fails.
  */
 static int
 vmem_import(vmem_t *vm, vmem_size_t size, vmem_size_t align, int flags)
 {
 	vmem_addr_t addr;
 	int error;
 
 	if (vm->vm_importfn == NULL)
 		return (EINVAL);
 
 	/*
 	 * For anything beyond quantum alignment, pad the request with
 	 * twice the alignment so the span can hold an aligned block.
 	 * This slightly overestimates.
 	 */
 	if (align != vm->vm_quantum_mask + 1)
 		size += align * 2;
 	size = roundup(size, vm->vm_import_quantum);
 
 	if (vm->vm_limit != 0 && vm->vm_size + size > vm->vm_limit)
 		return (ENOMEM);
 
 	/*
 	 * Hide MAXALLOC tags while unlocked so we are guaranteed to be
 	 * able to add this span and the tag we want to allocate from it.
 	 */
 	MPASS(vm->vm_nfreetags >= BT_MAXALLOC);
 	vm->vm_nfreetags -= BT_MAXALLOC;
 	VMEM_UNLOCK(vm);
 	error = (*vm->vm_importfn)(vm->vm_arg, size, flags, &addr);
 	VMEM_LOCK(vm);
 	vm->vm_nfreetags += BT_MAXALLOC;
 	if (error != 0)
 		return (ENOMEM);
 
 	vmem_add1(vm, addr, size, BT_TYPE_SPAN);
 
 	return (0);
 }
 
 /*
  * vmem_fit: check if a bt can satisfy the given restrictions.
  *
  * it's a caller's responsibility to ensure the region is big enough
  * before calling us.
  *
  * The candidate address is clamped to [minaddr, maxaddr], aligned to
  * 'align' with offset 'phase', and moved up if the block would cross a
  * 'nocross' boundary.  On success 0 is returned and the chosen start
  * address is stored in *addrp.  Otherwise ENOMEM is returned and
  * *addrp is left untouched.
  */
 static int
 vmem_fit(const bt_t *bt, vmem_size_t size, vmem_size_t align,
     vmem_size_t phase, vmem_size_t nocross, vmem_addr_t minaddr,
     vmem_addr_t maxaddr, vmem_addr_t *addrp)
 {
 	vmem_addr_t start;
 	vmem_addr_t end;
 
 	MPASS(size > 0);
 	MPASS(bt->bt_size >= size); /* caller's responsibility */
 
 	/*
 	 * XXX assumption: vmem_addr_t and vmem_size_t are
 	 * unsigned integer of the same size.
 	 */
 
 	/* Intersect the tag's range with the caller's address window. */
 	start = bt->bt_start;
 	if (start < minaddr) {
 		start = minaddr;
 	}
 	end = BT_END(bt);
 	if (end > maxaddr)
 		end = maxaddr;
 	if (start > end) 
 		return (ENOMEM);
 
 	/* Round up to the requested alignment, honouring the phase. */
 	start = VMEM_ALIGNUP(start - phase, align) + phase;
 	/* If that landed before the tag, advance by one alignment unit. */
 	if (start < bt->bt_start)
 		start += align;
 	/* A block that would straddle a nocross boundary is moved up. */
 	if (VMEM_CROSS_P(start, start + size - 1, nocross)) {
 		MPASS(align < nocross);
 		start = VMEM_ALIGNUP(start - phase, nocross) + phase;
 	}
 	/* Accept only if [start, start + size - 1] still fits the window. */
 	if (start <= end && end - start >= size - 1) {
 		MPASS((start & (align - 1)) == phase);
 		MPASS(!VMEM_CROSS_P(start, start + size - 1, nocross));
 		MPASS(minaddr <= start);
 		MPASS(maxaddr == 0 || start + size - 1 <= maxaddr);
 		MPASS(bt->bt_start <= start);
 		MPASS(BT_END(bt) - start >= size - 1);
 		*addrp = start;
 
 		return (0);
 	}
 	return (ENOMEM);
 }
 
 /*
  * vmem_clip:  Trim the boundary tag edges to the requested start and size.
  *
  * 'bt' is a free tag that contains [start, start + size).  Any leading
  * part before 'start' is split off into a new free tag.  A trailing
  * remainder larger than the quantum mask stays free (in 'bt' itself),
  * and a new busy tag is created for the allocation; otherwise 'bt' is
  * turned into the busy tag as a whole.  Up to two tags are taken from
  * the free-tag stock.  The arena lock must be held.
  */
 static void
 vmem_clip(vmem_t *vm, bt_t *bt, vmem_addr_t start, vmem_size_t size)
 {
 	bt_t *btnew;
 	bt_t *btprev;
 
 	VMEM_ASSERT_LOCKED(vm);
 	MPASS(bt->bt_type == BT_TYPE_FREE);
 	MPASS(bt->bt_size >= size);
 	/* Off the freelist first: its size, and so its list, may change. */
 	bt_remfree(vm, bt);
 	if (bt->bt_start != start) {
 		/* Split the leading gap off as a free segment of its own. */
 		btprev = bt_alloc(vm);
 		btprev->bt_type = BT_TYPE_FREE;
 		btprev->bt_start = bt->bt_start;
 		btprev->bt_size = start - bt->bt_start;
 		bt->bt_start = start;
 		bt->bt_size -= btprev->bt_size;
 		bt_insfree(vm, btprev);
 		bt_insseg(vm, btprev,
 		    TAILQ_PREV(bt, vmem_seglist, bt_seglist));
 	}
 	MPASS(bt->bt_start == start);
 	if (bt->bt_size != size && bt->bt_size - size > vm->vm_quantum_mask) {
 		/* split */
 		/* btnew becomes the allocation; bt keeps the free tail. */
 		btnew = bt_alloc(vm);
 		btnew->bt_type = BT_TYPE_BUSY;
 		btnew->bt_start = bt->bt_start;
 		btnew->bt_size = size;
 		bt->bt_start = bt->bt_start + size;
 		bt->bt_size -= size;
 		bt_insfree(vm, bt);
 		bt_insseg(vm, btnew,
 		    TAILQ_PREV(bt, vmem_seglist, bt_seglist));
 		bt_insbusy(vm, btnew);
 		bt = btnew;
 	} else {
 		/* No usable remainder: the whole tag becomes busy. */
 		bt->bt_type = BT_TYPE_BUSY;
 		bt_insbusy(vm, bt);
 	}
 	MPASS(bt->bt_size >= size);
 	bt->bt_type = BT_TYPE_BUSY;
 }
 
 /* ---- vmem API */
 
 /*
  * Install the callbacks used to import spans into the arena and to
  * release them again, the opaque argument passed to both, and the
  * granularity to which import sizes are rounded up.
  */
 void
 vmem_set_import(vmem_t *vm, vmem_import_t *importfn,
     vmem_release_t *releasefn, void *arg, vmem_size_t import_quantum)
 {
 
 	VMEM_LOCK(vm);
 	vm->vm_importfn = importfn;
 	vm->vm_releasefn = releasefn;
 	vm->vm_arg = arg;
 	vm->vm_import_quantum = import_quantum;
 	VMEM_UNLOCK(vm);
 }
 
 /*
  * Set the ceiling on the arena's total size that imports must respect.
  * A limit of zero means unlimited.
  */
 void
 vmem_set_limit(vmem_t *vm, vmem_size_t limit)
 {
 
 	VMEM_LOCK(vm);
 	vm->vm_limit = limit;
 	VMEM_UNLOCK(vm);
 }
 
 /*
  * Install the callback that the allocator invokes to try to free up
  * space when an allocation cannot be satisfied.
  */
 void
 vmem_set_reclaim(vmem_t *vm, vmem_reclaim_t *reclaimfn)
 {
 
 	VMEM_LOCK(vm);
 	vm->vm_reclaimfn = reclaimfn;
 	VMEM_UNLOCK(vm);
 }
 
 /*
  * vmem_init: initialise the arena header 'vm' in place.
  *
  * 'quantum' must be a non-zero power of two.  When 'size' is non-zero
  * the span [base, base + size) is added as the initial static span.
  * Returns 'vm' on success.  If adding the initial span fails, the
  * arena is torn down again with vmem_destroy1() and NULL is returned.
  */
 vmem_t *
 vmem_init(vmem_t *vm, const char *name, vmem_addr_t base, vmem_size_t size,
     vmem_size_t quantum, vmem_size_t qcache_max, int flags)
 {
 	int i;
 
 	MPASS(quantum > 0);
 	MPASS((quantum & (quantum - 1)) == 0);
 
 	bzero(vm, sizeof(*vm));
 
 	/* Synchronisation objects and identity. */
 	VMEM_CONDVAR_INIT(vm, name);
 	VMEM_LOCK_INIT(vm, name);
 	vm->vm_nfreetags = 0;
 	LIST_INIT(&vm->vm_freetags);
 	strlcpy(vm->vm_name, name, sizeof(vm->vm_name));
 
 	/* Quantum geometry and accounting. */
 	vm->vm_quantum_mask = quantum - 1;
 	vm->vm_quantum_shift = flsl(quantum) - 1;
 	vm->vm_nbusytag = 0;
 	vm->vm_size = 0;
 	vm->vm_limit = 0;
 	vm->vm_inuse = 0;
 	qc_init(vm, qcache_max);
 
 	/* Segment list, size-class freelists and initial hash table. */
 	TAILQ_INIT(&vm->vm_seglist);
 	for (i = 0; i < VMEM_MAXORDER; i++)
 		LIST_INIT(&vm->vm_freelist[i]);
 	memset(&vm->vm_hash0, 0, sizeof(vm->vm_hash0));
 	vm->vm_hashsize = VMEM_HASHSIZE_MIN;
 	vm->vm_hashlist = vm->vm_hash0;
 
 	if (size != 0 && vmem_add(vm, base, size, flags) != 0) {
 		vmem_destroy1(vm);
 		return (NULL);
 	}
 
 	/* Make the arena visible to the periodic task and the debugger. */
 	mtx_lock(&vmem_list_lock);
 	LIST_INSERT_HEAD(&vmem_list, vm, vm_alllist);
 	mtx_unlock(&vmem_list_lock);
 
 	return (vm);
 }
 
 /*
  * vmem_create: allocate an arena header from the vmem zone and
  * initialise it.  Returns the new arena, or NULL if either the
  * allocation or the initialisation fails.
  */
 vmem_t *
 vmem_create(const char *name, vmem_addr_t base, vmem_size_t size,
     vmem_size_t quantum, vmem_size_t qcache_max, int flags)
 {
 	vmem_t *vm;
 
 	/* Only the wait disposition is meaningful to UMA here. */
 	vm = uma_zalloc(vmem_zone, flags & (M_WAITOK|M_NOWAIT));
 	if (vm == NULL)
 		return (NULL);
 
 	/* vmem_init() yields vm itself on success and NULL on failure. */
 	return (vmem_init(vm, name, base, size, quantum, qcache_max, flags));
 }
 
 /*
  * Destroy an arena: unlink it from the global arena list, then tear it
  * down with vmem_destroy1().
  */
 void
 vmem_destroy(vmem_t *vm)
 {
 
 	/* Unlink from the list walked by the periodic task. */
 	mtx_lock(&vmem_list_lock);
 	LIST_REMOVE(vm, vm_alllist);
 	mtx_unlock(&vmem_list_lock);
 
 	vmem_destroy1(vm);
 }
 
 /*
  * Round 'size' up to the next multiple of the arena's quantum.
  */
 vmem_size_t
 vmem_roundup_size(vmem_t *vm, vmem_size_t size)
 {
 	const vmem_size_t mask = vm->vm_quantum_mask;
 
 	return ((size + mask) & ~mask);
 }
 
 /*
  * vmem_alloc: allocate 'size' bytes of resource from the arena.
  *
  * Requests no bigger than the quantum cache limit are served from the
  * matching UMA cache; bigger ones go to vmem_xalloc() with no extra
  * constraints.  Returns 0 with the address in *addrp, or ENOMEM.
  */
 int
 vmem_alloc(vmem_t *vm, vmem_size_t size, int flags, vmem_addr_t *addrp)
 {
 	const int strat __unused = flags & VMEM_FITMASK;
 	qcache_t *qc;
 
 	flags &= VMEM_FLAGS;
 	MPASS(size > 0);
 	MPASS(strat == M_BESTFIT || strat == M_FIRSTFIT);
 	if ((flags & M_NOWAIT) == 0)
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "vmem_alloc");
 
 	/* Too big for the quantum caches: use the arena proper. */
 	if (size > vm->vm_qcache_max)
 		return (vmem_xalloc(vm, size, 0, 0, 0, VMEM_ADDR_MIN,
 		    VMEM_ADDR_MAX, flags, addrp));
 
 	qc = &vm->vm_qcache[(size - 1) >> vm->vm_quantum_shift];
 	*addrp = (vmem_addr_t)uma_zalloc(qc->qc_cache, flags);
 
 	return (*addrp == 0 ? ENOMEM : 0);
 }
 
 /*
  * vmem_xalloc: allocate resource from the arena subject to constraints.
  *
  * size0 is rounded up to the arena quantum.  'align' and 'nocross'
  * must be zero or a power of two that is a multiple of the quantum,
  * 'phase' must be a multiple of the quantum and smaller than 'align',
  * and the result must lie within [minaddr, maxaddr].  An 'align' of
  * zero means quantum alignment.
  *
  * Returns 0 with the address stored in *addrp.  With M_NOWAIT, ENOMEM
  * is returned when no space or boundary tags can be obtained.  Without
  * M_NOWAIT the function sleeps on the arena until space appears.
  */
 int
 vmem_xalloc(vmem_t *vm, const vmem_size_t size0, vmem_size_t align,
     const vmem_size_t phase, const vmem_size_t nocross,
     const vmem_addr_t minaddr, const vmem_addr_t maxaddr, int flags,
     vmem_addr_t *addrp)
 {
 	const vmem_size_t size = vmem_roundup_size(vm, size0);
 	struct vmem_freelist *list;
 	struct vmem_freelist *first;
 	struct vmem_freelist *end;
 	vmem_size_t avail;
 	bt_t *bt;
 	int error;
 	int strat;
 
 	flags &= VMEM_FLAGS;
 	strat = flags & VMEM_FITMASK;
 	MPASS(size0 > 0);
 	MPASS(size > 0);
 	MPASS(strat == M_BESTFIT || strat == M_FIRSTFIT);
 	MPASS((flags & (M_NOWAIT|M_WAITOK)) != (M_NOWAIT|M_WAITOK));
 	if ((flags & M_NOWAIT) == 0)
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "vmem_xalloc");
 	MPASS((align & vm->vm_quantum_mask) == 0);
 	MPASS((align & (align - 1)) == 0);
 	MPASS((phase & vm->vm_quantum_mask) == 0);
 	MPASS((nocross & vm->vm_quantum_mask) == 0);
 	MPASS((nocross & (nocross - 1)) == 0);
 	MPASS((align == 0 && phase == 0) || phase < align);
 	MPASS(nocross == 0 || nocross >= size);
 	MPASS(minaddr <= maxaddr);
 	MPASS(!VMEM_CROSS_P(phase, phase + size - 1, nocross));
 
 	/* No explicit alignment means "aligned to the quantum". */
 	if (align == 0)
 		align = vm->vm_quantum_mask + 1;
 
 	*addrp = 0;
 	/* One past the last freelist; terminates the scan below. */
 	end = &vm->vm_freelist[VMEM_MAXORDER];
 	/*
 	 * choose a free block from which we allocate.
 	 */
 	first = bt_freehead_toalloc(vm, size, strat);
 	VMEM_LOCK(vm);
 	for (;;) {
 		/*
 		 * Make sure we have enough tags to complete the
 		 * operation.
 		 */
 		if (vm->vm_nfreetags < BT_MAXALLOC &&
 		    bt_fill(vm, flags) != 0) {
 			error = ENOMEM;
 			break;
 		}
 		/*
 		 * Scan freelists looking for a tag that satisfies the
 		 * allocation.  If we're doing BESTFIT we may encounter
 		 * sizes below the request.  If we're doing FIRSTFIT we
 		 * inspect only the first element from each list.
 		 */
 		for (list = first; list < end; list++) {
 			LIST_FOREACH(bt, list, bt_freelist) {
 				if (bt->bt_size >= size) {
 					error = vmem_fit(bt, size, align, phase,
 					    nocross, minaddr, maxaddr, addrp);
 					if (error == 0) {
 						/* Carve it out; error is 0 at 'out'. */
 						vmem_clip(vm, bt, *addrp, size);
 						goto out;
 					}
 				}
 				/* FIRST skips to the next list. */
 				if (strat == M_FIRSTFIT)
 					break;
 			}
 		}
 		/*
 		 * Retry if the fast algorithm failed.
 		 */
 		if (strat == M_FIRSTFIT) {
 			strat = M_BESTFIT;
 			first = bt_freehead_toalloc(vm, size, strat);
 			continue;
 		}
 		/*
 		 * XXX it is possible to fail to meet restrictions with the
 		 * imported region.  It is up to the user to specify the
 		 * import quantum such that it can satisfy any allocation.
 		 */
 		if (vmem_import(vm, size, align, flags) == 0)
 			continue;
 
 		/*
 		 * Try to free some space from the quantum cache or reclaim
 		 * functions if available.
 		 */
 		if (vm->vm_qcache_max != 0 || vm->vm_reclaimfn != NULL) {
 			/* Remember the free space to detect progress. */
 			avail = vm->vm_size - vm->vm_inuse;
 			VMEM_UNLOCK(vm);
 			if (vm->vm_qcache_max != 0)
 				qc_drain(vm);
 			if (vm->vm_reclaimfn != NULL)
 				vm->vm_reclaimfn(vm, flags);
 			VMEM_LOCK(vm);
 			/* If we were successful retry even NOWAIT. */
 			if (vm->vm_size - vm->vm_inuse > avail)
 				continue;
 		}
 		if ((flags & M_NOWAIT) != 0) {
 			error = ENOMEM;
 			break;
 		}
 		/* Sleep until a free or the periodic task wakes us. */
 		VMEM_CONDVAR_WAIT(vm);
 	}
 out:
 	VMEM_UNLOCK(vm);
 	/* A sleeping allocation is not expected to fail. */
 	if (error != 0 && (flags & M_NOWAIT) == 0)
 		panic("failed to allocate waiting allocation\n");
 
 	return (error);
 }
 
 /*
  * vmem_free: give [addr, addr + size) back to the arena.  Sizes within
  * the quantum cache limit are returned to the matching UMA cache;
  * everything else goes straight to vmem_xfree().
  */
 void
 vmem_free(vmem_t *vm, vmem_addr_t addr, vmem_size_t size)
 {
 	qcache_t *qc;
 
 	MPASS(size > 0);
 
 	if (size > vm->vm_qcache_max) {
 		vmem_xfree(vm, addr, size);
 		return;
 	}
 
 	qc = &vm->vm_qcache[(size - 1) >> vm->vm_quantum_shift];
 	uma_zfree(qc->qc_cache, (void *)addr);
 }
 
 /*
  * vmem_xfree: free a segment back to the arena, bypassing the quantum
  * caches.
  *
  * The busy tag starting at 'addr' is looked up, marked free and merged
  * with free neighbours on either side.  If the result covers an entire
  * imported span and the arena has a release function, the span is
  * removed from the arena and passed to that function.  Otherwise the
  * segment is put on a freelist.  Waiters are woken either way, and the
  * free-tag stock is trimmed to BT_MAXFREE.
  */
 void
 vmem_xfree(vmem_t *vm, vmem_addr_t addr, vmem_size_t size)
 {
 	bt_t *bt;
 	bt_t *t;
 
 	MPASS(size > 0);
 
 	VMEM_LOCK(vm);
 	bt = bt_lookupbusy(vm, addr);
 	MPASS(bt != NULL);
 	MPASS(bt->bt_start == addr);
 	MPASS(bt->bt_size == vmem_roundup_size(vm, size) ||
 	    bt->bt_size - vmem_roundup_size(vm, size) <= vm->vm_quantum_mask);
 	MPASS(bt->bt_type == BT_TYPE_BUSY);
 	bt_rembusy(vm, bt);
 	bt->bt_type = BT_TYPE_FREE;
 
 	/* coalesce */
 	/* Absorb a free successor into bt. */
 	t = TAILQ_NEXT(bt, bt_seglist);
 	if (t != NULL && t->bt_type == BT_TYPE_FREE) {
 		MPASS(BT_END(bt) < t->bt_start);	/* YYY */
 		bt->bt_size += t->bt_size;
 		bt_remfree(vm, t);
 		bt_remseg(vm, t);
 	}
 	/* Absorb a free predecessor; bt then starts where it started. */
 	t = TAILQ_PREV(bt, vmem_seglist, bt_seglist);
 	if (t != NULL && t->bt_type == BT_TYPE_FREE) {
 		MPASS(BT_END(t) < bt->bt_start);	/* YYY */
 		bt->bt_size += t->bt_size;
 		bt->bt_start = t->bt_start;
 		bt_remfree(vm, t);
 		bt_remseg(vm, t);
 	}
 
 	/* What now precedes bt is either its span tag or a busy tag. */
 	t = TAILQ_PREV(bt, vmem_seglist, bt_seglist);
 	MPASS(t != NULL);
 	MPASS(BT_ISSPAN_P(t) || t->bt_type == BT_TYPE_BUSY);
 	if (vm->vm_releasefn != NULL && t->bt_type == BT_TYPE_SPAN &&
 	    t->bt_size == bt->bt_size) {
 		vmem_addr_t spanaddr;
 		vmem_size_t spansize;
 
 		/* The imported span is entirely free: hand it back. */
 		MPASS(t->bt_start == bt->bt_start);
 		spanaddr = bt->bt_start;
 		spansize = bt->bt_size;
 		bt_remseg(vm, bt);
 		bt_remseg(vm, t);
 		vm->vm_size -= spansize;
 		VMEM_CONDVAR_BROADCAST(vm);
 		/* bt_freetrim() drops the arena lock before returning. */
 		bt_freetrim(vm, BT_MAXFREE);
 		(*vm->vm_releasefn)(vm->vm_arg, spanaddr, spansize);
 	} else {
 		bt_insfree(vm, bt);
 		VMEM_CONDVAR_BROADCAST(vm);
 		/* bt_freetrim() drops the arena lock before returning. */
 		bt_freetrim(vm, BT_MAXFREE);
 	}
 }
 
 /*
  * vmem_add: add the static span [addr, addr + size) to the arena.
  *
  * Returns 0 on success, or ENOMEM if the boundary tags needed to
  * describe the span cannot be obtained.
  */
 int
 vmem_add(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, int flags)
 {
 	int error;
 
 	flags &= VMEM_FLAGS;
 
 	VMEM_LOCK(vm);
 	/* Refill the tag stock only if it is running short. */
 	if (vm->vm_nfreetags < BT_MAXALLOC && bt_fill(vm, flags) != 0) {
 		error = ENOMEM;
 	} else {
 		vmem_add1(vm, addr, size, BT_TYPE_SPAN_STATIC);
 		error = 0;
 	}
 	VMEM_UNLOCK(vm);
 
 	return (error);
 }
 
 /*
  * vmem_size: report size information about the arena.
  *
  * 'typemask' selects the figure: VMEM_ALLOC gives the bytes in use,
  * VMEM_FREE the bytes not in use, both together the total size, and
  * VMEM_MAXFREE the size that the highest non-empty freelist stands
  * for (0 if every freelist is empty).  Any other value panics.
  */
 vmem_size_t
 vmem_size(vmem_t *vm, int typemask)
 {
 	int i;
 
 	switch (typemask) {
 	case VMEM_ALLOC:
 		return (vm->vm_inuse);
 	case VMEM_FREE:
 		return (vm->vm_size - vm->vm_inuse);
 	case VMEM_FREE|VMEM_ALLOC:
 		return (vm->vm_size);
 	case VMEM_MAXFREE:
 		/* Find the highest-order freelist that has an entry. */
 		VMEM_LOCK(vm);
 		for (i = VMEM_MAXORDER - 1; i >= 0; i--) {
 			if (!LIST_EMPTY(&vm->vm_freelist[i]))
 				break;
 		}
 		VMEM_UNLOCK(vm);
 		if (i < 0)
 			return (0);
 		return ((vmem_size_t)ORDER2SIZE(i) <<
 		    vm->vm_quantum_shift);
 	default:
 		panic("vmem_size");
 	}
 }
 
 /* ---- debug */
 
 #if defined(DDB) || defined(DIAGNOSTIC)
 
 static void bt_dump(const bt_t *, int (*)(const char *, ...)
     __printflike(1, 2));
 
 /*
  * Human-readable name of a boundary tag type, for debug output.
  * Unknown values yield "BOGUS".
  */
 static const char *
 bt_type_string(int type)
 {
 	const char *name;
 
 	name = "BOGUS";
 	switch (type) {
 	case BT_TYPE_BUSY:
 		name = "busy";
 		break;
 	case BT_TYPE_FREE:
 		name = "free";
 		break;
 	case BT_TYPE_SPAN:
 		name = "span";
 		break;
 	case BT_TYPE_SPAN_STATIC:
 		name = "static span";
 		break;
 	default:
 		break;
 	}
 
 	return (name);
 }
 
 static void
 bt_dump(const bt_t *bt, int (*pr)(const char *, ...))
 {
 
 	(*pr)("\t%p: %jx %jx, %d(%s)\n",
 	    bt, (intmax_t)bt->bt_start, (intmax_t)bt->bt_size,
 	    bt->bt_type, bt_type_string(bt->bt_type));
 }
 
 /*
  * Print an arena through 'pr': first every tag on the segment list in
  * order, then the contents of each non-empty freelist.
  */
 static void
 vmem_dump(const vmem_t *vm , int (*pr)(const char *, ...) __printflike(1, 2))
 {
 	const struct vmem_freelist *fl;
 	const bt_t *bt;
 	int i;
 
 	(*pr)("vmem %p '%s'\n", vm, vm->vm_name);
 	TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist)
 		bt_dump(bt, pr);
 
 	for (i = 0; i < VMEM_MAXORDER; i++) {
 		fl = &vm->vm_freelist[i];
 		if (!LIST_EMPTY(fl)) {
 			(*pr)("freelist[%d]\n", i);
 			LIST_FOREACH(bt, fl, bt_freelist)
 				bt_dump(bt, pr);
 		}
 	}
 }
 
 #endif /* defined(DDB) || defined(DIAGNOSTIC) */
 
 #if defined(DDB)
 #include <ddb/ddb.h>
 
 /*
  * Find the non-span tag (busy or free) whose range contains 'addr' by
  * walking the segment list.  Returns NULL if there is none.
  */
 static bt_t *
 vmem_whatis_lookup(vmem_t *vm, vmem_addr_t addr)
 {
 	bt_t *bt;
 
 	TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
 		/* Span tags only bracket their contents; skip them. */
 		if (!BT_ISSPAN_P(bt) &&
 		    bt->bt_start <= addr && addr <= BT_END(bt))
 			return (bt);
 	}
 
 	return (NULL);
 }
 
 /*
  * For each arena that has a segment containing 'addr', print through
  * 'pr' the segment's start, the offset of 'addr' within it, the arena
  * name, and whether the segment is allocated or free.
  */
 void
 vmem_whatis(vmem_addr_t addr, int (*pr)(const char *, ...))
 {
 	vmem_t *vm;
 	bt_t *bt;
 
 	LIST_FOREACH(vm, &vmem_list, vm_alllist) {
 		bt = vmem_whatis_lookup(vm, addr);
 		if (bt != NULL) {
 			(*pr)("%p is %p+%zu in VMEM '%s' (%s)\n",
 			    (void *)addr, (void *)bt->bt_start,
 			    (vmem_size_t)(addr - bt->bt_start), vm->vm_name,
 			    (bt->bt_type == BT_TYPE_BUSY) ?
 			    "allocated" : "free");
 		}
 	}
 }
 
 void
 vmem_printall(const char *modif, int (*pr)(const char *, ...))
 {
 	const vmem_t *vm;
 
 	LIST_FOREACH(vm, &vmem_list, vm_alllist) {
 		vmem_dump(vm, pr);
 	}
 }
 
 /*
  * Dump the arena whose header lives at 'addr' through 'pr'.  'modif'
  * is unused.
  */
 void
 vmem_print(vmem_addr_t addr, const char *modif, int (*pr)(const char *, ...))
 {
 
 	/* The caller's address is taken to be an arena header as is. */
 	vmem_dump((const vmem_t *)(const void *)addr, pr);
 }
 
 /*
  * ddb "show vmemdump <addr>": dump the arena at the given address.
  */
 DB_SHOW_COMMAND(vmemdump, vmemdump)
 {
 
 	if (have_addr) {
 		vmem_dump((const vmem_t *)addr, db_printf);
 		return;
 	}
 
 	db_printf("usage: show vmemdump <addr>\n");
 }
 
 DB_SHOW_ALL_COMMAND(vmemdump, vmemdumpall)
 {
 	const vmem_t *vm;
 
 	LIST_FOREACH(vm, &vmem_list, vm_alllist)
 		vmem_dump(vm, db_printf);
 }
 
 /*
  * ddb "show vmem <addr>": print a summary of the arena at the given
  * address.  The overall counters come first, then one line per size
  * order that has busy or free segments, with the count and byte total
  * of each.
  */
 DB_SHOW_COMMAND(vmem, vmem_summ)
 {
 	const vmem_t *vm = (const void *)addr;
 	const bt_t *bt;
 	/* Per-order tag counts (ft, ut) and byte totals (fs, us). */
 	size_t ft[VMEM_MAXORDER], ut[VMEM_MAXORDER];
 	size_t fs[VMEM_MAXORDER], us[VMEM_MAXORDER];
 	int ord;
 
 	if (!have_addr) {
 		db_printf("usage: show vmem <addr>\n");
 		return;
 	}
 
 	db_printf("vmem %p '%s'\n", vm, vm->vm_name);
 	db_printf("\tquantum:\t%zu\n", vm->vm_quantum_mask + 1);
 	db_printf("\tsize:\t%zu\n", vm->vm_size);
 	db_printf("\tinuse:\t%zu\n", vm->vm_inuse);
 	db_printf("\tfree:\t%zu\n", vm->vm_size - vm->vm_inuse);
 	db_printf("\tbusy tags:\t%d\n", vm->vm_nbusytag);
 	db_printf("\tfree tags:\t%d\n", vm->vm_nfreetags);
 
 	memset(ft, 0, sizeof(ft));
 	memset(ut, 0, sizeof(ut));
 	memset(fs, 0, sizeof(fs));
 	memset(us, 0, sizeof(us));
 
 	/* Bucket every busy and free segment by its size order. */
 	TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
 		ord = SIZE2ORDER(bt->bt_size >> vm->vm_quantum_shift);
 		switch (bt->bt_type) {
 		case BT_TYPE_BUSY:
 			ut[ord]++;
 			us[ord] += bt->bt_size;
 			break;
 		case BT_TYPE_FREE:
 			ft[ord]++;
 			fs[ord] += bt->bt_size;
 			break;
 		default:
 			break;
 		}
 	}
 
 	db_printf("\t\t\tinuse\tsize\t\tfree\tsize\n");
 	for (ord = 0; ord < VMEM_MAXORDER; ord++) {
 		if (ut[ord] != 0 || ft[ord] != 0) {
 			db_printf("\t%-15zu %zu\t%-15zu %zu\t%-16zu\n",
 			    ORDER2SIZE(ord) << vm->vm_quantum_shift,
 			    ut[ord], us[ord], ft[ord], fs[ord]);
 		}
 	}
 }
 
 DB_SHOW_ALL_COMMAND(vmem, vmem_summall)
 {
 	const vmem_t *vm;
 
 	LIST_FOREACH(vm, &vmem_list, vm_alllist)
 		vmem_summ((db_expr_t)vm, TRUE, count, modif);
 }
 #endif /* defined(DDB) */
 
 #define vmem_printf printf
 
 #if defined(DIAGNOSTIC)
 
 /*
  * Consistency check over the arena's segment list.  Reports and
  * returns false if a tag's start lies beyond its end, or if two tags
  * of the same class (both spans, or both non-spans) overlap.  Returns
  * true otherwise.
  */
 static bool
 vmem_check_sanity(vmem_t *vm)
 {
 	const bt_t *bt, *bt2;
 
 	MPASS(vm != NULL);
 
 	/* Pass 1: every tag must itself be well formed. */
 	TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
 		if (bt->bt_start <= BT_END(bt))
 			continue;
 		printf("corrupted tag\n");
 		bt_dump(bt, vmem_printf);
 		return (false);
 	}
 
 	/* Pass 2: pairwise overlap test within each class of tag. */
 	TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
 		TAILQ_FOREACH(bt2, &vm->vm_seglist, bt_seglist) {
 			if (bt == bt2 ||
 			    BT_ISSPAN_P(bt) != BT_ISSPAN_P(bt2))
 				continue;
 			/* Disjoint ranges are fine. */
 			if (bt->bt_start > BT_END(bt2) ||
 			    bt2->bt_start > BT_END(bt))
 				continue;
 			printf("overwrapped tags\n");
 			bt_dump(bt, vmem_printf);
 			bt_dump(bt2, vmem_printf);
 			return (false);
 		}
 	}
 
 	return (true);
 }
 
 /*
  * Panic if the arena fails the consistency check.
  */
 static void
 vmem_check(vmem_t *vm)
 {
 
 	if (vmem_check_sanity(vm))
 		return;
 
 	panic("insanity vmem %p", vm);
 }
 
 #endif /* defined(DIAGNOSTIC) */
Index: head/sys/net/bpf_jitter.c
===================================================================
--- head/sys/net/bpf_jitter.c	(revision 335067)
+++ head/sys/net/bpf_jitter.c	(revision 335068)
@@ -1,119 +1,121 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 2002-2003 NetGroup, Politecnico di Torino (Italy)
  * Copyright (C) 2005-2017 Jung-uk Kim <jkim@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  * notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  * notice, this list of conditions and the following disclaimer in the
  * documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the Politecnico di Torino nor the names of its
  * contributors may be used to endorse or promote products derived from
  * this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #ifdef _KERNEL
 #include "opt_bpf.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/sysctl.h>
 #else
 #include <stdlib.h>
 #include <sys/mman.h>
 #include <sys/param.h>
 #include <sys/types.h>
 #endif
 
 #include <net/bpf.h>
 #include <net/bpf_jitter.h>
 
 static u_int	bpf_jit_accept_all(u_char *, u_int, u_int);
 
 #ifdef _KERNEL
 MALLOC_DEFINE(M_BPFJIT, "BPF_JIT", "BPF JIT compiler");
 
 SYSCTL_NODE(_net, OID_AUTO, bpf_jitter, CTLFLAG_RW, 0, "BPF JIT compiler");
 int bpf_jitter_enable = 1;
 SYSCTL_INT(_net_bpf_jitter, OID_AUTO, enable, CTLFLAG_RW,
     &bpf_jitter_enable, 0, "enable BPF JIT compiler");
 #endif
 
 /*
  * Build a native filter from the BPF program fp of nins instructions.
  *
  * Returns a newly allocated bpf_jit_filter, or NULL when either the filter
  * structure cannot be allocated or bpf_jit_compile() fails.  A NULL or
  * empty program yields a filter whose func is bpf_jit_accept_all.  The
  * result is released with bpf_destroy_jit_filter().
  */
 bpf_jit_filter *
 bpf_jitter(struct bpf_insn *fp, int nins)
 {
 	bpf_jit_filter *filter;
 
 	/* Allocate the filter structure. */
 #ifdef _KERNEL
 	filter = (struct bpf_jit_filter *)malloc(sizeof(*filter),
 	    M_BPFJIT, M_NOWAIT);
 #else
 	filter = (struct bpf_jit_filter *)malloc(sizeof(*filter));
 #endif
 	if (filter == NULL)
 		return (NULL);
 
 	/*
 	 * No filter means accept all.  Nothing is compiled in this case, so
 	 * record a size of zero rather than handing back an uninitialized
 	 * field.
 	 */
 	if (fp == NULL || nins == 0) {
 		filter->func = bpf_jit_accept_all;
 		filter->size = 0;
 		return (filter);
 	}
 
 	/* Create the binary; on failure release the structure again. */
 	if ((filter->func = bpf_jit_compile(fp, nins, &filter->size)) == NULL) {
 #ifdef _KERNEL
 		free(filter, M_BPFJIT);
 #else
 		free(filter);
 #endif
 		return (NULL);
 	}
 
 	return (filter);
 }
 
 void
 bpf_destroy_jit_filter(bpf_jit_filter *filter)
 {
 
-	if (filter->func != bpf_jit_accept_all)
-		bpf_jit_free(filter->func, filter->size);
 #ifdef _KERNEL
+	if (filter->func != bpf_jit_accept_all)
+		free(filter->func, M_BPFJIT);
 	free(filter, M_BPFJIT);
 #else
+	if (filter->func != bpf_jit_accept_all)
+		munmap(filter->func, filter->size);
 	free(filter);
 #endif
 }
 
 /*
  * Filter function installed by bpf_jitter() when there is no program to
  * compile: ignores its arguments and returns the largest u_int value.
  */
 static u_int
 bpf_jit_accept_all(__unused u_char *p, __unused u_int wirelen,
     __unused u_int buflen)
 {
 
 	return (~(u_int)0);
 }
Index: head/sys/net/bpf_jitter.h
===================================================================
--- head/sys/net/bpf_jitter.h	(revision 335067)
+++ head/sys/net/bpf_jitter.h	(revision 335068)
@@ -1,93 +1,92 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 2002-2003 NetGroup, Politecnico di Torino (Italy)
  * Copyright (C) 2005-2009 Jung-uk Kim <jkim@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  * notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  * notice, this list of conditions and the following disclaimer in the
  * documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the Politecnico di Torino nor the names of its
  * contributors may be used to endorse or promote products derived from
  * this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _NET_BPF_JITTER_H_
 #define _NET_BPF_JITTER_H_
 
 #ifdef _KERNEL
 MALLOC_DECLARE(M_BPFJIT);
 #endif
 
 extern int bpf_jitter_enable;
 
 /*
  * Prototype of a filtering function created by the jitter.
  *
  * The syntax and the meaning of the parameters are analogous to those of
  * bpf_filter(). Notice that the filter is not among the parameters because
  * it is hardwired in the function.
  */
 typedef u_int (*bpf_filter_func)(u_char *, u_int, u_int);
 
 /* Structure describing a native filtering program created by the jitter. */
 typedef struct bpf_jit_filter {
 	/* The native filtering binary, in the form of a bpf_filter_func. */
 	bpf_filter_func	func;
 	/*
 	 * Size associated with func; bpf_jitter() passes the address of this
 	 * member to bpf_jit_compile() to be filled in.  Presumably the length
 	 * in bytes of the generated code -- confirm against bpf_jit_compile().
 	 */
 	size_t		size;
 } bpf_jit_filter;
 
 /*
  * BPF jitter, builds a machine function from a BPF program.
  *
  * param fp	The BPF pseudo-assembly filter that will be translated
  *		into native code.
  * param nins	Number of instructions of the input filter.
  * return	The bpf_jit_filter structure containing the native filtering
  *		binary.
  *
  * bpf_jitter allocates the buffers for the new native filter and
  * then translates the program pointed by fp calling bpf_jit_compile().
  */
 bpf_jit_filter	*bpf_jitter(struct bpf_insn *fp, int nins);
 
 /*
  * Deletes a filtering function that was previously created by bpf_jitter().
  *
  * param filter	The filter to destroy.
  *
  * This function frees the various buffers (code, memory, etc.) associated
  * with a filtering function.
  */
 void		bpf_destroy_jit_filter(bpf_jit_filter *filter);
 
 /*
  * Declarations for machine-dependent functions.
  */
 struct bpf_insn;
 
 bpf_filter_func	bpf_jit_compile(struct bpf_insn *, u_int, size_t *);
-void		bpf_jit_free(void *, size_t);
 
 #endif	/* _NET_BPF_JITTER_H_ */
Index: head/sys/sys/malloc.h
===================================================================
--- head/sys/sys/malloc.h	(revision 335067)
+++ head/sys/sys/malloc.h	(revision 335068)
@@ -1,274 +1,275 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1987, 1993
  *	The Regents of the University of California.
  * Copyright (c) 2005, 2009 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)malloc.h	8.5 (Berkeley) 5/3/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_MALLOC_H_
 #define	_SYS_MALLOC_H_
 
 #include <sys/param.h>
 #ifdef _KERNEL
 #include <sys/systm.h>
 #endif
 #include <sys/queue.h>
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
 #include <machine/_limits.h>
 
 #define	MINALLOCSIZE	UMA_SMALLEST_UNIT
 
 /*
- * flags to malloc.
+ * Flags to memory allocation functions.
  */
 #define	M_NOWAIT	0x0001		/* do not block */
 #define	M_WAITOK	0x0002		/* ok to block */
 #define	M_ZERO		0x0100		/* bzero the allocation */
 #define	M_NOVM		0x0200		/* don't ask VM for pages */
 #define	M_USE_RESERVE	0x0400		/* can alloc out of reserve memory */
 #define	M_NODUMP	0x0800		/* don't dump pages in this allocation */
 #define	M_FIRSTFIT	0x1000		/* Only for vmem, fast fit. */
 #define	M_BESTFIT	0x2000		/* Only for vmem, low fragmentation. */
+#define	M_EXEC		0x4000		/* allocate executable space. */
 
 #define	M_MAGIC		877983977	/* time when first defined :-) */
 
 #ifdef INVARIANTS
 #define	M_ZERO_INVARIANTS		M_ZERO
 #else
 #define	M_ZERO_INVARIANTS		0
 #endif
 
 
 /*
  * Two malloc type structures are present: malloc_type, which is used by a
  * type owner to declare the type, and malloc_type_internal, which holds
  * malloc-owned statistics and other ABI-sensitive fields, such as the set of
  * malloc statistics indexed by the compile-time MAXCPU constant.
  * Applications should avoid introducing dependence on the allocator private
  * data layout and size.
  *
  * The malloc_type ks_next field is protected by malloc_mtx.  Other fields in
  * malloc_type are static after initialization so unsynchronized.
  *
  * Statistics in malloc_type_stats are written only when holding a critical
  * section and running on the CPU associated with the index into the stat
  * array, but read lock-free resulting in possible (minor) races, which the
  * monitoring app should take into account.
  */
 struct malloc_type_stats {
 	uint64_t	mts_memalloced;	/* Bytes allocated on CPU. */
 	uint64_t	mts_memfreed;	/* Bytes freed on CPU. */
 	uint64_t	mts_numallocs;	/* Number of allocates on CPU. */
 	uint64_t	mts_numfrees;	/* Number of frees on CPU. */
 	uint64_t	mts_size;	/* Bitmask of sizes allocated on CPU. */
 	uint64_t	_mts_reserved1;	/* Reserved field. */
 	uint64_t	_mts_reserved2;	/* Reserved field. */
 	uint64_t	_mts_reserved3;	/* Reserved field. */
 };
 
 /*
  * Index definitions for the mti_probes[] array.
  */
 #define DTMALLOC_PROBE_MALLOC		0
 #define DTMALLOC_PROBE_FREE		1
 #define DTMALLOC_PROBE_MAX		2
 
 struct malloc_type_internal {
 	uint32_t	mti_probes[DTMALLOC_PROBE_MAX];
 					/* DTrace probe ID array. */
 	u_char		mti_zone;
 					/*
 					 * NOTE(review): purpose not shown in
 					 * this header; confirm against the
 					 * malloc(9) implementation.
 					 */
 	struct malloc_type_stats	mti_stats[MAXCPU];
 					/* Statistics, one slot per CPU. */
 };
 
 /*
  * Public data structure describing a malloc type.  Private data is hung off
  * of ks_handle to avoid encoding internal malloc(9) data structures in
  * modules, which will statically allocate struct malloc_type.
  */
 struct malloc_type {
 	struct malloc_type *ks_next;	/* Next in global chain. */
 	u_long		 ks_magic;	/* Detect programmer error. */
 	const char	*ks_shortdesc;	/* Printable type name. */
 	void		*ks_handle;	/* Priv. data, was lo_class. */
 };
 
 /*
  * Statistics structure headers for user space.  The kern.malloc sysctl
  * exposes a structure stream consisting of a stream header, then a series of
  * malloc type headers and statistics structures (quantity maxcpus).  For
  * convenience, the kernel will provide the current value of maxcpus at the
  * head of the stream.
  */
 #define	MALLOC_TYPE_STREAM_VERSION	0x00000001
 struct malloc_type_stream_header {
 	uint32_t	mtsh_version;	/* Stream format version. */
 	uint32_t	mtsh_maxcpus;	/* Value of MAXCPU for stream. */
 	uint32_t	mtsh_count;	/* Number of records. */
 	uint32_t	_mtsh_pad;	/* Pad/reserved field. */
 };
 
 #define	MALLOC_MAX_NAME	32
 struct malloc_type_header {
 	char				mth_name[MALLOC_MAX_NAME];
 };
 
 #ifdef _KERNEL
 #define	MALLOC_DEFINE(type, shortdesc, longdesc)			\
 	struct malloc_type type[1] = {					\
 		{ NULL, M_MAGIC, shortdesc, NULL }			\
 	};								\
 	SYSINIT(type##_init, SI_SUB_KMEM, SI_ORDER_THIRD, malloc_init,	\
 	    type);							\
 	SYSUNINIT(type##_uninit, SI_SUB_KMEM, SI_ORDER_ANY,		\
 	    malloc_uninit, type)
 
 #define	MALLOC_DECLARE(type) \
 	extern struct malloc_type type[1]
 
 MALLOC_DECLARE(M_CACHE);
 MALLOC_DECLARE(M_DEVBUF);
 MALLOC_DECLARE(M_TEMP);
 
 /*
  * XXX this should be declared in <sys/uio.h>, but that tends to fail
  * because <sys/uio.h> is included in a header before the source file
  * has a chance to include <sys/malloc.h> to get MALLOC_DECLARE() defined.
  */
 MALLOC_DECLARE(M_IOV);
 
 extern struct mtx malloc_mtx;
 
 /*
  * Function type used when iterating over the list of malloc types.
  */
 typedef void malloc_type_list_func_t(struct malloc_type *, void *);
 
 void	contigfree(void *addr, unsigned long size, struct malloc_type *type);
 void	*contigmalloc(unsigned long size, struct malloc_type *type, int flags,
 	    vm_paddr_t low, vm_paddr_t high, unsigned long alignment,
 	    vm_paddr_t boundary) __malloc_like __result_use_check
 	    __alloc_size(1) __alloc_align(6);
 void	*contigmalloc_domain(unsigned long size, struct malloc_type *type,
 	    int domain, int flags, vm_paddr_t low, vm_paddr_t high,
 	    unsigned long alignment, vm_paddr_t boundary)
 	    __malloc_like __result_use_check __alloc_size(1) __alloc_align(6);
 void	free(void *addr, struct malloc_type *type);
 void	free_domain(void *addr, struct malloc_type *type);
 void	*malloc(size_t size, struct malloc_type *type, int flags) __malloc_like
 	    __result_use_check __alloc_size(1);
 /*
  * Try to optimize malloc(..., ..., M_ZERO) allocations by doing zeroing in
  * place if the size is known at compilation time.
  *
  * Passing the flag down requires malloc to blindly zero the entire object.
  * In practice a lot of the zeroing can be avoided if most of the object
  * gets explicitly initialized after the allocation. Letting the compiler
  * zero in place gives it the opportunity to take advantage of this state.
  *
  * Note that the operation is only applicable if both flags and size are
  * known at compilation time. If M_ZERO is passed but M_WAITOK is not, the
  * allocation can fail and a NULL check is needed. However, if M_WAITOK is
  * passed we know the allocation must succeed and the check can be elided.
  *
  *	_malloc_item = malloc(_size, type, (flags) &~ M_ZERO);
  *	if (((flags) & M_WAITOK) != 0 || _malloc_item != NULL)
  *		bzero(_malloc_item, _size);
  *
  * If the flag is set, the compiler knows the left side is always true,
  * therefore the entire statement is true and the callsite is:
  *
  *	_malloc_item = malloc(_size, type, (flags) &~ M_ZERO);
  *	bzero(_malloc_item, _size);
  *
  * If the flag is not set, the compiler knows the left side is always false
  * and the NULL check is needed, therefore the callsite is:
  *
  * 	_malloc_item = malloc(_size, type, (flags) &~ M_ZERO);
  *	if (_malloc_item != NULL)
  *		bzero(_malloc_item, _size);			
  *
  * The implementation is a macro because of what appears to be a clang 6 bug:
  * an inline function variant ended up being compiled to a mere malloc call
  * regardless of argument. gcc generates expected code (like the above).
  */
 #ifdef _KERNEL
 #define	malloc(size, type, flags) ({					\
 	void *_malloc_item;						\
 	size_t _size = (size);						\
 	if (__builtin_constant_p(size) && __builtin_constant_p(flags) &&\
 	    ((flags) & M_ZERO) != 0) {					\
 		_malloc_item = malloc(_size, type, (flags) &~ M_ZERO);	\
 		if (((flags) & M_WAITOK) != 0 ||			\
 		    __predict_true(_malloc_item != NULL))		\
 			bzero(_malloc_item, _size);			\
 	} else {							\
 		_malloc_item = malloc(_size, type, flags);		\
 	}								\
 	_malloc_item;							\
 })
 #endif
 
 void	*malloc_domain(size_t size, struct malloc_type *type, int domain,
 	    int flags) __malloc_like __result_use_check __alloc_size(1);
 void	*mallocarray(size_t nmemb, size_t size, struct malloc_type *type,
 	    int flags) __malloc_like __result_use_check
 	    __alloc_size2(1, 2);
 void	malloc_init(void *);
 int	malloc_last_fail(void);
 void	malloc_type_allocated(struct malloc_type *type, unsigned long size);
 void	malloc_type_freed(struct malloc_type *type, unsigned long size);
 void	malloc_type_list(malloc_type_list_func_t *, void *);
 void	malloc_uninit(void *);
 void	*realloc(void *addr, size_t size, struct malloc_type *type, int flags)
 	    __result_use_check __alloc_size(2);
 void	*reallocf(void *addr, size_t size, struct malloc_type *type, int flags)
 	    __result_use_check __alloc_size(2);
 
 struct malloc_type *malloc_desc2type(const char *desc);
 
 /*
  * This is sqrt(SIZE_MAX+1), as s1*s2 <= SIZE_MAX
  * if both s1 < MUL_NO_OVERFLOW and s2 < MUL_NO_OVERFLOW
  */
 #define MUL_NO_OVERFLOW		(1UL << (sizeof(size_t) * 8 / 2))
 /*
  * Report whether nmemb * size cannot be represented in a size_t.
  */
 static inline bool
 WOULD_OVERFLOW(size_t nmemb, size_t size)
 {
 
 	/* Both factors below sqrt(SIZE_MAX + 1): the product always fits. */
 	if (nmemb < MUL_NO_OVERFLOW && size < MUL_NO_OVERFLOW)
 		return (false);
 	/* A zero count cannot overflow and must not be used as a divisor. */
 	if (nmemb == 0)
 		return (false);
 	return (size > __SIZE_T_MAX / nmemb);
 }
 #undef MUL_NO_OVERFLOW
 #endif /* _KERNEL */
 
 #endif /* !_SYS_MALLOC_H_ */
Index: head/sys/vm/uma.h
===================================================================
--- head/sys/vm/uma.h	(revision 335067)
+++ head/sys/vm/uma.h	(revision 335068)
@@ -1,717 +1,718 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2002, 2003, 2004, 2005 Jeffrey Roberson <jeff@FreeBSD.org>
  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  *
  */
 
 /*
  * uma.h - External definitions for the Universal Memory Allocator
  *
 */
 
 #ifndef _VM_UMA_H_
 #define _VM_UMA_H_
 
 #include <sys/param.h>		/* For NULL */
 #include <sys/malloc.h>		/* For M_* */
 
 /* User visible parameters */
 #define UMA_SMALLEST_UNIT       (PAGE_SIZE / 256) /* Smallest item allocated */
 
 /* Types and type defs */
 
 struct uma_zone;
 /* Opaque type used as a handle to the zone */
 typedef struct uma_zone * uma_zone_t;
 
 void zone_drain(uma_zone_t);
 
 /*
  * Item constructor
  *
  * Arguments:
  *	item  A pointer to the memory which has been allocated.
  *	arg   The arg field passed to uma_zalloc_arg
  *	size  The size of the allocated item
  *	flags See zalloc flags
  *
  * Returns:
  *	0      on success
  *      errno  on failure
  *
  * Discussion:
  *	The constructor is called just before the memory is returned
  *	to the user. It may block if necessary.
  */
 typedef int (*uma_ctor)(void *mem, int size, void *arg, int flags);
 
 /*
  * Item destructor
  *
  * Arguments:
  *	item  A pointer to the memory which has been allocated.
  *	size  The size of the item being destructed.
  *	arg   Argument passed through uma_zfree_arg
  *
  * Returns:
  *	Nothing
  *
  * Discussion:
  *	The destructor may perform operations that differ from those performed
  *	by the initializer, but it must leave the object in the same state.
  *	This IS type stable storage.  This is called after EVERY zfree call.
  */
 typedef void (*uma_dtor)(void *mem, int size, void *arg);
 
 /*
  * Item initializer
  *
  * Arguments:
  *	item  A pointer to the memory which has been allocated.
  *	size  The size of the item being initialized.
  *	flags See zalloc flags
  *
  * Returns:
  *	0      on success
  *      errno  on failure
  *
  * Discussion:
  *	The initializer is called when the memory is cached in the uma zone.
  *	The initializer and the destructor should leave the object in the same
  *	state.
  */
 typedef int (*uma_init)(void *mem, int size, int flags);
 
 /*
  * Item discard function
  *
  * Arguments:
  *	item  A pointer to memory which has been 'freed' but has not left the
  *	      zone's cache.
  *	size  The size of the item being discarded.
  *
  * Returns:
  *	Nothing
  *
  * Discussion:
  *	This routine is called when memory leaves a zone and is returned to the
  *	system for other uses.  It is the counter-part to the init function.
  */
 typedef void (*uma_fini)(void *mem, int size);
 
 /*
  * Import new memory into a cache zone.
  */
 typedef int (*uma_import)(void *arg, void **store, int count, int domain,
     int flags);
 
 /*
  * Free memory from a cache zone.
  */
 typedef void (*uma_release)(void *arg, void **store, int count);
 
 /*
  * What's the difference between initializing and constructing?
  *
  * The item is initialized when it is cached, and this is the state that the
  * object should be in when returned to the allocator. The purpose of this is
  * to remove some code which would otherwise be called on each allocation by
  * utilizing a known, stable state.  This differs from the constructor which
  * will be called on EVERY allocation.
  *
  * For example, in the initializer you may want to initialize embedded locks,
  * NULL list pointers, set up initial states, magic numbers, etc.  This way if
  * the object is held in the allocator and re-used it won't be necessary to
  * re-initialize it.
  *
  * The constructor may be used to lock a data structure, link it on to lists,
  * bump reference counts or total counts of outstanding structures, etc.
  *
  */
 
 
 /* Function prototypes */
 
 /*
  * Create a new uma zone
  *
  * Arguments:
  *	name  The text name of the zone for debugging and stats. This memory
  *		should not be freed until the zone has been deallocated.
  *	size  The size of the object that is being created.
  *	ctor  The constructor that is called when the object is allocated.
  *	dtor  The destructor that is called when the object is freed.
  *	init  An initializer that sets up the initial state of the memory.
  *	fini  A discard function that undoes initialization done by init.
  *		ctor/dtor/init/fini may all be null, see notes above.
  *	align A bitmask that corresponds to the requested alignment
  *		eg 4 would be 0x3
  *	flags A set of parameters that control the behavior of the zone.
  *
  * Returns:
  *	A pointer to a structure which is intended to be opaque to users of
  *	the interface.  The value may be null if the wait flag is not set.
  */
 uma_zone_t uma_zcreate(const char *name, size_t size, uma_ctor ctor,
 		    uma_dtor dtor, uma_init uminit, uma_fini fini,
 		    int align, uint32_t flags);
 
 /*
  * Create a secondary uma zone
  *
  * Arguments:
  *	name  The text name of the zone for debugging and stats. This memory
  *		should not be freed until the zone has been deallocated.
  *	ctor  The constructor that is called when the object is allocated.
  *	dtor  The destructor that is called when the object is freed.
  *	zinit  An initializer that sets up the initial state of the memory
  *		as the object passes from the Keg's slab to the Zone's cache.
  *	zfini  A discard function that undoes initialization done by init
  *		as the object passes from the Zone's cache to the Keg's slab.
  *
  *		ctor/dtor/zinit/zfini may all be null, see notes above.
  *		Note that the zinit and zfini specified here are NOT
  *		exactly the same as the init/fini specified to uma_zcreate()
  *		when creating a master zone.  These zinit/zfini are called
  *		on the TRANSITION from keg to zone (and vice-versa). Once
  *		these are set, the primary zone may alter its init/fini
  *		(which are called when the object passes from VM to keg)
  *		using uma_zone_set_init/fini() as well as its own
  *		zinit/zfini (unset by default for master zone) with
  *		uma_zone_set_zinit/zfini() (note subtle 'z' prefix).
  *
  *	master  A reference to this zone's Master Zone (Primary Zone),
  *		which contains the backing Keg for the Secondary Zone
  *		being added.
  *
  * Returns:
  *	A pointer to a structure which is intended to be opaque to users of
  *	the interface.  The value may be null if the wait flag is not set.
  */
 uma_zone_t uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
 		    uma_init zinit, uma_fini zfini, uma_zone_t master);
 
 /*
  * Add a second master to a secondary zone.  This provides multiple data
  * backends for objects with the same size.  Both masters must have
  * compatible allocation flags.  Presently, UMA_ZONE_MALLOC type zones are
  * the only ones supported.
  *
  * Returns:
  *	Error on failure, 0 on success.
  */
 int uma_zsecond_add(uma_zone_t zone, uma_zone_t master);
 
 /*
  * Create cache-only zones.
  *
  * This allows uma's per-cpu cache facilities to handle arbitrary
  * pointers.  Consumers must specify the import and release functions to
  * fill and destroy caches.  UMA does not allocate any memory for these
  * zones.  The 'arg' parameter is passed to import/release and is caller
  * specific.
  */
 uma_zone_t uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor,
 		    uma_init zinit, uma_fini zfini, uma_import zimport,
 		    uma_release zrelease, void *arg, int flags);
 
 /*
  * Definitions for uma_zcreate flags
  *
  * These flags share space with UMA_ZFLAGs in uma_int.h.  Be careful not to
  * overlap when adding new features.  0xff000000 is in use by uma_int.h.
  */
 #define UMA_ZONE_PAGEABLE	0x0001	/* Return items not fully backed by
 					   physical memory XXX Not yet */
 #define UMA_ZONE_ZINIT		0x0002	/* Initialize with zeros */
 #define UMA_ZONE_STATIC		0x0004	/* Statically sized zone */
 #define UMA_ZONE_OFFPAGE	0x0008	/* Force the slab structure allocation
 					   off of the real memory */
 #define UMA_ZONE_MALLOC		0x0010	/* For use by malloc(9) only! */
 #define UMA_ZONE_NOFREE		0x0020	/* Do not free slabs of this type! */
 #define UMA_ZONE_MTXCLASS	0x0040	/* Create a new lock class */
 #define	UMA_ZONE_VM		0x0080	/*
 					 * Used for internal vm datastructures
 					 * only.
 					 */
 #define	UMA_ZONE_HASH		0x0100	/*
 					 * Use a hash table instead of caching
 					 * information in the vm_page.
 					 */
 #define	UMA_ZONE_SECONDARY	0x0200	/* Zone is a Secondary Zone */
 #define	UMA_ZONE_NOBUCKET	0x0400	/* Do not use buckets. */
 #define	UMA_ZONE_MAXBUCKET	0x0800	/* Use largest buckets. */
 #define	UMA_ZONE_CACHESPREAD	0x1000	/*
 					 * Spread memory start locations across
 					 * all possible cache lines.  May
 					 * require many virtually contiguous
 					 * backend pages and can fail early.
 					 */
 #define	UMA_ZONE_VTOSLAB	0x2000	/* Zone uses vtoslab for lookup. */
 #define	UMA_ZONE_NODUMP		0x4000	/*
 					 * Zone's pages will not be included in
 					 * mini-dumps.
 					 */
 #define	UMA_ZONE_PCPU		0x8000	/*
 					 * Allocates mp_maxid + 1 slabs sized to
 					 * sizeof(struct pcpu).
 					 */
 #define	UMA_ZONE_NUMA		0x10000	/*
 					 * NUMA aware Zone.  Implements a best
 					 * effort first-touch policy.
 					 */
 #define	UMA_ZONE_NOBUCKETCACHE	0x20000	/*
 					 * Don't cache full buckets.  Limit
 					 * UMA to per-cpu state.
 					 */
 
 /*
  * These flags are shared between the keg and zone.  In zones wishing to add
  * new kegs these flags must be compatible.  Some are determined based on
  * physical parameters of the request and may not be provided by the consumer.
  */
 #define	UMA_ZONE_INHERIT						\
     (UMA_ZONE_OFFPAGE | UMA_ZONE_MALLOC | UMA_ZONE_NOFREE |		\
     UMA_ZONE_HASH | UMA_ZONE_VTOSLAB | UMA_ZONE_PCPU)
 
 /* Definitions for align */
 #define UMA_ALIGN_PTR	(sizeof(void *) - 1)	/* Alignment fit for ptr */
 #define UMA_ALIGN_LONG	(sizeof(long) - 1)	/* "" long */
 #define UMA_ALIGN_INT	(sizeof(int) - 1)	/* "" int */
 #define UMA_ALIGN_SHORT	(sizeof(short) - 1)	/* "" short */
 #define UMA_ALIGN_CHAR	(sizeof(char) - 1)	/* "" char */
 #define UMA_ALIGN_CACHE	(0 - 1)			/* Cache line size align */
 #define	UMA_ALIGNOF(type) (_Alignof(type) - 1)	/* Alignment fit for 'type' */
 
 /*
  * Destroys an empty uma zone.  If the zone is not empty uma complains loudly.
  *
  * Arguments:
  *	zone  The zone we want to destroy.
  *
  */
 void uma_zdestroy(uma_zone_t zone);
 
 /*
  * Allocates an item out of a zone
  *
  * Arguments:
  *	zone  The zone we are allocating from
  *	arg   This data is passed to the ctor function
  *	flags See sys/malloc.h for available flags.
  *
  * Returns:
  *	A non-null pointer to an initialized element from the zone is
  *	guaranteed if the wait flag is M_WAITOK.  Otherwise a null pointer
  *	may be returned if the zone is empty or the ctor failed.
  */
 
 void *uma_zalloc_arg(uma_zone_t zone, void *arg, int flags);
 void *uma_zalloc_pcpu_arg(uma_zone_t zone, void *arg, int flags);
 
 /*
  * Allocate an item from a specific NUMA domain.  This uses a slow path in
  * the allocator but is guaranteed to allocate memory from the requested
  * domain if M_WAITOK is set.
  *
  * Arguments:
  *	zone  The zone we are allocating from
  *	arg   This data is passed to the ctor function
  *	domain The domain to allocate from.
  *	flags See sys/malloc.h for available flags.
  */
 void *uma_zalloc_domain(uma_zone_t zone, void *arg, int domain, int flags);
 
 /*
  * Allocates an item out of a zone without supplying an argument
  *
  * This is just a wrapper for uma_zalloc_arg for convenience.
  *
  */
 static __inline void *uma_zalloc(uma_zone_t zone, int flags);
 static __inline void *uma_zalloc_pcpu(uma_zone_t zone, int flags);
 
 /* Shorthand for uma_zalloc_arg() with a NULL ctor argument. */
 static __inline void *
 uma_zalloc(uma_zone_t zone, int flags)
 {
 
 	return (uma_zalloc_arg(zone, NULL, flags));
 }
 
 /* Shorthand for uma_zalloc_pcpu_arg() with a NULL argument. */
 static __inline void *
 uma_zalloc_pcpu(uma_zone_t zone, int flags)
 {
 
 	return (uma_zalloc_pcpu_arg(zone, NULL, flags));
 }
 
 /*
  * Frees an item back into the specified zone.
  *
  * Arguments:
  *	zone  The zone the item was originally allocated out of.
  *	item  The memory to be freed.
  *	arg   Argument passed to the destructor
  *
  * Returns:
  *	Nothing.
  */
 
 void uma_zfree_arg(uma_zone_t zone, void *item, void *arg);
 void uma_zfree_pcpu_arg(uma_zone_t zone, void *item, void *arg);
 
 /*
  * Frees an item back to the specified zone's domain specific pool.
  *
  * Arguments:
  *	zone  The zone the item was originally allocated out of.
  *	item  The memory to be freed.
  *	arg   Argument passed to the destructor
  */
 void uma_zfree_domain(uma_zone_t zone, void *item, void *arg);
 
 /*
  * Frees an item back to a zone without supplying an argument
  *
  * This is just a wrapper for uma_zfree_arg for convenience.
  *
  */
 static __inline void uma_zfree(uma_zone_t zone, void *item);
 static __inline void uma_zfree_pcpu(uma_zone_t zone, void *item);
 
 /* Free 'item' back to 'zone', passing NULL as the destructor argument. */
 static __inline void
 uma_zfree(uma_zone_t zone, void *item)
 {
 
 	uma_zfree_arg(zone, item, NULL);
 }
 
 /* Per-CPU flavour of uma_zfree(): NULL is passed as the destructor argument. */
 static __inline void
 uma_zfree_pcpu(uma_zone_t zone, void *item)
 {
 
 	uma_zfree_pcpu_arg(zone, item, NULL);
 }
 
 /*
  * Wait until the specified zone can allocate an item.
  */
 void uma_zwait(uma_zone_t zone);
 
 /*
  * Backend page supplier routines
  *
  * Arguments:
  *	zone  The zone that is requesting pages.
  *	size  The number of bytes being requested.
  *	pflag Flags for these memory pages, see below.
  *	domain The NUMA domain that we prefer for this allocation.
  *	wait  Indicates our willingness to block.
  *
  * Returns:
  *	A pointer to the allocated memory or NULL on failure.
  */
 
 typedef void *(*uma_alloc)(uma_zone_t zone, vm_size_t size, int domain,
     uint8_t *pflag, int wait);
 
 /*
  * Backend page free routines
  *
  * Arguments:
  *	item  A pointer to the previously allocated pages.
  *	size  The original size of the allocation.
  *	pflag The flags for the slab.  See UMA_SLAB_* below.
  *
  * Returns:
  *	None
  */
 typedef void (*uma_free)(void *item, vm_size_t size, uint8_t pflag);
 
 /*
  * Reclaims unused memory for all zones
  *
  * Arguments:
  *	None
  * Returns:
  *	None
  *
  * This should only be called by the page out daemon.
  */
 
 void uma_reclaim(void);
 
 /*
  * Sets the alignment mask to be used for all zones requesting cache
  * alignment.  Should be called by MD boot code prior to starting VM/UMA.
  *
  * Arguments:
  *	align The alignment mask
  *
  * Returns:
  *	Nothing
  */
 void uma_set_align(int align);
 
 /*
  * Set a reserved number of items to hold for M_USE_RESERVE allocations.  All
  * other requests must allocate new backing pages.
  */
 void uma_zone_reserve(uma_zone_t zone, int nitems);
 
 /*
  * Reserves the maximum KVA space required by the zone and configures the zone
  * to use a VM_ALLOC_NOOBJ-based backend allocator.
  *
  * Arguments:
  *	zone  The zone to update.
  *	nitems  The upper limit on the number of items that can be allocated.
  *
  * Returns:
  *	0  if KVA space can not be allocated
  *	1  if successful
  *
  * Discussion:
  *	When the machine supports a direct map and the zone's items are smaller
  *	than a page, the zone will use the direct map instead of allocating KVA
  *	space.
  */
 int uma_zone_reserve_kva(uma_zone_t zone, int nitems);
 
 /*
  * Sets a high limit on the number of items allowed in a zone
  *
  * Arguments:
  *	zone  The zone to limit
  *	nitems  The requested upper limit on the number of items allowed
  *
  * Returns:
  *	int  The effective value of nitems after rounding up based on page size
  */
 int uma_zone_set_max(uma_zone_t zone, int nitems);
 
 /*
  * Obtains the effective limit on the number of items in a zone
  *
  * Arguments:
  *	zone  The zone to obtain the effective limit from
  *
  * Return:
  *	0  No limit
  *	int  The effective limit of the zone
  */
 int uma_zone_get_max(uma_zone_t zone);
 
 /*
  * Sets a warning to be printed when limit is reached
  *
  * Arguments:
  *	zone  The zone we will warn about
  *	warning  Warning content
  *
  * Returns:
  *	Nothing
  */
 void uma_zone_set_warning(uma_zone_t zone, const char *warning);
 
 /*
  * Sets a function to run when limit is reached
  *
  * Arguments:
  *	zone  The zone to which this applies
  *	fx  The function to run
  *
  * Returns:
  *	Nothing
  */
 typedef void (*uma_maxaction_t)(uma_zone_t, int);
 void uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t);
 
 /*
  * Obtains the approximate current number of items allocated from a zone
  *
  * Arguments:
  *	zone  The zone to obtain the current allocation count from
  *
  * Return:
  *	int  The approximate current number of items allocated from the zone
  */
 int uma_zone_get_cur(uma_zone_t zone);
 
 /*
  * The following two routines (uma_zone_set_init/fini)
  * are used to set the backend init/fini pair which acts on an
  * object as it becomes allocated and is placed in a slab within
  * the specified zone's backing keg.  These should probably not
  * be changed once allocations have already begun, but only be set
  * immediately upon zone creation.
  */
 void uma_zone_set_init(uma_zone_t zone, uma_init uminit);
 void uma_zone_set_fini(uma_zone_t zone, uma_fini fini);
 
 /*
  * The following two routines (uma_zone_set_zinit/zfini) are
  * used to set the zinit/zfini pair which acts on an object as
  * it passes from the backing Keg's slab cache to the
  * specified Zone's bucket cache.  These should probably not
  * be changed once allocations have already begun, but only be set
  * immediately upon zone creation.
  */
 void uma_zone_set_zinit(uma_zone_t zone, uma_init zinit);
 void uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini);
 
 /*
  * Replaces the standard backend allocator for this zone.
  *
  * Arguments:
  *	zone   The zone whose backend allocator is being changed.
  *	allocf A pointer to the allocation function
  *
  * Returns:
  *	Nothing
  *
  * Discussion:
  *	This could be used to implement pageable allocation, or perhaps
  *	even DMA allocators if used in conjunction with the OFFPAGE
  *	zone flag.
  */
 
 void uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf);
 
 /*
  * Used for freeing memory provided by the allocf above
  *
  * Arguments:
  *	zone  The zone that intends to use this free routine.
  *	freef The page freeing routine.
  *
  * Returns:
  *	Nothing
  */
 
 void uma_zone_set_freef(uma_zone_t zone, uma_free freef);
 
 /*
  * These flags are settable in the allocf and visible in the freef.
  */
 #define UMA_SLAB_BOOT	0x01		/* Slab alloced from boot pages */
+#define UMA_SLAB_KRWX	0x02		/* Slab alloced from kernel_rwx_arena */
 #define UMA_SLAB_KERNEL	0x04		/* Slab alloced from kernel_map */
 #define UMA_SLAB_PRIV	0x08		/* Slab alloced from priv allocator */
 #define UMA_SLAB_OFFP	0x10		/* Slab is managed separately  */
 #define UMA_SLAB_MALLOC	0x20		/* Slab is a large malloc slab */
-/* 0x02, 0x40 and 0x80 are available */
+/* 0x40 and 0x80 are available */
 
 /*
  * Used to pre-fill a zone with some number of items
  *
  * Arguments:
  *	zone    The zone to fill
  *	itemcnt The number of items to reserve
  *
  * Returns:
  *	Nothing
  *
  * NOTE: This is blocking and should only be done at startup
  */
 void uma_prealloc(uma_zone_t zone, int itemcnt);
 
 /*
  * Used to determine if a fixed-size zone is exhausted.
  *
  * Arguments:
  *	zone    The zone to check
  *
  * Returns:
  *	Non-zero if zone is exhausted.
  */
 int uma_zone_exhausted(uma_zone_t zone);
 int uma_zone_exhausted_nolock(uma_zone_t zone);
 
 /*
  * Common UMA_ZONE_PCPU zones.
  */
 extern uma_zone_t pcpu_zone_64;
 extern uma_zone_t pcpu_zone_ptr;
 
 /*
  * Exported statistics structures to be used by user space monitoring tools.
  * The statistics stream consists of a uma_stream_header, followed by a series
  * of alternating uma_type_header and uma_percpu_stat structures.
  */
 #define	UMA_STREAM_VERSION	0x00000001
 /*
  * Header that opens the exported statistics stream.  It consists of four
  * 32-bit fields; the last one is padding/reserved.
  * NOTE(review): ush_version is presumably set to UMA_STREAM_VERSION by the
  * producer -- confirm in the sysctl handler.
  */
 struct uma_stream_header {
 	uint32_t	ush_version;	/* Stream format version. */
 	uint32_t	ush_maxcpus;	/* Value of MAXCPU for stream. */
 	uint32_t	ush_count;	/* Number of records. */
 	uint32_t	_ush_pad;	/* Pad/reserved field. */
 };
 
 #define	UTH_MAX_NAME	32
 #define	UTH_ZONE_SECONDARY	0x00000001
 /*
  * Per-zone record of the exported statistics stream.  The first group of
  * fields is static configuration (partly taken from the backing keg); the
  * second group holds the current counters.  The zone name is stored inline
  * in a fixed buffer of UTH_MAX_NAME bytes.
  * NOTE(review): UTH_ZONE_SECONDARY looks like a bit for uth_zone_flags --
  * confirm against the producer.
  */
 struct uma_type_header {
 	/*
 	 * Static per-zone data, some extracted from the supporting keg.
 	 */
 	char		uth_name[UTH_MAX_NAME];
 	uint32_t	uth_align;	/* Keg: alignment. */
 	uint32_t	uth_size;	/* Keg: requested size of item. */
 	uint32_t	uth_rsize;	/* Keg: real size of item. */
 	uint32_t	uth_maxpages;	/* Keg: maximum number of pages. */
 	uint32_t	uth_limit;	/* Keg: max items to allocate. */
 
 	/*
 	 * Current dynamic zone/keg-derived statistics.
 	 */
 	uint32_t	uth_pages;	/* Keg: pages allocated. */
 	uint32_t	uth_keg_free;	/* Keg: items free. */
 	uint32_t	uth_zone_free;	/* Zone: items free. */
 	uint32_t	uth_bucketsize;	/* Zone: desired bucket size. */
 	uint32_t	uth_zone_flags;	/* Zone: flags. */
 	uint64_t	uth_allocs;	/* Zone: number of allocations. */
 	uint64_t	uth_frees;	/* Zone: number of frees. */
 	uint64_t	uth_fails;	/* Zone: number of alloc failures. */
 	uint64_t	uth_sleeps;	/* Zone: number of alloc sleeps. */
 	uint64_t	_uth_reserved1[2];	/* Reserved. */
 };
 
 /*
  * Cache counters exported in the statistics stream: three 64-bit counters
  * followed by five reserved 64-bit slots.
  * NOTE(review): presumably one of these is emitted per CPU for each zone --
  * confirm against the producer.
  */
 struct uma_percpu_stat {
 	uint64_t	ups_allocs;	/* Cache: number of allocations. */
 	uint64_t	ups_frees;	/* Cache: number of frees. */
 	uint64_t	ups_cache_free;	/* Cache: free items in cache. */
 	uint64_t	_ups_reserved[5];	/* Reserved. */
 };
 
 void uma_reclaim_wakeup(void);
 void uma_reclaim_worker(void *);
 
 unsigned long uma_limit(void);
 
 /* Return the amount of memory managed by UMA. */
 unsigned long uma_size(void);
 
 /* Return the amount of memory remaining.  May be negative. */
 long uma_avail(void);
 
 #endif	/* _VM_UMA_H_ */
Index: head/sys/vm/uma_core.c
===================================================================
--- head/sys/vm/uma_core.c	(revision 335067)
+++ head/sys/vm/uma_core.c	(revision 335068)
@@ -1,4129 +1,4153 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2002-2005, 2009, 2013 Jeffrey Roberson <jeff@FreeBSD.org>
  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
  * Copyright (c) 2004-2006 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
  * uma_core.c  Implementation of the Universal Memory allocator
  *
  * This allocator is intended to replace the multitude of similar object caches
  * in the standard FreeBSD kernel.  The intent is to be flexible as well as
  * efficient.  A primary design goal is to return unused memory to the rest of
  * the system.  This will make the system as a whole more flexible due to the
  * ability to move memory to subsystems which most need it instead of leaving
  * pools of reserved memory unused.
  *
  * The basic ideas stem from similar slab/zone based allocators whose algorithms
  * are well known.
  *
  */
 
 /*
  * TODO:
  *	- Improve memory usage for large allocations
  *	- Investigate cache size adjustments
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_param.h"
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bitset.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/types.h>
 #include <sys/limits.h>
 #include <sys/queue.h>
 #include <sys/malloc.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/sysctl.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/random.h>
 #include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/taskqueue.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_param.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 #include <vm/uma_int.h>
 #include <vm/uma_dbg.h>
 
 #include <ddb/ddb.h>
 
 #ifdef DEBUG_MEMGUARD
 #include <vm/memguard.h>
 #endif
 
 /*
  * This is the zone and keg from which all zones are spawned.
  */
 static uma_zone_t kegs;
 static uma_zone_t zones;
 
 /* This is the zone from which all offpage uma_slab_ts are allocated. */
 static uma_zone_t slabzone;
 
 /*
  * The initial hash tables come out of this zone so they can be allocated
  * prior to malloc coming up.
  */
 static uma_zone_t hashzone;
 
 /* The boot-time adjusted value for cache line alignment. */
 int uma_align_cache = 64 - 1;
 
 static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
 
 /*
  * Are we allowed to allocate buckets?
  */
 static int bucketdisable = 1;
 
 /* Linked list of all kegs in the system */
 static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);
 
 /* Linked list of all cache-only zones in the system */
 static LIST_HEAD(,uma_zone) uma_cachezones =
     LIST_HEAD_INITIALIZER(uma_cachezones);
 
 /* This RW lock protects the keg list */
 static struct rwlock_padalign __exclusive_cache_line uma_rwlock;
 
 /*
  * Pointer and counter to pool of pages, that is preallocated at
  * startup to bootstrap UMA.
  */
 static char *bootmem;
 static int boot_pages;
 
 static struct sx uma_drain_lock;
 
 /* kmem soft limit. */
 static unsigned long uma_kmem_limit = LONG_MAX;
 static volatile unsigned long uma_kmem_total;
 
 /* Is the VM done starting up? */
 static enum { BOOT_COLD = 0, BOOT_STRAPPED, BOOT_PAGEALLOC, BOOT_BUCKETS,
     BOOT_RUNNING } booted = BOOT_COLD;
 
 /*
  * This is the handle used to schedule events that need to happen
  * outside of the allocation fast path.
  */
 static struct callout uma_callout;
 #define	UMA_TIMEOUT	20		/* Seconds for callout interval. */
 
 /*
  * This structure is passed as the zone ctor arg so that I don't have to create
  * a special allocation function just for zones.
  */
 struct uma_zctor_args {
 	/*
 	 * NOTE(review): the per-field notes below are inferred from the field
 	 * names and types only; confirm them against zone_ctor().
 	 */
 	const char *name;	/* presumably the zone name */
 	size_t size;		/* presumably the item size */
 	uma_ctor ctor;
 	uma_dtor dtor;
 	uma_init uminit;
 	uma_fini fini;
 	uma_import import;
 	uma_release release;
 	void *arg;
 	uma_keg_t keg;
 	int align;
 	uint32_t flags;
 };
 
 /*
  * NOTE(review): presumably the keg counterpart of uma_zctor_args, i.e. the
  * argument bundle handed to keg_ctor() -- confirm at the call site.  The
  * fields mirror the parameters of uma_kcreate() declared below.
  */
 struct uma_kctor_args {
 	uma_zone_t zone;
 	size_t size;
 	uma_init uminit;
 	uma_fini fini;
 	int align;
 	uint32_t flags;
 };
 
 /*
  * Describes one bucket size class: the zone that buckets of this size are
  * allocated from (created in bucket_init()), the name given to that zone,
  * and the capacity limits of the class.
  */
 struct uma_bucket_zone {
 	uma_zone_t	ubz_zone;
 	char		*ubz_name;
 	int		ubz_entries;	/* Number of items it can hold. */
 	int		ubz_maxsize;	/* Maximum allocation size per-item. */
 };
 
 /*
  * Compute the actual number of bucket entries to pack them in power
  * of two sizes for more efficient space utilization.
  */
 #define	BUCKET_SIZE(n)						\
     (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *))
 
 #define	BUCKET_MAX	BUCKET_SIZE(256)
 
 /*
  * Table of bucket size classes, ordered by increasing entry count and
  * decreasing ubz_maxsize.  ubz_zone starts out NULL and is filled in by
  * bucket_init().  The last row is a terminator: every loop over this table
  * stops when it reaches an entry whose ubz_entries is 0.
  */
 struct uma_bucket_zone bucket_zones[] = {
 	{ NULL, "4 Bucket", BUCKET_SIZE(4), 4096 },
 	{ NULL, "6 Bucket", BUCKET_SIZE(6), 3072 },
 	{ NULL, "8 Bucket", BUCKET_SIZE(8), 2048 },
 	{ NULL, "12 Bucket", BUCKET_SIZE(12), 1536 },
 	{ NULL, "16 Bucket", BUCKET_SIZE(16), 1024 },
 	{ NULL, "32 Bucket", BUCKET_SIZE(32), 512 },
 	{ NULL, "64 Bucket", BUCKET_SIZE(64), 256 },
 	{ NULL, "128 Bucket", BUCKET_SIZE(128), 128 },
 	{ NULL, "256 Bucket", BUCKET_SIZE(256), 64 },
 	{ NULL, NULL, 0}	/* Terminator (ubz_entries == 0). */
 };
 
 /*
  * Flags and enumerations to be passed to internal functions.
  */
 enum zfreeskip { SKIP_NONE = 0, SKIP_DTOR, SKIP_FINI };
 
 #define	UMA_ANYDOMAIN	-1	/* Special value for domain search. */
 
 /* Prototypes.. */
 
 int	uma_startup_count(int);
 void	uma_startup(void *, int);
 void	uma_startup1(void);
 void	uma_startup2(void);
 
 static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
 static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
 static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
 static void page_free(void *, vm_size_t, uint8_t);
 static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int);
 static void cache_drain(uma_zone_t);
 static void bucket_drain(uma_zone_t, uma_bucket_t);
 static void bucket_cache_drain(uma_zone_t zone);
 static int keg_ctor(void *, int, void *, int);
 static void keg_dtor(void *, int, void *);
 static int zone_ctor(void *, int, void *, int);
 static void zone_dtor(void *, int, void *);
 static int zero_init(void *, int, int);
 static void keg_small_init(uma_keg_t keg);
 static void keg_large_init(uma_keg_t keg);
 static void zone_foreach(void (*zfunc)(uma_zone_t));
 static void zone_timeout(uma_zone_t zone);
 static int hash_alloc(struct uma_hash *);
 static int hash_expand(struct uma_hash *, struct uma_hash *);
 static void hash_free(struct uma_hash *hash);
 static void uma_timeout(void *);
 static void uma_startup3(void);
 static void *zone_alloc_item(uma_zone_t, void *, int, int);
 static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
 static void bucket_enable(void);
 static void bucket_init(void);
 static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int);
 static void bucket_free(uma_zone_t zone, uma_bucket_t, void *);
 static void bucket_zone_drain(void);
 static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int);
 static uma_slab_t zone_fetch_slab(uma_zone_t, uma_keg_t, int, int);
 static uma_slab_t zone_fetch_slab_multi(uma_zone_t, uma_keg_t, int, int);
 static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab);
 static void slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item);
 static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
     uma_fini fini, int align, uint32_t flags);
 static int zone_import(uma_zone_t, void **, int, int, int);
 static void zone_release(uma_zone_t, void **, int);
 static void uma_zero_item(void *, uma_zone_t);
 
 void uma_print_zone(uma_zone_t);
 void uma_print_stats(void);
 static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
 static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
 
 #ifdef INVARIANTS
 static bool uma_dbg_kskip(uma_keg_t keg, void *mem);
 static bool uma_dbg_zskip(uma_zone_t zone, void *mem);
 static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item);
 static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item);
 
 static SYSCTL_NODE(_vm, OID_AUTO, debug, CTLFLAG_RD, 0,
     "Memory allocation debugging");
 
 static u_int dbg_divisor = 1;
 SYSCTL_UINT(_vm_debug, OID_AUTO, divisor,
     CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &dbg_divisor, 0,
     "Debug & thrash every this item in memory allocator");
 
 static counter_u64_t uma_dbg_cnt = EARLY_COUNTER;
 static counter_u64_t uma_skip_cnt = EARLY_COUNTER;
 SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, trashed, CTLFLAG_RD,
     &uma_dbg_cnt, "memory items debugged");
 SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, skipped, CTLFLAG_RD,
     &uma_skip_cnt, "memory items skipped, not debugged");
 #endif
 
 SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
 
 SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT,
     0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");
 
 SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
     0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
 
 static int zone_warnings = 1;
 SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0,
     "Warn when UMA zones becomes full");
 
 /*
  * Adjust bytes under management by UMA: atomically subtract 'size' from
  * the global uma_kmem_total counter.
  */
 static inline void
 uma_total_dec(unsigned long size)
 {
 	atomic_subtract_long(&uma_kmem_total, size);
 }
 
 /*
  * Atomically add 'size' to the global uma_kmem_total counter.  If the
  * total as it was before the addition already exceeded uma_kmem_limit,
  * wake the reclaim worker.
  */
 static inline void
 uma_total_inc(unsigned long size)
 {
 	unsigned long before;
 
 	before = atomic_fetchadd_long(&uma_kmem_total, size);
 	if (before > uma_kmem_limit)
 		uma_reclaim_wakeup();
 }
 
 /*
  * Refresh the global bucketdisable flag from vm_page_count_min().  While
  * that check returns non-zero, bucket_alloc() refuses to hand out buckets.
  */
 static void
 bucket_enable(void)
 {
 
 	bucketdisable = vm_page_count_min();
 }
 
 /*
  * Create the UMA zone behind every entry of bucket_zones.
  *
  * An item of each zone is one bucket: the uma_bucket header rounded up to
  * pointer size, followed by room for ubz_entries item pointers.
  */
 static void
 bucket_init(void)
 {
 	struct uma_bucket_zone *bz;
 	int hdrsize, itemsize;
 
 	/* The header portion is the same for every bucket size. */
 	hdrsize = roundup(sizeof(struct uma_bucket), sizeof(void *));
 	for (bz = bucket_zones; bz->ubz_entries != 0; bz++) {
 		itemsize = hdrsize + sizeof(void *) * bz->ubz_entries;
 		bz->ubz_zone = uma_zcreate(bz->ubz_name, itemsize,
 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
 		    UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET | UMA_ZONE_NUMA);
 	}
 }
 
 /*
  * Map a desired bucket capacity to a bucket zone: the first table entry
  * that holds at least 'entries' items, or the last (largest) real entry
  * when none is big enough.
  */
 static struct uma_bucket_zone *
 bucket_zone_lookup(int entries)
 {
 	struct uma_bucket_zone *bz;
 
 	bz = bucket_zones;
 	while (bz->ubz_entries != 0) {
 		if (bz->ubz_entries >= entries)
 			return (bz);
 		bz++;
 	}
 
 	/* Ran into the terminator: step back onto the largest zone. */
 	return (bz - 1);
 }
 
 /*
  * Choose a bucket entry count for items of 'size' bytes.
  *
  * Items larger than the first table entry's ubz_maxsize get a count scaled
  * down from that entry, but never less than 1.  Otherwise the result is the
  * ubz_entries of the last table entry whose ubz_maxsize is still >= size.
  */
 static int
 bucket_select(int size)
 {
 	struct uma_bucket_zone *bz;
 
 	bz = bucket_zones;
 	if (size > bz->ubz_maxsize)
 		return (MAX((bz->ubz_maxsize * bz->ubz_entries) / size, 1));
 
 	/* Advance past every class that can still take items of this size. */
 	while (bz->ubz_entries != 0 && bz->ubz_maxsize >= size)
 		bz++;
 
 	return (bz[-1].ubz_entries);
 }
 
 /*
  * Allocate an empty bucket for 'zone' from the matching bucket zone.
  *
  * Arguments:
  *	zone   The zone the bucket is for; zone->uz_count selects the size.
  *	udata  Recursion cookie carrying zone flags (see the comment below).
  *	flags  Allocation flags handed on to uma_zalloc_arg().
  *
  * Returns:
  *	A bucket with ub_cnt set to 0 and ub_entries set to the capacity of
  *	the bucket zone it came from, or NULL if buckets are disabled, the
  *	recursion check trips, or the underlying allocation fails.
  */
 static uma_bucket_t
 bucket_alloc(uma_zone_t zone, void *udata, int flags)
 {
 	struct uma_bucket_zone *ubz;
 	uma_bucket_t bucket;
 
 	/*
 	 * This is to stop us from allocating per cpu buckets while we're
 	 * running out of vm.boot_pages.  Otherwise, we would exhaust the
 	 * boot pages.  This also prevents us from allocating buckets in
 	 * low memory situations.
 	 */
 	if (bucketdisable)
 		return (NULL);
 	/*
 	 * To limit bucket recursion we store the original zone flags
 	 * in a cookie passed via zalloc_arg/zfree_arg.  This allows the
 	 * NOVM flag to persist even through deep recursions.  We also
 	 * store ZFLAG_BUCKET once we have recursed attempting to allocate
 	 * a bucket for a bucket zone so we do not allow infinite bucket
 	 * recursion.  This cookie will even persist to frees of unused
 	 * buckets via the allocation path or bucket allocations in the
 	 * free path.
 	 */
 	if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
 		udata = (void *)(uintptr_t)zone->uz_flags;
 	else {
 		if ((uintptr_t)udata & UMA_ZFLAG_BUCKET)
 			return (NULL);
 		udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET);
 	}
 	if ((uintptr_t)udata & UMA_ZFLAG_CACHEONLY)
 		flags |= M_NOVM;
 	ubz = bucket_zone_lookup(zone->uz_count);
 	/*
 	 * If the lookup picked the very zone we are allocating for, move to
 	 * the next larger bucket zone when one exists.
 	 */
 	if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0)
 		ubz++;
 	bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags);
 	if (bucket) {
 #ifdef INVARIANTS
 		bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
 #endif
 		bucket->ub_cnt = 0;
 		bucket->ub_entries = ubz->ubz_entries;
 	}
 
 	return (bucket);
 }
 
 /*
  * Return an empty bucket to the bucket zone matching its capacity.
  *
  * For a zone that is not itself a bucket zone, the caller's 'udata' is
  * replaced by the zone's flags before being passed on as the free argument.
  * The bucket must hold no items (asserted).
  */
 static void
 bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata)
 {
 	struct uma_bucket_zone *src;
 
 	KASSERT(bucket->ub_cnt == 0,
 	    ("bucket_free: Freeing a non free bucket."));
 
 	if (!(zone->uz_flags & UMA_ZFLAG_BUCKET))
 		udata = (void *)(uintptr_t)zone->uz_flags;
 
 	src = bucket_zone_lookup(bucket->ub_entries);
 	uma_zfree_arg(src->ubz_zone, bucket, udata);
 }
 
 /* Call zone_drain() on the zone of every entry in bucket_zones. */
 static void
 bucket_zone_drain(void)
 {
 	struct uma_bucket_zone *bz;
 
 	bz = bucket_zones;
 	while (bz->ubz_entries != 0) {
 		zone_drain(bz->ubz_zone);
 		bz++;
 	}
 }
 
 /*
  * Print the zone's warning message, if it has one and zone warnings are
  * enabled.  Output is rate limited per zone through ratecheck() with a
  * 300 second interval.
  */
 static void
 zone_log_warning(uma_zone_t zone)
 {
 	static const struct timeval warninterval = { 300, 0 };
 
 	if (zone_warnings == 0 || zone->uz_warning == NULL)
 		return;
 	if (!ratecheck(&zone->uz_ratecheck, &warninterval))
 		return;
 
 	printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning);
 }
 
 /*
  * Queue the zone's uz_maxaction task on taskqueue_thread, provided a
  * function has been installed for it.
  */
 static inline void
 zone_maxaction(uma_zone_t zone)
 {
 
 	if (zone->uz_maxaction.ta_func == NULL)
 		return;
 
 	taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction);
 }
 
 /* Invoke 'kegfn' on every keg linked to 'zone' through its uz_kegs list. */
 static void
 zone_foreach_keg(uma_zone_t zone, void (*kegfn)(uma_keg_t))
 {
 	uma_klink_t link;
 
 	for (link = LIST_FIRST(&zone->uz_kegs); link != NULL;
 	    link = LIST_NEXT(link, kl_link)) {
 		kegfn(link->kl_keg);
 	}
 }
 
 /*
  * Periodic callout handler: re-evaluates whether buckets may be used, runs
  * zone_timeout() on every zone, and then re-arms itself.
  *
  * Arguments:
  *	unused  Not used.
  *
  * Returns:
  *	Nothing
  */
 static void
 uma_timeout(void *unused)
 {
 
 	bucket_enable();
 	zone_foreach(zone_timeout);
 
 	/* Re-arm the callout to fire again in UMA_TIMEOUT seconds. */
 	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
 }
 
 /*
  * Timeout driven keg maintenance.  For UMA_ZONE_HASH kegs this grows the
  * slab hash table once the number of slabs (uk_pages / uk_ppera) reaches
  * the current hash size.
  *
  * Arguments:
  *	keg  The keg to maintain; must be unlocked on entry.
  *
  * Returns:
  *	Nothing.
  */
 static void
 keg_timeout(uma_keg_t keg)
 {
 
 	KEG_LOCK(keg);
 	/*
 	 * Expand the keg hash table.
 	 *
 	 * This is done if the number of slabs is larger than the hash size.
 	 * What I'm trying to do here is completely reduce collisions.  This
 	 * may be a little aggressive.  Should I allow for two collisions max?
 	 */
 	if (keg->uk_flags & UMA_ZONE_HASH &&
 	    keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) {
 		struct uma_hash newhash;
 		struct uma_hash oldhash;
 		int ret;
 
 		/*
 		 * This is so involved because allocating and freeing
 		 * while the keg lock is held will lead to deadlock.
 		 * I have to do everything in stages and check for
 		 * races.
 		 */
 		newhash = keg->uk_hash;
 		KEG_UNLOCK(keg);
 		ret = hash_alloc(&newhash);
 		KEG_LOCK(keg);
 		if (ret) {
 			/*
 			 * On success the new table replaces the old one and
 			 * the old one is freed; if the expansion is refused,
 			 * the freshly allocated table is freed instead.
 			 */
 			if (hash_expand(&keg->uk_hash, &newhash)) {
 				oldhash = keg->uk_hash;
 				keg->uk_hash = newhash;
 			} else
 				oldhash = newhash;
 
 			KEG_UNLOCK(keg);
 			hash_free(&oldhash);
 			return;
 		}
 	}
 	KEG_UNLOCK(keg);
 }
 
 /* Per-zone timeout work: run keg_timeout() on each keg of the zone. */
 static void
 zone_timeout(uma_zone_t zone)
 {
 
 	zone_foreach_keg(zone, keg_timeout);
 }
 
 /*
  * Allocate and zero the next larger hash table for 'hash'.
  *
  * When uh_hashsize is non-zero the size is doubled and the table comes from
  * malloc(9) with M_NOWAIT; otherwise a table of UMA_HASH_SIZE_INIT entries
  * is taken from hashzone with M_WAITOK.
  *
  * Arguments:
  *	hash  A new hash structure with the old hash size in uh_hashsize
  *
  * Returns:
  *	1 on success and 0 on failure.
  */
 static int
 hash_alloc(struct uma_hash *hash)
 {
 	int nbytes, prevsize;
 
 	prevsize = hash->uh_hashsize;
 
 	if (prevsize == 0) {
 		/* First table: fixed initial size, from the hash zone. */
 		nbytes = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
 		hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
 		    UMA_ANYDOMAIN, M_WAITOK);
 		hash->uh_hashsize = UMA_HASH_SIZE_INIT;
 	} else {
 		/* Growing: go to the next power of two. */
 		hash->uh_hashsize = prevsize * 2;
 		nbytes = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
 		hash->uh_slab_hash = (struct slabhead *)malloc(nbytes,
 		    M_UMAHASH, M_NOWAIT);
 	}
 
 	if (hash->uh_slab_hash == NULL)
 		return (0);
 
 	bzero(hash->uh_slab_hash, nbytes);
 	hash->uh_hashmask = hash->uh_hashsize - 1;
 	return (1);
 }
 
 /*
  * Expands the hash table for HASH zones.  This is done from zone_timeout
  * to reduce collisions.  This must not be done in the regular allocation
  * path, otherwise, we can recurse on the vm while allocating pages.
  *
  * Arguments:
  *	oldhash  The hash you want to expand
  *	newhash  The hash structure for the new table
  *
  * Returns:
  *	1 if every slab was moved from oldhash into newhash, 0 if newhash
  *	has no table or is not larger than oldhash (nothing is moved).
  */
 static int
 hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
 {
 	struct slabhead *chain;
 	uma_slab_t slab;
 	int dst, src;
 
 	if (newhash->uh_slab_hash == NULL ||
 	    oldhash->uh_hashsize >= newhash->uh_hashsize)
 		return (0);
 
 	/*
 	 * Full rehash: empty each old chain from its head and insert every
 	 * slab into the chain selected by the new table's hash.
 	 */
 	for (src = 0; src < oldhash->uh_hashsize; src++) {
 		chain = &oldhash->uh_slab_hash[src];
 		while ((slab = SLIST_FIRST(chain)) != NULL) {
 			SLIST_REMOVE_HEAD(chain, us_hlink);
 			dst = UMA_HASH(newhash, slab->us_data);
 			SLIST_INSERT_HEAD(&newhash->uh_slab_hash[dst],
 			    slab, us_hlink);
 		}
 	}
 
 	return (1);
 }
 
 /*
  * Free a hash table to the backing store it was allocated from: hashzone
  * for a table of UMA_HASH_SIZE_INIT entries, malloc(9) for any other size.
  *
  * Arguments:
  *	hash  The hash whose table (uh_slab_hash) is freed; a NULL table
  *	      is ignored.
  *
  * Returns:
  *	Nothing
  */
 static void
 hash_free(struct uma_hash *hash)
 {
 	struct slabhead *table;
 
 	table = hash->uh_slab_hash;
 	if (table == NULL)
 		return;
 
 	if (hash->uh_hashsize != UMA_HASH_SIZE_INIT)
 		free(table, M_UMAHASH);
 	else
 		zone_free_item(hashzone, table, NULL, SKIP_NONE);
 }
 
 /*
  * Frees all outstanding items in a bucket: runs the zone's fini routine
  * (if any) on each item, hands the items to the zone's release routine
  * and marks the bucket empty.  A NULL bucket is ignored.
  *
  * Arguments:
  *	zone   The zone to free to, must be unlocked.
  *	bucket The free/alloc bucket with items, cpu queue must be locked.
  *
  * Returns:
  *	Nothing
  */
 
 static void
 bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
 {
 	int idx;
 
 	if (bucket == NULL)
 		return;
 
 	if (zone->uz_fini != NULL) {
 		for (idx = 0; idx < bucket->ub_cnt; idx++)
 			zone->uz_fini(bucket->ub_bucket[idx], zone->uz_size);
 	}
 
 	zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt);
 	bucket->ub_cnt = 0;
 }
 
 /*
  * Drains the per cpu caches for a zone, then the zone's cached buckets.
  *
  * NOTE: This may only be called while the zone is being torn down, and not
  * during normal operation.  This is necessary in order that we do not have
  * to migrate CPUs to drain the per-CPU caches.
  *
  * Arguments:
  *	zone     The zone to drain, must be unlocked.
  *
  * Returns:
  *	Nothing
  */
 static void
 cache_drain(uma_zone_t zone)
 {
 	uma_cache_t cache;
 	int cpu;
 
 	/*
 	 * XXX: It is safe to not lock the per-CPU caches, because we're
 	 * tearing down the zone anyway.  I.e., there will be no further use
 	 * of the caches at this point.
 	 *
 	 * XXX: It would be good to be able to assert that the zone is being
 	 * torn down to prevent improper use of cache_drain().
 	 *
 	 * XXX: We lock the zone before passing into bucket_cache_drain() as
 	 * it is used elsewhere.  Should the tear-down path be made special
 	 * there in some form?
 	 */
 	CPU_FOREACH(cpu) {
 		cache = &zone->uz_cpu[cpu];
 		/* Empty both buckets first; bucket_free() requires ub_cnt == 0. */
 		bucket_drain(zone, cache->uc_allocbucket);
 		bucket_drain(zone, cache->uc_freebucket);
 		if (cache->uc_allocbucket != NULL)
 			bucket_free(zone, cache->uc_allocbucket, NULL);
 		if (cache->uc_freebucket != NULL)
 			bucket_free(zone, cache->uc_freebucket, NULL);
 		cache->uc_allocbucket = cache->uc_freebucket = NULL;
 	}
 	ZONE_LOCK(zone);
 	bucket_cache_drain(zone);
 	ZONE_UNLOCK(zone);
 }
 
 /*
  * Move the zone's bucket size target (uz_count) halfway towards its
  * minimum (uz_count_min), under the zone lock.  Zones flagged
  * UMA_ZFLAG_INTERNAL are left alone.
  */
 static void
 cache_shrink(uma_zone_t zone)
 {
 
 	if ((zone->uz_flags & UMA_ZFLAG_INTERNAL) != 0)
 		return;
 
 	ZONE_LOCK(zone);
 	zone->uz_count = (zone->uz_count + zone->uz_count_min) / 2;
 	ZONE_UNLOCK(zone);
 }
 
 /*
  * Detach the alloc and free buckets from the current CPU's cache of 'zone'.
  *
  * A bucket that still holds items is put on the zone's per-domain bucket
  * list; an empty bucket is freed after the critical section and the zone
  * lock have been released.  Zones flagged UMA_ZFLAG_INTERNAL are skipped.
  *
  * The cache is selected with curcpu, so the caller decides which CPU is
  * drained (cache_drain_safe() binds the thread to each CPU in turn).
  */
 static void
 cache_drain_safe_cpu(uma_zone_t zone)
 {
 	uma_cache_t cache;
 	uma_bucket_t b1, b2;
 	int domain;
 
 	if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
 		return;
 
 	b1 = b2 = NULL;
 	ZONE_LOCK(zone);
 	critical_enter();
 	/* NUMA zones use this CPU's domain; all other zones use domain 0. */
 	if (zone->uz_flags & UMA_ZONE_NUMA)
 		domain = PCPU_GET(domain);
 	else
 		domain = 0;
 	cache = &zone->uz_cpu[curcpu];
 	if (cache->uc_allocbucket) {
 		if (cache->uc_allocbucket->ub_cnt != 0)
 			LIST_INSERT_HEAD(&zone->uz_domain[domain].uzd_buckets,
 			    cache->uc_allocbucket, ub_link);
 		else
 			b1 = cache->uc_allocbucket;
 		cache->uc_allocbucket = NULL;
 	}
 	if (cache->uc_freebucket) {
 		if (cache->uc_freebucket->ub_cnt != 0)
 			LIST_INSERT_HEAD(&zone->uz_domain[domain].uzd_buckets,
 			    cache->uc_freebucket, ub_link);
 		else
 			b2 = cache->uc_freebucket;
 		cache->uc_freebucket = NULL;
 	}
 	critical_exit();
 	ZONE_UNLOCK(zone);
 	/* Empty buckets are released only after the locks are dropped. */
 	if (b1)
 		bucket_free(zone, b1, NULL);
 	if (b2)
 		bucket_free(zone, b2, NULL);
 }
 
 /*
  * Safely flush the per-CPU caches of one zone, or of every zone when
  * 'zone' is NULL, into the zone bucket lists.
  * This is an expensive call because it binds the current thread to each
  * CPU in turn and enters a critical section there in order to safely
  * access that CPU's cache buckets.
  * The zone lock must not be held when calling this function.
  */
 static void
 cache_drain_safe(uma_zone_t zone)
 {
 	int cpu;
 
 	/*
 	 * Polite shrinking of the bucket sizes was not enough, so shrink
 	 * them aggressively.
 	 */
 	if (zone == NULL)
 		zone_foreach(cache_shrink);
 	else
 		cache_shrink(zone);
 
 	CPU_FOREACH(cpu) {
 		/* Move onto the target CPU before touching its cache. */
 		thread_lock(curthread);
 		sched_bind(curthread, cpu);
 		thread_unlock(curthread);
 
 		if (zone == NULL)
 			zone_foreach(cache_drain_safe_cpu);
 		else
 			cache_drain_safe_cpu(zone);
 	}
 
 	/* Undo the CPU binding established by the loop above. */
 	thread_lock(curthread);
 	sched_unbind(curthread);
 	thread_unlock(curthread);
 }
 
 /*
  * Drain the cached buckets from a zone.  Expects a locked zone on entry.
  *
  * The zone lock is dropped while each bucket is drained and freed and is
  * held again on return.
  */
 static void
 bucket_cache_drain(uma_zone_t zone)
 {
 	uma_zone_domain_t zdom;
 	uma_bucket_t bucket;
 	int i;
 
 	/*
 	 * Drain the bucket queues and free the buckets.
 	 */
 	for (i = 0; i < vm_ndomains; i++) {
 		zdom = &zone->uz_domain[i];
 		while ((bucket = LIST_FIRST(&zdom->uzd_buckets)) != NULL) {
 			/* Unlink under the lock, do the work without it. */
 			LIST_REMOVE(bucket, ub_link);
 			ZONE_UNLOCK(zone);
 			bucket_drain(zone, bucket);
 			bucket_free(zone, bucket, NULL);
 			ZONE_LOCK(zone);
 		}
 	}
 
 	/*
 	 * Shrink further bucket sizes.  Price of single zone lock collision
 	 * is probably lower than price of global cache drain.
 	 */
 	if (zone->uz_count > zone->uz_count_min)
 		zone->uz_count--;
 }
 
 /*
  * Release one slab of a keg.
  *
  * Runs the keg's fini routine, if any, on items start-1 down to 0, frees
  * an off-page slab header back to the slab zone, hands the slab memory to
  * the keg's free routine and decrements the UMA total by the slab size.
  *
  * Arguments:
  *	keg    The keg the slab belongs to
  *	slab   The slab to release
  *	start  Number of leading items that need fini
  */
 static void
 keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start)
 {
 	uint8_t *mem;
 	int i;
 	uint8_t flags;
 
 	CTR4(KTR_UMA, "keg_free_slab keg %s(%p) slab %p, returning %d bytes",
 	    keg->uk_name, keg, slab, PAGE_SIZE * keg->uk_ppera);
 
 	/*
 	 * Save what is needed from the header now: for OFFPAGE kegs the
 	 * header is freed below before uk_freef() is called.
 	 */
 	mem = slab->us_data;
 	flags = slab->us_flags;
 	i = start;
 	if (keg->uk_fini != NULL) {
 		for (i--; i > -1; i--)
 #ifdef INVARIANTS
 		/*
 		 * trash_fini implies that dtor was trash_dtor. trash_fini
 		 * would check that memory hasn't been modified since free,
 		 * which executed trash_dtor.
 		 * That's why we need to run uma_dbg_kskip() check here,
 		 * albeit we don't make skip check for other init/fini
 		 * invocations.
 		 */
 		if (!uma_dbg_kskip(keg, slab->us_data + (keg->uk_rsize * i)) ||
 		    keg->uk_fini != trash_fini)
 #endif
 			keg->uk_fini(slab->us_data + (keg->uk_rsize * i),
 			    keg->uk_size);
 	}
 	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
 		zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
 	keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera, flags);
 	uma_total_dec(PAGE_SIZE * keg->uk_ppera);
 }
 
 /*
  * Frees pages from a keg back to the system.  This is done on demand from
  * the pageout daemon.
  *
  * Completely free slabs are unlinked and accounted for under the keg lock
  * and collected on a private list; the memory itself is released only
  * after the lock has been dropped.
  *
  * Returns nothing.
  */
 static void
 keg_drain(uma_keg_t keg)
 {
 	struct slabhead freeslabs = { 0 };
 	uma_domain_t dom;
 	uma_slab_t slab, tmp;
 	int i;
 
 	/*
 	 * We don't want to take pages from statically allocated kegs at this
 	 * time
 	 */
 	if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL)
 		return;
 
 	CTR3(KTR_UMA, "keg_drain %s(%p) free items: %u",
 	    keg->uk_name, keg, keg->uk_free);
 	KEG_LOCK(keg);
 	if (keg->uk_free == 0)
 		goto finished;
 
 	for (i = 0; i < vm_ndomains; i++) {
 		dom = &keg->uk_domain[i];
 		LIST_FOREACH_SAFE(slab, &dom->ud_free_slab, us_link, tmp) {
 			/* We have nowhere to free these to. */
 			if (slab->us_flags & UMA_SLAB_BOOT)
 				continue;
 
 			LIST_REMOVE(slab, us_link);
 			keg->uk_pages -= keg->uk_ppera;
 			keg->uk_free -= keg->uk_ipers;
 
 			if (keg->uk_flags & UMA_ZONE_HASH)
 				UMA_HASH_REMOVE(&keg->uk_hash, slab,
 				    slab->us_data);
 
 			/* Queue for release once the keg lock is dropped. */
 			SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
 		}
 	}
 
 finished:
 	KEG_UNLOCK(keg);
 
 	/* Every item of a free slab needs fini, hence start == uk_ipers. */
 	while ((slab = SLIST_FIRST(&freeslabs)) != NULL) {
 		SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink);
 		keg_free_slab(keg, slab, keg->uk_ipers);
 	}
 }
 
 /*
  * Drain a zone's bucket cache and the free slabs of its kegs.
  *
  * Arguments:
  *	zone    The zone to drain
  *	waitok  M_NOWAIT: return at once if another drain is in progress;
  *		otherwise sleep until that drain has finished
  */
 static void
 zone_drain_wait(uma_zone_t zone, int waitok)
 {
 
 	/*
 	 * Set draining to interlock with zone_dtor() so we can release our
 	 * locks as we go.  Only dtor() should do a WAITOK call since it
 	 * is the only call that knows the structure will still be available
 	 * when it wakes up.
 	 */
 	ZONE_LOCK(zone);
 	while (zone->uz_flags & UMA_ZFLAG_DRAINING) {
 		if (waitok == M_NOWAIT)
 			goto out;
 		/* Sleep with a one tick timeout, then re-check the flag. */
 		msleep(zone, zone->uz_lockptr, PVM, "zonedrain", 1);
 	}
 	zone->uz_flags |= UMA_ZFLAG_DRAINING;
 	bucket_cache_drain(zone);
 	ZONE_UNLOCK(zone);
 	/*
 	 * The DRAINING flag protects us from being freed while
 	 * we're running.  Normally the uma_rwlock would protect us but we
 	 * must be able to release and acquire the right lock for each keg.
 	 */
 	zone_foreach_keg(zone, &keg_drain);
 	ZONE_LOCK(zone);
 	zone->uz_flags &= ~UMA_ZFLAG_DRAINING;
 	/* Wake any thread sleeping in the loop above. */
 	wakeup(zone);
 out:
 	ZONE_UNLOCK(zone);
 }
 
 /*
  * Drain a zone without sleeping: this is zone_drain_wait() with M_NOWAIT,
  * which returns immediately if the zone is already being drained.
  */
 void
 zone_drain(uma_zone_t zone)
 {
 
 	zone_drain_wait(zone, M_NOWAIT);
 }
 
 /*
  * Allocate a new slab for a keg.  This does not insert the slab onto a list.
  *
  * The keg lock must be held on entry; it is dropped while memory is
  * allocated and the items are initialized, and is held again on return.
  *
  * Arguments:
  *	keg     The keg to allocate the slab for
  *	zone    Passed through to the keg's allocation routine
  *	domain  The domain to allocate from, must be a valid domain index
  *	wait  Shall we wait?
  *
  * Returns:
  *	The slab that was allocated or NULL if there is no memory and the
  *	caller specified M_NOWAIT.  NULL is also returned when the keg's
  *	init routine fails for an item.
  */
 static uma_slab_t
 keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int wait)
 {
 	uma_alloc allocf;
 	uma_slab_t slab;
 	unsigned long size;
 	uint8_t *mem;
 	uint8_t flags;
 	int i;
 
 	KASSERT(domain >= 0 && domain < vm_ndomains,
 	    ("keg_alloc_slab: domain %d out of range", domain));
 	mtx_assert(&keg->uk_lock, MA_OWNED);
 	slab = NULL;
 	mem = NULL;
 
 	/* Read the allocator while still locked, then drop the lock. */
 	allocf = keg->uk_allocf;
 	KEG_UNLOCK(keg);
 	size = keg->uk_ppera * PAGE_SIZE;
 
 	/* OFFPAGE kegs take the slab header from the slab zone. */
 	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
 		slab = zone_alloc_item(keg->uk_slabzone, NULL, domain, wait);
 		if (slab == NULL)
 			goto out;
 	}
 
 	/*
 	 * This reproduces the old vm_zone behavior of zero filling pages the
 	 * first time they are added to a zone.
 	 *
 	 * Malloced items are zeroed in uma_zalloc.
 	 */
 
 	if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
 		wait |= M_ZERO;
 	else
 		wait &= ~M_ZERO;
 
 	if (keg->uk_flags & UMA_ZONE_NODUMP)
 		wait |= M_NODUMP;
 
 	/* zone is passed for legacy reasons. */
 	mem = allocf(zone, size, domain, &flags, wait);
 	if (mem == NULL) {
 		/* Give back the off-page header allocated above. */
 		if (keg->uk_flags & UMA_ZONE_OFFPAGE)
 			zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
 		slab = NULL;
 		goto out;
 	}
 	uma_total_inc(size);
 
 	/* Point the slab into the allocated memory */
 	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE))
 		slab = (uma_slab_t )(mem + keg->uk_pgoff);
 
 	/* Record the slab for every page of the allocation. */
 	if (keg->uk_flags & UMA_ZONE_VTOSLAB)
 		for (i = 0; i < keg->uk_ppera; i++)
 			vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab);
 
 	slab->us_keg = keg;
 	slab->us_data = mem;
 	slab->us_freecount = keg->uk_ipers;
 	slab->us_flags = flags;
 	slab->us_domain = domain;
 	BIT_FILL(SLAB_SETSIZE, &slab->us_free);
 #ifdef INVARIANTS
 	BIT_ZERO(SLAB_SETSIZE, &slab->us_debugfree);
 #endif
 
 	if (keg->uk_init != NULL) {
 		for (i = 0; i < keg->uk_ipers; i++)
 			if (keg->uk_init(slab->us_data + (keg->uk_rsize * i),
 			    keg->uk_size, wait) != 0)
 				break;
 		/*
 		 * An init call failed: release the slab, running fini only
 		 * on the i items that were initialized successfully.
 		 */
 		if (i != keg->uk_ipers) {
 			keg_free_slab(keg, slab, i);
 			slab = NULL;
 			goto out;
 		}
 	}
 out:
 	KEG_LOCK(keg);
 
 	CTR3(KTR_UMA, "keg_alloc_slab: allocated slab %p for %s(%p)",
 	    slab, keg->uk_name, keg);
 
 	/* Hash insertion and accounting are done with the lock held. */
 	if (slab != NULL) {
 		if (keg->uk_flags & UMA_ZONE_HASH)
 			UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
 
 		keg->uk_pages += keg->uk_ppera;
 		keg->uk_free += keg->uk_ipers;
 	}
 
 	return (slab);
 }
 
 /*
  * This function is intended to be used early on in place of page_alloc() so
  * that we may use the boot time page cache to satisfy allocations before
  * the VM is ready.
  *
  * Once the boot stage allows it, the keg's allocation routine is replaced
  * with the real allocator and the request is forwarded to it.
  */
 static void *
 startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
     int wait)
 {
 	uma_keg_t keg;
 	void *first;
 	int npages;
 
 	keg = zone_first_keg(zone);
 
 	/*
 	 * If we are in BOOT_BUCKETS or higher, then switch to the real
 	 * allocator.  Zones with page sized slabs switch at BOOT_PAGEALLOC.
 	 */
 	switch (booted) {
 	case BOOT_COLD:
 	case BOOT_STRAPPED:
 		break;
 	case BOOT_PAGEALLOC:
 		if (keg->uk_ppera > 1)
 			break;
 		/* FALLTHROUGH */
 	case BOOT_BUCKETS:
 	case BOOT_RUNNING:
 #ifdef UMA_MD_SMALL_ALLOC
 		if (keg->uk_ppera > 1)
 			keg->uk_allocf = page_alloc;
 		else
 			keg->uk_allocf = uma_small_alloc;
 #else
 		keg->uk_allocf = page_alloc;
 #endif
 		return (keg->uk_allocf(zone, bytes, domain, pflag, wait));
 	}
 
 	/*
 	 * Still bootstrapping: carve the request out of the small startup
 	 * page cache, if it has enough pages remaining.
 	 */
 	npages = howmany(bytes, PAGE_SIZE);
 	KASSERT(npages > 0, ("%s can't reserve 0 pages", __func__));
 	if (npages > boot_pages)
 		panic("UMA zone \"%s\": Increase vm.boot_pages", zone->uz_name);
 #ifdef DIAGNOSTIC
 	printf("%s from \"%s\", %d boot pages left\n", __func__, zone->uz_name,
 	    boot_pages);
 #endif
 	first = bootmem;
 	bootmem += npages * PAGE_SIZE;
 	boot_pages -= npages;
 	*pflag = UMA_SLAB_BOOT;
 
 	return (first);
 }
 
 /*
  * Allocates a number of pages from the system
  *
  * Arguments:
  *	zone   Unused here
  *	bytes  The number of bytes requested
  *	domain The domain passed on to kmem_malloc_domain()
  *	pflag  Set to UMA_SLAB_KERNEL
  *	wait  Shall we wait?
  *
  * Returns:
  *	A pointer to the alloced memory or possibly
  *	NULL if M_NOWAIT is set.
  */
 static void *
 page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
     int wait)
 {
 	void *p;	/* Returned page */
 
 	*pflag = UMA_SLAB_KERNEL;
-	p = (void *) kmem_malloc_domain(domain, bytes, wait);
+	p = (void *) kmem_malloc_domain(kernel_arena, domain, bytes, wait);
 
 	return (p);
 }
 
 /*
  * Allocates a number of pages from within an object
  *
  * The pages are allocated one at a time without an object and are then
  * mapped at consecutive addresses taken from the keg's reserved KVA
  * (uk_kva + uk_offset).
  *
  * Arguments:
  *	zone   The zone whose first keg supplies the KVA
  *	bytes  The number of bytes requested
  *	domain The domain to allocate the pages from
  *	flags  Set to UMA_SLAB_PRIV on success
  *	wait   Shall we wait?
  *
  * Returns:
  *	A pointer to the alloced memory or possibly
  *	NULL if M_NOWAIT is set.
  */
 static void *
 noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
     int wait)
 {
 	TAILQ_HEAD(, vm_page) alloctail;
 	u_long npages;
 	vm_offset_t retkva, zkva;
 	vm_page_t p, p_next;
 	uma_keg_t keg;
 
 	TAILQ_INIT(&alloctail);
 	keg = zone_first_keg(zone);
 
 	npages = howmany(bytes, PAGE_SIZE);
 	while (npages > 0) {
 		p = vm_page_alloc_domain(NULL, 0, domain, VM_ALLOC_INTERRUPT |
 		    VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
 		    ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK :
 		    VM_ALLOC_NOWAIT));
 		if (p != NULL) {
 			/*
 			 * Since the page does not belong to an object, its
 			 * listq is unused.
 			 */
 			TAILQ_INSERT_TAIL(&alloctail, p, listq);
 			npages--;
 			continue;
 		}
 		/*
 		 * Page allocation failed, free intermediate pages and
 		 * exit.
 		 */
 		TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
 			vm_page_unwire(p, PQ_NONE);
 			vm_page_free(p); 
 		}
 		return (NULL);
 	}
 	*flags = UMA_SLAB_PRIV;
 	/* Atomically claim the next round_page(bytes) of the keg's KVA. */
 	zkva = keg->uk_kva +
 	    atomic_fetchadd_long(&keg->uk_offset, round_page(bytes));
 	retkva = zkva;
 	/* Map the collected pages one after another into that range. */
 	TAILQ_FOREACH(p, &alloctail, listq) {
 		pmap_qenter(zkva, &p, 1);
 		zkva += PAGE_SIZE;
 	}
 
 	return ((void *)retkva);
 }
 
 /*
  * Returns a range of pages to the system.
  *
  * Arguments:
  *	mem   Start of the memory being freed
  *	size  Number of bytes being freed
  *	flags The us_flags recorded for the slab; the call panics unless
  *	      UMA_SLAB_KERNEL is set
  *
  * Returns:
  *	Nothing
  */
 static void
 page_free(void *mem, vm_size_t size, uint8_t flags)
 {
 
 	if ((flags & UMA_SLAB_KERNEL) == 0)
 		panic("UMA: page_free used with invalid flags %x", flags);
 
 	kmem_free(kernel_arena, (vm_offset_t)mem, size);
 }
 
 /*
  * Item initializer that clears the item's memory.
  *
  * Arguments/Returns follow uma_init specifications; 'flags' is not used
  * and the result is always 0.
  */
 static int
 zero_init(void *mem, int size, int flags)
 {
 
 	memset(mem, 0, size);
 	return (0);
 }
 
 /*
  * Finish creating a small uma keg.  This calculates ipers, and the keg size.
  *
  * Sets uk_ppera, uk_rsize and uk_ipers, and may turn on UMA_ZONE_OFFPAGE
  * and UMA_ZONE_HASH in uk_flags.
  *
  * Arguments
  *	keg  The keg we should initialize
  *
  * Returns
  *	Nothing
  */
 static void
 keg_small_init(uma_keg_t keg)
 {
 	u_int rsize;
 	u_int memused;
 	u_int wastedspace;
 	u_int shsize;
 	u_int slabsize;
 
 	/*
 	 * Per-CPU kegs use one struct pcpu sized region per CPU; all other
 	 * kegs use a single UMA_SLAB_SIZE slab.
 	 */
 	if (keg->uk_flags & UMA_ZONE_PCPU) {
 		u_int ncpus = (mp_maxid + 1) ? (mp_maxid + 1) : MAXCPU;
 
 		slabsize = sizeof(struct pcpu);
 		keg->uk_ppera = howmany(ncpus * sizeof(struct pcpu),
 		    PAGE_SIZE);
 	} else {
 		slabsize = UMA_SLAB_SIZE;
 		keg->uk_ppera = 1;
 	}
 
 	/*
 	 * Calculate the size of each allocation (rsize) according to
 	 * alignment.  If the requested size is smaller than we have
 	 * allocation bits for we round it up.
 	 */
 	rsize = keg->uk_size;
 	if (rsize < slabsize / SLAB_SETSIZE)
 		rsize = slabsize / SLAB_SETSIZE;
 	/* uk_align is a mask (alignment - 1); round rsize up to it. */
 	if (rsize & keg->uk_align)
 		rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1);
 	keg->uk_rsize = rsize;
 
 	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 ||
 	    keg->uk_rsize < sizeof(struct pcpu),
 	    ("%s: size %u too large", __func__, keg->uk_rsize));
 
 	/* shsize is the room taken by an in-page slab header, if any. */
 	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
 		shsize = 0;
 	else 
 		shsize = sizeof(struct uma_slab);
 
 	if (rsize <= slabsize - shsize)
 		keg->uk_ipers = (slabsize - shsize) / rsize;
 	else {
 		/* Handle special case when we have 1 item per slab, so
 		 * alignment requirement can be relaxed. */
 		KASSERT(keg->uk_size <= slabsize - shsize,
 		    ("%s: size %u greater than slab", __func__, keg->uk_size));
 		keg->uk_ipers = 1;
 	}
 	KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
 	    ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
 
 	memused = keg->uk_ipers * rsize + shsize;
 	wastedspace = slabsize - memused;
 
 	/*
 	 * We can't do OFFPAGE if we're internal or if we've been
 	 * asked to not go to the VM for buckets.  If we do this we
 	 * may end up going to the VM  for slabs which we do not
 	 * want to do if we're UMA_ZFLAG_CACHEONLY as a result
 	 * of UMA_ZONE_VM, which clearly forbids it.
 	 */
 	if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) ||
 	    (keg->uk_flags & UMA_ZFLAG_CACHEONLY))
 		return;
 
 	/*
 	 * See if using an OFFPAGE slab will limit our waste.  Only do
 	 * this if it permits more items per-slab.
 	 *
 	 * XXX We could try growing slabsize to limit max waste as well.
 	 * Historically this was not done because the VM could not
 	 * efficiently handle contiguous allocations.
 	 */
 	if ((wastedspace >= slabsize / UMA_MAX_WASTE) &&
 	    (keg->uk_ipers < (slabsize / keg->uk_rsize))) {
 		keg->uk_ipers = slabsize / keg->uk_rsize;
 		KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
 		    ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
 		CTR6(KTR_UMA, "UMA decided we need offpage slab headers for "
 		    "keg: %s(%p), calculated wastedspace = %d, "
 		    "maximum wasted space allowed = %d, "
 		    "calculated ipers = %d, "
 		    "new wasted space = %d\n", keg->uk_name, keg, wastedspace,
 		    slabsize / UMA_MAX_WASTE, keg->uk_ipers,
 		    slabsize - keg->uk_ipers * keg->uk_rsize);
 		keg->uk_flags |= UMA_ZONE_OFFPAGE;
 	}
 
 	/* OFFPAGE kegs without VTOSLAB find their slabs through the hash. */
 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
 	    (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
 		keg->uk_flags |= UMA_ZONE_HASH;
 }
 
 /*
  * Finish creating a large (> UMA_SLAB_SIZE) uma keg: one item per slab,
  * with the slab spanning as many pages as the item needs.  The slab
  * header stays in-page only when there is spare room behind the item;
  * otherwise the keg goes OFFPAGE, or, for internal kegs, grows by a page.
  * When more dynamic slab sizes are possible this will be more complicated.
  *
  * Arguments
  *	keg  The keg we should initialize
  *
  * Returns
  *	Nothing
  */
 static void
 keg_large_init(uma_keg_t keg)
 {
 	u_int hdrsize;
 
 	KASSERT(keg != NULL, ("Keg is null in keg_large_init"));
 	KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0,
 	    ("keg_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY keg"));
 	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
 	    ("%s: Cannot large-init a UMA_ZONE_PCPU keg", __func__));
 
 	keg->uk_rsize = keg->uk_size;
 	keg->uk_ipers = 1;
 	keg->uk_ppera = howmany(keg->uk_size, PAGE_SIZE);
 
 	/* Check whether we have enough space to not do OFFPAGE. */
 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0) {
 		/* Header size, rounded up to pointer alignment. */
 		hdrsize = sizeof(struct uma_slab);
 		if ((hdrsize & UMA_ALIGN_PTR) != 0)
 			hdrsize = (hdrsize & ~UMA_ALIGN_PTR) +
 			    (UMA_ALIGN_PTR + 1);
 
 		if (PAGE_SIZE * keg->uk_ppera - keg->uk_rsize < hdrsize) {
 			/*
 			 * We can't do OFFPAGE if we're internal, in which case
 			 * we need an extra page per allocation to contain the
 			 * slab header.
 			 */
 			if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) != 0)
 				keg->uk_ppera++;
 			else
 				keg->uk_flags |= UMA_ZONE_OFFPAGE;
 		}
 	}
 
 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) != 0 &&
 	    (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
 		keg->uk_flags |= UMA_ZONE_HASH;
 }
 
 /*
  * Finish creating a UMA_ZONE_CACHESPREAD keg.  Computes the item stride,
  * the pages per slab and the items per slab, and marks the keg
  * OFFPAGE | VTOSLAB.
  */
 static void
 keg_cachespread_init(uma_keg_t keg)
 {
 	int align_bytes, item_size, npages, pad;
 
 	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
 	    ("%s: Cannot cachespread-init a UMA_ZONE_PCPU keg", __func__));
 
 	item_size = keg->uk_size;
 	align_bytes = keg->uk_align + 1;
 	/*
 	 * We want one item to start on every align boundary in a page.  To
 	 * do this we will span pages.  We will also extend the item by the
 	 * size of align if it is an even multiple of align.  Otherwise, it
 	 * would fall on the same boundary every time.
 	 */
 	if ((item_size & keg->uk_align) != 0)
 		item_size = (item_size & ~keg->uk_align) + align_bytes;
 	if ((item_size & align_bytes) == 0)
 		item_size += align_bytes;
 	pad = item_size - keg->uk_size;
 
 	/* Enough pages for one item per boundary, capped at 128KB. */
 	npages = (item_size * (PAGE_SIZE / align_bytes)) / PAGE_SIZE;
 	npages = MIN(npages, (128 * 1024) / PAGE_SIZE);
 
 	keg->uk_rsize = item_size;
 	keg->uk_ppera = npages;
 	keg->uk_ipers = ((npages * PAGE_SIZE) + pad) / item_size;
 	keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB;
 	KASSERT(keg->uk_ipers <= SLAB_SETSIZE,
 	    ("%s: keg->uk_ipers too high(%d) increase max_ipers", __func__,
 	    keg->uk_ipers));
 }
 
 /*
  * Keg header ctor.  This initializes all fields, locks, etc.  And inserts
  * the keg onto the global keg list.
  *
  * Arguments/Returns follow uma_ctor specifications
  *	udata  Actually uma_kctor_args
  *
  * Always returns 0.
  */
 static int
 keg_ctor(void *mem, int size, void *udata, int flags)
 {
 	struct uma_kctor_args *arg = udata;
 	uma_keg_t keg = mem;
 	uma_zone_t zone;
 
 	bzero(keg, size);
 	keg->uk_size = arg->size;
 	keg->uk_init = arg->uminit;
 	keg->uk_fini = arg->fini;
 	keg->uk_align = arg->align;
 	keg->uk_cursor = 0;
 	keg->uk_free = 0;
 	keg->uk_reserve = 0;
 	keg->uk_pages = 0;
 	keg->uk_flags = arg->flags;
 	keg->uk_slabzone = NULL;
 
 	/*
 	 * The master zone is passed to us at keg-creation time.
 	 */
 	zone = arg->zone;
 	keg->uk_name = zone->uz_name;
 
 	/* Translate creation flags into keg flags and settings. */
 	if (arg->flags & UMA_ZONE_VM)
 		keg->uk_flags |= UMA_ZFLAG_CACHEONLY;
 
 	if (arg->flags & UMA_ZONE_ZINIT)
 		keg->uk_init = zero_init;
 
 	if (arg->flags & UMA_ZONE_MALLOC)
 		keg->uk_flags |= UMA_ZONE_VTOSLAB;
 
 	/* Without SMP the PCPU request is dropped from the keg flags. */
 	if (arg->flags & UMA_ZONE_PCPU)
 #ifdef SMP
 		keg->uk_flags |= UMA_ZONE_OFFPAGE;
 #else
 		keg->uk_flags &= ~UMA_ZONE_PCPU;
 #endif
 
 	/* Compute the slab layout (uk_rsize, uk_ppera, uk_ipers). */
 	if (keg->uk_flags & UMA_ZONE_CACHESPREAD) {
 		keg_cachespread_init(keg);
 	} else {
 		if (keg->uk_size > UMA_SLAB_SPACE)
 			keg_large_init(keg);
 		else
 			keg_small_init(keg);
 	}
 
 	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
 		keg->uk_slabzone = slabzone;
 
 	/*
 	 * If we haven't booted yet we need allocations to go through the
 	 * startup cache until the vm is ready.
 	 */
 	if (booted < BOOT_PAGEALLOC)
 		keg->uk_allocf = startup_alloc;
 #ifdef UMA_MD_SMALL_ALLOC
 	else if (keg->uk_ppera == 1)
 		keg->uk_allocf = uma_small_alloc;
 #endif
 	else
 		keg->uk_allocf = page_alloc;
 #ifdef UMA_MD_SMALL_ALLOC
 	if (keg->uk_ppera == 1)
 		keg->uk_freef = uma_small_free;
 	else
 #endif
 		keg->uk_freef = page_free;
 
 	/*
 	 * Initialize keg's lock
 	 */
 	KEG_LOCK_INIT(keg, (arg->flags & UMA_ZONE_MTXCLASS));
 
 	/*
 	 * If we're putting the slab header in the actual page we need to
 	 * figure out where in each page it goes.  This calculates a right
 	 * justified offset into the memory on an ALIGN_PTR boundary.
 	 */
 	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) {
 		u_int totsize;
 
 		/* Size of the slab struct and free list */
 		totsize = sizeof(struct uma_slab);
 
 		if (totsize & UMA_ALIGN_PTR)
 			totsize = (totsize & ~UMA_ALIGN_PTR) +
 			    (UMA_ALIGN_PTR + 1);
 		keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - totsize;
 
 		/*
 		 * The only way the following is possible is if with our
 		 * UMA_ALIGN_PTR adjustments we are now bigger than
 		 * UMA_SLAB_SIZE.  I haven't checked whether this is
 		 * mathematically possible for all cases, so we make
 		 * sure here anyway.
 		 */
 		totsize = keg->uk_pgoff + sizeof(struct uma_slab);
 		if (totsize > PAGE_SIZE * keg->uk_ppera) {
 			printf("zone %s ipers %d rsize %d size %d\n",
 			    zone->uz_name, keg->uk_ipers, keg->uk_rsize,
 			    keg->uk_size);
 			panic("UMA slab won't fit.");
 		}
 	}
 
 	if (keg->uk_flags & UMA_ZONE_HASH)
 		hash_alloc(&keg->uk_hash);
 
 	CTR5(KTR_UMA, "keg_ctor %p zone %s(%p) out %d free %d\n",
 	    keg, zone->uz_name, zone,
 	    (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free,
 	    keg->uk_free);
 
 	/* The master zone is the first entry on the keg's zone list. */
 	LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
 
 	rw_wlock(&uma_rwlock);
 	LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
 	rw_wunlock(&uma_rwlock);
 	return (0);
 }
 
 /*
  * Zone header ctor.  This initializes all fields, locks, etc.
  *
  * Arguments/Returns follow uma_ctor specifications
  *	udata  Actually uma_zctor_args
  */
 static int
 zone_ctor(void *mem, int size, void *udata, int flags)
 {
 	struct uma_zctor_args *arg = udata;
 	uma_zone_t zone = mem;
 	uma_zone_t z;
 	uma_keg_t keg;
 
 	bzero(zone, size);
 	zone->uz_name = arg->name;
 	zone->uz_ctor = arg->ctor;
 	zone->uz_dtor = arg->dtor;
 	zone->uz_slab = zone_fetch_slab;
 	zone->uz_init = NULL;
 	zone->uz_fini = NULL;
 	zone->uz_allocs = 0;
 	zone->uz_frees = 0;
 	zone->uz_fails = 0;
 	zone->uz_sleeps = 0;
 	zone->uz_count = 0;
 	zone->uz_count_min = 0;
 	zone->uz_flags = 0;
 	zone->uz_warning = NULL;
 	/* The domain structures follow the cpu structures. */
 	zone->uz_domain = (struct uma_zone_domain *)&zone->uz_cpu[mp_ncpus];
 	timevalclear(&zone->uz_ratecheck);
 	keg = arg->keg;
 
 	ZONE_LOCK_INIT(zone, (arg->flags & UMA_ZONE_MTXCLASS));
 
 	/*
 	 * This is a pure cache zone, no kegs.
 	 */
 	if (arg->import) {
 		if (arg->flags & UMA_ZONE_VM)
 			arg->flags |= UMA_ZFLAG_CACHEONLY;
 		zone->uz_flags = arg->flags;
 		zone->uz_size = arg->size;
 		zone->uz_import = arg->import;
 		zone->uz_release = arg->release;
 		zone->uz_arg = arg->arg;
 		zone->uz_lockptr = &zone->uz_lock;
 		rw_wlock(&uma_rwlock);
 		LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link);
 		rw_wunlock(&uma_rwlock);
 		goto out;
 	}
 
 	/*
 	 * Use the regular zone/keg/slab allocator.
 	 */
 	zone->uz_import = (uma_import)zone_import;
 	zone->uz_release = (uma_release)zone_release;
 	zone->uz_arg = zone; 
 
 	if (arg->flags & UMA_ZONE_SECONDARY) {
 		KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
 		zone->uz_init = arg->uminit;
 		zone->uz_fini = arg->fini;
 		zone->uz_lockptr = &keg->uk_lock;
 		zone->uz_flags |= UMA_ZONE_SECONDARY;
 		rw_wlock(&uma_rwlock);
 		ZONE_LOCK(zone);
 		LIST_FOREACH(z, &keg->uk_zones, uz_link) {
 			if (LIST_NEXT(z, uz_link) == NULL) {
 				LIST_INSERT_AFTER(z, zone, uz_link);
 				break;
 			}
 		}
 		ZONE_UNLOCK(zone);
 		rw_wunlock(&uma_rwlock);
 	} else if (keg == NULL) {
 		if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
 		    arg->align, arg->flags)) == NULL)
 			return (ENOMEM);
 	} else {
 		struct uma_kctor_args karg;
 		int error;
 
 		/* We should only be here from uma_startup() */
 		karg.size = arg->size;
 		karg.uminit = arg->uminit;
 		karg.fini = arg->fini;
 		karg.align = arg->align;
 		karg.flags = arg->flags;
 		karg.zone = zone;
 		error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg,
 		    flags);
 		if (error)
 			return (error);
 	}
 
 	/*
 	 * Link in the first keg.
 	 */
 	zone->uz_klink.kl_keg = keg;
 	LIST_INSERT_HEAD(&zone->uz_kegs, &zone->uz_klink, kl_link);
 	zone->uz_lockptr = &keg->uk_lock;
 	zone->uz_size = keg->uk_size;
 	zone->uz_flags |= (keg->uk_flags &
 	    (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT));
 
 	/*
 	 * Some internal zones don't have room allocated for the per cpu
 	 * caches.  If we're internal, bail out here.
 	 */
 	if (keg->uk_flags & UMA_ZFLAG_INTERNAL) {
 		KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0,
 		    ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
 		return (0);
 	}
 
 out:
 	KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) !=
 	    (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET),
 	    ("Invalid zone flag combination"));
 	if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0)
 		zone->uz_count = BUCKET_MAX;
 	else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0)
 		zone->uz_count = 0;
 	else
 		zone->uz_count = bucket_select(zone->uz_size);
 	zone->uz_count_min = zone->uz_count;
 
 	return (0);
 }
 
 /*
  * Keg header dtor.  This frees all data, destroys locks, frees the hash
  * table and removes the keg from the global list.
  *
  * Arguments/Returns follow uma_dtor specifications
  *	udata  unused
  */
 static void
 keg_dtor(void *arg, int size, void *udata)
 {
 	uma_keg_t keg;
 
 	keg = (uma_keg_t)arg;
 	KEG_LOCK(keg);
 	if (keg->uk_free != 0) {
 		printf("Freed UMA keg (%s) was not empty (%d items). "
 		    " Lost %d pages of memory.\n",
 		    keg->uk_name ? keg->uk_name : "",
 		    keg->uk_free, keg->uk_pages);
 	}
 	KEG_UNLOCK(keg);
 
 	hash_free(&keg->uk_hash);
 
 	KEG_LOCK_FINI(keg);
 }
 
 /*
  * Zone header dtor.  Drains the zone, unlinks it from the global lists,
  * releases its keg links and, for non secondary zones, frees the keg.
  *
  * Arguments/Returns follow uma_dtor specifications
  *	udata  unused
  */
 static void
 zone_dtor(void *arg, int size, void *udata)
 {
 	uma_klink_t klink;
 	uma_zone_t zone;
 	uma_keg_t keg;
 
 	zone = (uma_zone_t)arg;
 	/* Look the keg up now; the keg links are torn down below. */
 	keg = zone_first_keg(zone);
 
 	if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
 		cache_drain(zone);
 
 	rw_wlock(&uma_rwlock);
 	LIST_REMOVE(zone, uz_link);
 	rw_wunlock(&uma_rwlock);
 	/*
 	 * XXX there are some races here where
 	 * the zone can be drained but zone lock
 	 * released and then refilled before we
 	 * remove it... we dont care for now
 	 */
 	zone_drain_wait(zone, M_WAITOK);
 	/*
 	 * Unlink all of our kegs.
 	 */
 	while ((klink = LIST_FIRST(&zone->uz_kegs)) != NULL) {
 		klink->kl_keg = NULL;
 		LIST_REMOVE(klink, kl_link);
 		/* The first link is embedded in the zone, not malloc'ed. */
 		if (klink == &zone->uz_klink)
 			continue;
 		free(klink, M_TEMP);
 	}
 	/*
 	 * We only destroy kegs from non secondary zones.
 	 */
 	if (keg != NULL && (zone->uz_flags & UMA_ZONE_SECONDARY) == 0)  {
 		rw_wlock(&uma_rwlock);
 		LIST_REMOVE(keg, uk_link);
 		rw_wunlock(&uma_rwlock);
 		zone_free_item(kegs, keg, NULL, SKIP_NONE);
 	}
 	ZONE_LOCK_FINI(zone);
 }
 
 /*
  * Calls a callback on every zone that is attached to a keg on the global
  * keg list.  The UMA rwlock is read-held for the whole traversal.
  *
  * Arguments:
  *	zfunc  A pointer to a function which accepts a zone
  *		as an argument.
  *
  * Returns:
  *	Nothing
  */
 static void
 zone_foreach(void (*zfunc)(uma_zone_t))
 {
 	uma_keg_t k;
 	uma_zone_t z;
 
 	rw_rlock(&uma_rwlock);
 	LIST_FOREACH(k, &uma_kegs, uk_link) {
 		LIST_FOREACH(z, &k->uk_zones, uz_link) {
 			zfunc(z);
 		}
 	}
 	rw_runlock(&uma_rwlock);
 }
 
 /*
  * Count how many pages do we need to bootstrap.  VM supplies
  * its need in early zones in the argument, we add up our zones,
  * which consist of: UMA Slabs, UMA Hash and 9 Bucket zones. The
  * zone of zones and zone of kegs are accounted separately.
  *
  * As a side effect this computes the file-scope zsize and ksize, the
  * sizes of a zone and of a keg including their trailing per-CPU and
  * per-domain arrays, which uma_startup() uses afterwards.
  */
 #define	UMA_BOOT_ZONES	11
 /* Zone of zones and zone of kegs have arbitrary alignment. */
 #define	UMA_BOOT_ALIGN	32
 static int zsize, ksize;
 int
 uma_startup_count(int vm_zones)
 {
 	int zones, pages;
 
 	/* A keg is followed by one uma_domain per memory domain. */
 	ksize = sizeof(struct uma_keg) +
 	    (sizeof(struct uma_domain) * vm_ndomains);
 	/* A zone is followed by mp_maxid + 1 caches, then the domains. */
 	zsize = sizeof(struct uma_zone) +
 	    (sizeof(struct uma_cache) * (mp_maxid + 1)) +
 	    (sizeof(struct uma_zone_domain) * vm_ndomains);
 
 	/*
 	 * Memory for the zone of kegs and its keg,
 	 * and for zone of zones.
 	 */
 	pages = howmany(roundup(zsize, CACHE_LINE_SIZE) * 2 +
 	    roundup(ksize, CACHE_LINE_SIZE), PAGE_SIZE);
 
 #ifdef	UMA_MD_SMALL_ALLOC
 	zones = UMA_BOOT_ZONES;
 #else
 	zones = UMA_BOOT_ZONES + vm_zones;
 	vm_zones = 0;
 #endif
 
 	/* Memory for the rest of startup zones, UMA and VM, ... */
 	if (zsize > UMA_SLAB_SPACE)
 		pages += (zones + vm_zones) *
 		    howmany(roundup2(zsize, UMA_BOOT_ALIGN), UMA_SLAB_SIZE);
 	else if (roundup2(zsize, UMA_BOOT_ALIGN) > UMA_SLAB_SPACE)
 		pages += zones;
 	else
 		pages += howmany(zones,
 		    UMA_SLAB_SPACE / roundup2(zsize, UMA_BOOT_ALIGN));
 
 	/* ... and their kegs. Note that zone of zones allocates a keg! */
 	pages += howmany(zones + 1,
 	    UMA_SLAB_SPACE / roundup2(ksize, UMA_BOOT_ALIGN));
 
 	/*
 	 * Most of startup zones are not going to be offpages, that's
 	 * why we use UMA_SLAB_SPACE instead of UMA_SLAB_SIZE in all
 	 * calculations.  Some large bucket zones will be offpage, and
 	 * thus will allocate hashes.  We take conservative approach
 	 * and assume that all zones may allocate hash.  This may give
 	 * us some positive inaccuracy, usually an extra single page.
 	 */
 	pages += howmany(zones, UMA_SLAB_SPACE /
 	    (sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT));
 
 	return (pages);
 }
 
 /*
  * First stage of UMA bootstrap.  Places the zone of zones, the zone of
  * kegs and the master keg at the start of the supplied boot memory,
  * hands the remaining whole pages to the startup page cache, and creates
  * the slab header, hash and bucket zones.
  *
  * Arguments:
  *	mem     Start of the boot pages
  *	npages  Number of boot pages at 'mem'
  */
 void
 uma_startup(void *mem, int npages)
 {
 	struct uma_zctor_args args;
 	uma_keg_t masterkeg;
 	uintptr_t m;
 
 #ifdef DIAGNOSTIC
 	printf("Entering %s with %d boot pages configured\n", __func__, npages);
 #endif
 
 	rw_init(&uma_rwlock, "UMA lock");
 
 	/* Use bootpages memory for the zone of zones and zone of kegs. */
 	m = (uintptr_t)mem;
 	zones = (uma_zone_t)m;
 	m += roundup(zsize, CACHE_LINE_SIZE);
 	kegs = (uma_zone_t)m;
 	m += roundup(zsize, CACHE_LINE_SIZE);
 	masterkeg = (uma_keg_t)m;
 	m += roundup(ksize, CACHE_LINE_SIZE);
 	/* Whatever follows, rounded up to a page, is left for bootmem. */
 	m = roundup(m, PAGE_SIZE);
 	npages -= (m - (uintptr_t)mem) / PAGE_SIZE;
 	mem = (void *)m;
 
 	/* "manually" create the initial zone */
 	memset(&args, 0, sizeof(args));
 	args.name = "UMA Kegs";
 	args.size = ksize;
 	args.ctor = keg_ctor;
 	args.dtor = keg_dtor;
 	args.uminit = zero_init;
 	args.fini = NULL;
 	args.keg = masterkeg;
 	args.align = UMA_BOOT_ALIGN - 1;
 	args.flags = UMA_ZFLAG_INTERNAL;
 	zone_ctor(kegs, zsize, &args, M_WAITOK);
 
 	/* Publish the startup page cache before the next zone is built. */
 	bootmem = mem;
 	boot_pages = npages;
 
 	args.name = "UMA Zones";
 	args.size = zsize;
 	args.ctor = zone_ctor;
 	args.dtor = zone_dtor;
 	args.uminit = zero_init;
 	args.fini = NULL;
 	args.keg = NULL;
 	args.align = UMA_BOOT_ALIGN - 1;
 	args.flags = UMA_ZFLAG_INTERNAL;
 	zone_ctor(zones, zsize, &args, M_WAITOK);
 
 	/* Now make a zone for slab headers */
 	slabzone = uma_zcreate("UMA Slabs",
 				sizeof(struct uma_slab),
 				NULL, NULL, NULL, NULL,
 				UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
 
 	hashzone = uma_zcreate("UMA Hash",
 	    sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
 	    NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
 
 	bucket_init();
 
 	booted = BOOT_STRAPPED;
 }
 
 /*
  * Second stage of UMA bootstrap: advance the boot state to
  * BOOT_PAGEALLOC.  From this state on startup_alloc() switches kegs with
  * single-page slabs over to the real page allocator.
  */
 void
 uma_startup1(void)
 {
 
 #ifdef DIAGNOSTIC
 	printf("Entering %s with %d boot pages left\n", __func__, boot_pages);
 #endif
 	booted = BOOT_PAGEALLOC;
 }
 
 /*
  * Third stage of UMA bootstrap: advance the boot state to BOOT_BUCKETS,
  * initialize the drain lock (taken by uma_zcreate() from this state on)
  * and call bucket_enable().
  */
 void
 uma_startup2(void)
 {
 
 #ifdef DIAGNOSTIC
 	printf("Entering %s with %d boot pages left\n", __func__, boot_pages);
 #endif
 	booted = BOOT_BUCKETS;
 	sx_init(&uma_drain_lock, "umadrain");
 	bucket_enable();
 }
 
 /*
  * Final stage of UMA bootstrap.  Initializes our callout handle and
  * schedules the first uma_timeout() run, then marks UMA as BOOT_RUNNING.
  * With INVARIANTS it also fetches the vm.debug.divisor tunable and
  * allocates the debug counters.
  */
 static void
 uma_startup3(void)
 {
 
 #ifdef INVARIANTS
 	TUNABLE_INT_FETCH("vm.debug.divisor", &dbg_divisor);
 	uma_dbg_cnt = counter_u64_alloc(M_WAITOK);
 	uma_skip_cnt = counter_u64_alloc(M_WAITOK);
 #endif
 	callout_init(&uma_callout, 1);
 	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
 	booted = BOOT_RUNNING;
 }
 
 /*
  * Create a keg for 'zone' by allocating an item from the zone of kegs,
  * with the creation parameters passed along as constructor arguments.
  * UMA_ALIGN_CACHE is replaced by the current cache alignment mask.
  */
 static uma_keg_t
 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
 		int align, uint32_t flags)
 {
 	struct uma_kctor_args kargs;
 
 	if (align == UMA_ALIGN_CACHE)
 		align = uma_align_cache;
 
 	kargs.zone = zone;
 	kargs.size = size;
 	kargs.uminit = uminit;
 	kargs.fini = fini;
 	kargs.align = align;
 	kargs.flags = flags;
 
 	return (zone_alloc_item(kegs, &kargs, UMA_ANYDOMAIN, M_WAITOK));
 }
 
 /* Public functions */
 /*
  * See uma.h
  *
  * Record a new value for uma_align_cache.  Passing UMA_ALIGN_CACHE itself
  * is ignored and leaves the current setting untouched.
  */
 void
 uma_set_align(int align)
 {
 
 	if (align == UMA_ALIGN_CACHE)
 		return;
 	uma_align_cache = align;
 }
 
 /*
  * See uma.h
  *
  * Create a zone by allocating an item from the "zones" zone with the
  * caller's parameters packed into a uma_zctor_args structure.  No keg is
  * supplied (args.keg is NULL).  Once the boot state has reached
  * BOOT_BUCKETS the allocation is performed with uma_drain_lock held shared.
  */
 uma_zone_t
 uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
 		uma_init uminit, uma_fini fini, int align, uint32_t flags)
 
 {
 	struct uma_zctor_args zargs;
 	uma_zone_t newzone;
 	bool need_lock;
 
 	KASSERT(powerof2(align + 1), ("invalid zone alignment %d for \"%s\"",
 	    align, name));
 
 	/* Collect everything the zone constructor needs. */
 	memset(&zargs, 0, sizeof(zargs));
 	zargs.name = name;
 	zargs.size = size;
 	zargs.align = align;
 	zargs.flags = flags;
 	zargs.keg = NULL;
 	zargs.ctor = ctor;
 	zargs.dtor = dtor;
 	zargs.uminit = uminit;
 	zargs.fini = fini;
 #ifdef  INVARIANTS
 	/*
 	 * A zone created without any ctor/dtor/init/fini, and without
 	 * UMA_ZONE_ZINIT or UMA_ZONE_NOFREE, gets the UMA trash hooks
 	 * instead, which check for memory use after free.
 	 */
 	if (ctor == NULL && dtor == NULL && uminit == NULL && fini == NULL &&
 	    (flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOFREE)) == 0) {
 		zargs.ctor = trash_ctor;
 		zargs.dtor = trash_dtor;
 		zargs.uminit = trash_init;
 		zargs.fini = trash_fini;
 	}
 #endif
 
 	/* uma_drain_lock is only taken once booted reaches BOOT_BUCKETS. */
 	need_lock = (booted >= BOOT_BUCKETS);
 	if (need_lock)
 		sx_slock(&uma_drain_lock);
 	newzone = zone_alloc_item(zones, &zargs, UMA_ANYDOMAIN, M_WAITOK);
 	if (need_lock)
 		sx_sunlock(&uma_drain_lock);
 
 	return (newzone);
 }
 
 /*
  * See uma.h
  *
  * Create a secondary zone on top of master's first keg.  Size, alignment
  * and flags are inherited from that keg, with UMA_ZONE_SECONDARY added to
  * the flags; ctor/dtor/init/fini come from the caller.  Once the boot state
  * has reached BOOT_BUCKETS the allocation is performed with uma_drain_lock
  * held shared.
  */
 uma_zone_t
 uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
 		    uma_init zinit, uma_fini zfini, uma_zone_t master)
 {
 	struct uma_zctor_args zargs;
 	uma_keg_t mkeg;
 	uma_zone_t newzone;
 	bool need_lock;
 
 	mkeg = zone_first_keg(master);
 
 	memset(&zargs, 0, sizeof(zargs));
 	zargs.name = name;
 	zargs.keg = mkeg;
 	zargs.size = mkeg->uk_size;
 	zargs.align = mkeg->uk_align;
 	zargs.flags = mkeg->uk_flags | UMA_ZONE_SECONDARY;
 	zargs.ctor = ctor;
 	zargs.dtor = dtor;
 	zargs.uminit = zinit;
 	zargs.fini = zfini;
 
 	need_lock = (booted >= BOOT_BUCKETS);
 	if (need_lock)
 		sx_slock(&uma_drain_lock);
 	/* XXX Attaches only one keg of potentially many. */
 	newzone = zone_alloc_item(zones, &zargs, UMA_ANYDOMAIN, M_WAITOK);
 	if (need_lock)
 		sx_sunlock(&uma_drain_lock);
 
 	return (newzone);
 }
 
 /*
  * See uma.h
  *
  * Create a zone whose constructor arguments carry caller-supplied import
  * and release callbacks together with their opaque argument.  The
  * alignment field is left at zero.  The zone is allocated from "zones"
  * with M_WAITOK.
  */
 uma_zone_t
 uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor,
 		    uma_init zinit, uma_fini zfini, uma_import zimport,
 		    uma_release zrelease, void *arg, int flags)
 {
 	struct uma_zctor_args zargs;
 	uma_zone_t newzone;
 
 	memset(&zargs, 0, sizeof(zargs));
 	zargs.name = name;
 	zargs.size = size;
 	zargs.align = 0;
 	zargs.flags = flags;
 	zargs.ctor = ctor;
 	zargs.dtor = dtor;
 	zargs.uminit = zinit;
 	zargs.fini = zfini;
 	zargs.import = zimport;
 	zargs.release = zrelease;
 	zargs.arg = arg;
 
 	newzone = zone_alloc_item(zones, &zargs, UMA_ANYDOMAIN, M_WAITOK);
 	return (newzone);
 }
 
 /*
  * Lock two zones.  The zone with the lower address is always locked first,
  * so the acquisition order does not depend on the argument order.  The
  * second lock is taken with MTX_DUPOK.
  */
 static void
 zone_lock_pair(uma_zone_t a, uma_zone_t b)
 {
 	uma_zone_t first, second;
 
 	if (a < b) {
 		first = a;
 		second = b;
 	} else {
 		first = b;
 		second = a;
 	}
 	ZONE_LOCK(first);
 	mtx_lock_flags(second->uz_lockptr, MTX_DUPOK);
 }
 
 /*
  * Release the locks of two zones: a first, then b.  Counterpart of
  * zone_lock_pair().
  */
 static void
 zone_unlock_pair(uma_zone_t a, uma_zone_t b)
 {
 
 	ZONE_UNLOCK(a);
 	ZONE_UNLOCK(b);
 }
 
 /*
  * Append master's first keg to the keg list of an existing secondary zone
  * and switch the zone over to zone_fetch_slab_multi().
  *
  * Arguments:
  *	zone    The secondary zone that gains an additional keg.
  *	master  The zone whose first keg is linked into 'zone'.
  *
  * Returns:
  *	0       on success
  *	EINVAL  if 'zone' is not a UMA_ZONE_VTOSLAB secondary zone, or if
  *	        'master' does not use UMA_ZONE_VTOSLAB
  *	E2BIG   if the two zones have different item sizes
  *
  * The link structure is allocated before any lock is taken (M_WAITOK) and
  * released again on the error paths.
  */
 int
 uma_zsecond_add(uma_zone_t zone, uma_zone_t master)
 {
 	uma_klink_t klink;
 	uma_klink_t kl;
 	int error;
 
 	error = 0;
 	klink = malloc(sizeof(*klink), M_TEMP, M_WAITOK | M_ZERO);
 
 	zone_lock_pair(zone, master);
 	/*
 	 * zone must use vtoslab() to resolve objects and must already be
 	 * a secondary.
 	 */
 	if ((zone->uz_flags & (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY))
 	    != (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY)) {
 		error = EINVAL;
 		goto out;
 	}
 	/*
 	 * The new master must also use vtoslab().  This has to inspect
 	 * master's flags; zone's own VTOSLAB bit was verified above.
 	 */
 	if ((master->uz_flags & UMA_ZONE_VTOSLAB) != UMA_ZONE_VTOSLAB) {
 		error = EINVAL;
 		goto out;
 	}
 
 	/*
 	 * The underlying object must be the same size.  rsize
 	 * may be different.
 	 */
 	if (master->uz_size != zone->uz_size) {
 		error = E2BIG;
 		goto out;
 	}
 	/*
 	 * Put it at the end of the list.
 	 */
 	klink->kl_keg = zone_first_keg(master);
 	LIST_FOREACH(kl, &zone->uz_kegs, kl_link) {
 		if (LIST_NEXT(kl, kl_link) == NULL) {
 			LIST_INSERT_AFTER(kl, klink, kl_link);
 			break;
 		}
 	}
 	/* Ownership of klink has passed to the zone; do not free it below. */
 	klink = NULL;
 	zone->uz_flags |= UMA_ZFLAG_MULTI;
 	zone->uz_slab = zone_fetch_slab_multi;
 
 out:
 	zone_unlock_pair(zone, master);
 	if (klink != NULL)
 		free(klink, M_TEMP);
 
 	return (error);
 }
 
 
 /*
  * See uma.h
  *
  * Destroy a zone by returning it to the "zones" zone with SKIP_NONE, so
  * that zone's destructor and fini hooks are run by zone_free_item().  The
  * free is performed with uma_drain_lock held shared.
  */
 void
 uma_zdestroy(uma_zone_t zone)
 {
 
 	sx_slock(&uma_drain_lock);
 	zone_free_item(zones, zone, NULL, SKIP_NONE);
 	sx_sunlock(&uma_drain_lock);
 }
 
 /*
  * Allocate one item from the zone with M_WAITOK and immediately free it
  * again.  The allocation may sleep, so the call returns only once an item
  * could be obtained.
  */
 void
 uma_zwait(uma_zone_t zone)
 {
 	void *item;
 
 	item = uma_zalloc_arg(zone, NULL, M_WAITOK);
 	uma_zfree(zone, item);
 }
 
 /*
  * Allocate an item from a UMA_ZONE_PCPU zone.
  *
  * M_ZERO is stripped before calling uma_zalloc_arg() (which asserts that
  * it is not passed for pcpu zones); when the caller asked for M_ZERO, the
  * copy belonging to every CPU is cleared here instead.
  *
  * Returns the item, or NULL if the underlying allocation failed.
  */
 void *
 uma_zalloc_pcpu_arg(uma_zone_t zone, void *udata, int flags)
 {
 	void *pcpu_item;
 	int cpuid;
 
 	MPASS(zone->uz_flags & UMA_ZONE_PCPU);
 
 	pcpu_item = uma_zalloc_arg(zone, udata, flags & ~M_ZERO);
 	if (pcpu_item == NULL || (flags & M_ZERO) == 0)
 		return (pcpu_item);
 
 	CPU_FOREACH(cpuid)
 		bzero(zpcpu_get_cpu(pcpu_item, cpuid), zone->uz_size);
 	return (pcpu_item);
 }
 
 /*
  * Free an item to a UMA_ZONE_PCPU zone.
  *
  * A stub while both regular and pcpu cases are identical: after asserting
  * that the zone is a pcpu zone it simply forwards to uma_zfree_arg().
  */
 void
 uma_zfree_pcpu_arg(uma_zone_t zone, void *item, void *udata)
 {
 
 	MPASS(zone->uz_flags & UMA_ZONE_PCPU);
 	uma_zfree_arg(zone, item, udata);
 }
 
 /*
  * See uma.h
  *
  * Allocate an item from a zone.
  *
  * Arguments:
  *	zone   The zone to allocate from.
  *	udata  Passed through to the zone constructor.
  *	flags  M_* allocation flags.  M_EXEC is asserted to be clear, and
  *	       M_ZERO is asserted to be clear for UMA_ZONE_PCPU zones.
  *
  * Returns:
  *	The item, or NULL if the allocation or the zone constructor failed.
  *
  * The order of attempts is: the current CPU's alloc bucket, the CPU's free
  * bucket (swapped in), a full bucket cached on the zone, a freshly filled
  * bucket from zone_alloc_bucket(), and finally a single item from
  * zone_alloc_item().
  */
 void *
 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
 {
 	uma_zone_domain_t zdom;
 	uma_bucket_t bucket;
 	uma_cache_t cache;
 	void *item;
 	int cpu, domain, lockfail;
 #ifdef INVARIANTS
 	bool skipdbg;
 #endif
 
 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
 	random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA);
 
 	/* This is the fast path allocation */
 	CTR4(KTR_UMA, "uma_zalloc_arg thread %x zone %s(%p) flags %d",
 	    curthread, zone->uz_name, zone, flags);
 
 	if (flags & M_WAITOK) {
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 		    "uma_zalloc_arg: zone \"%s\"", zone->uz_name);
 	}
+	KASSERT((flags & M_EXEC) == 0, ("uma_zalloc_arg: called with M_EXEC"));
 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
 	    ("uma_zalloc_arg: called with spinlock or critical section held"));
 	if (zone->uz_flags & UMA_ZONE_PCPU)
 		KASSERT((flags & M_ZERO) == 0, ("allocating from a pcpu zone "
 		    "with M_ZERO passed"));
 
 #ifdef DEBUG_MEMGUARD
 	if (memguard_cmp_zone(zone)) {
 		item = memguard_alloc(zone->uz_size, flags);
 		if (item != NULL) {
 			if (zone->uz_init != NULL &&
 			    zone->uz_init(item, zone->uz_size, flags) != 0) {
 				/* Do not leak the guarded allocation. */
 				memguard_free(item);
 				return (NULL);
 			}
 			if (zone->uz_ctor != NULL &&
 			    zone->uz_ctor(item, zone->uz_size, udata,
 			    flags) != 0) {
 				/* uz_fini is optional and may be NULL. */
 				if (zone->uz_fini != NULL)
 					zone->uz_fini(item, zone->uz_size);
 				memguard_free(item);
 				return (NULL);
 			}
 			return (item);
 		}
 		/* This is unfortunate but should not be fatal. */
 	}
 #endif
 	/*
 	 * If possible, allocate from the per-CPU cache.  There are two
 	 * requirements for safe access to the per-CPU cache: (1) the thread
 	 * accessing the cache must not be preempted or yield during access,
 	 * and (2) the thread must not migrate CPUs without switching which
 	 * cache it accesses.  We rely on a critical section to prevent
 	 * preemption and migration.  We release the critical section in
 	 * order to acquire the zone mutex if we are unable to allocate from
 	 * the current cache; when we re-acquire the critical section, we
 	 * must detect and handle migration if it has occurred.
 	 */
 	critical_enter();
 	cpu = curcpu;
 	cache = &zone->uz_cpu[cpu];
 
 zalloc_start:
 	bucket = cache->uc_allocbucket;
 	if (bucket != NULL && bucket->ub_cnt > 0) {
 		bucket->ub_cnt--;
 		item = bucket->ub_bucket[bucket->ub_cnt];
 #ifdef INVARIANTS
 		bucket->ub_bucket[bucket->ub_cnt] = NULL;
 #endif
 		KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled."));
 		cache->uc_allocs++;
 		critical_exit();
 #ifdef INVARIANTS
 		skipdbg = uma_dbg_zskip(zone, item);
 #endif
 		if (zone->uz_ctor != NULL &&
 #ifdef INVARIANTS
 		    (!skipdbg || zone->uz_ctor != trash_ctor ||
 		    zone->uz_dtor != trash_dtor) &&
 #endif
 		    zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
 			atomic_add_long(&zone->uz_fails, 1);
 			zone_free_item(zone, item, udata, SKIP_DTOR);
 			return (NULL);
 		}
 #ifdef INVARIANTS
 		if (!skipdbg)
 			uma_dbg_alloc(zone, NULL, item);
 #endif
 		if (flags & M_ZERO)
 			uma_zero_item(item, zone);
 		return (item);
 	}
 
 	/*
 	 * We have run out of items in our alloc bucket.
 	 * See if we can switch with our free bucket.
 	 */
 	bucket = cache->uc_freebucket;
 	if (bucket != NULL && bucket->ub_cnt > 0) {
 		CTR2(KTR_UMA,
 		    "uma_zalloc: zone %s(%p) swapping empty with alloc",
 		    zone->uz_name, zone);
 		cache->uc_freebucket = cache->uc_allocbucket;
 		cache->uc_allocbucket = bucket;
 		goto zalloc_start;
 	}
 
 	/*
 	 * Discard any empty allocation bucket while we hold no locks.
 	 */
 	bucket = cache->uc_allocbucket;
 	cache->uc_allocbucket = NULL;
 	critical_exit();
 	if (bucket != NULL)
 		bucket_free(zone, bucket, udata);
 
 	if (zone->uz_flags & UMA_ZONE_NUMA)
 		domain = PCPU_GET(domain);
 	else
 		domain = UMA_ANYDOMAIN;
 
 	/* Short-circuit for zones without buckets and low memory. */
 	if (zone->uz_count == 0 || bucketdisable)
 		goto zalloc_item;
 
 	/*
 	 * Attempt to retrieve the item from the per-CPU cache has failed, so
 	 * we must go back to the zone.  This requires the zone lock, so we
 	 * must drop the critical section, then re-acquire it when we go back
 	 * to the cache.  Since the critical section is released, we may be
 	 * preempted or migrate.  As such, make sure not to maintain any
 	 * thread-local state specific to the cache from prior to releasing
 	 * the critical section.
 	 */
 	lockfail = 0;
 	if (ZONE_TRYLOCK(zone) == 0) {
 		/* Record contention to size the buckets. */
 		ZONE_LOCK(zone);
 		lockfail = 1;
 	}
 	critical_enter();
 	cpu = curcpu;
 	cache = &zone->uz_cpu[cpu];
 
 	/* See if we lost the race to fill the cache. */
 	if (cache->uc_allocbucket != NULL) {
 		ZONE_UNLOCK(zone);
 		goto zalloc_start;
 	}
 
 	/*
 	 * Check the zone's cache of buckets.
 	 */
 	if (domain == UMA_ANYDOMAIN)
 		zdom = &zone->uz_domain[0];
 	else
 		zdom = &zone->uz_domain[domain];
 	if ((bucket = LIST_FIRST(&zdom->uzd_buckets)) != NULL) {
 		KASSERT(bucket->ub_cnt != 0,
 		    ("uma_zalloc_arg: Returning an empty bucket."));
 
 		LIST_REMOVE(bucket, ub_link);
 		cache->uc_allocbucket = bucket;
 		ZONE_UNLOCK(zone);
 		goto zalloc_start;
 	}
 	/* We are no longer associated with this CPU. */
 	critical_exit();
 
 	/*
 	 * We bump the uz count when the cache size is insufficient to
 	 * handle the working set.
 	 */
 	if (lockfail && zone->uz_count < BUCKET_MAX)
 		zone->uz_count++;
 	ZONE_UNLOCK(zone);
 
 	/*
 	 * Now lets just fill a bucket and put it on the free list.  If that
 	 * works we'll restart the allocation from the beginning and it
 	 * will use the just filled bucket.
 	 */
 	bucket = zone_alloc_bucket(zone, udata, domain, flags);
 	CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p",
 	    zone->uz_name, zone, bucket);
 	if (bucket != NULL) {
 		ZONE_LOCK(zone);
 		critical_enter();
 		cpu = curcpu;
 		cache = &zone->uz_cpu[cpu];
 		/*
 		 * See if we lost the race or were migrated.  Cache the
 		 * initialized bucket to make this less likely or claim
 		 * the memory directly.
 		 */
 		if (cache->uc_allocbucket != NULL ||
 		    (zone->uz_flags & UMA_ZONE_NUMA &&
 		    domain != PCPU_GET(domain)))
 			LIST_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link);
 		else
 			cache->uc_allocbucket = bucket;
 		ZONE_UNLOCK(zone);
 		goto zalloc_start;
 	}
 
 	/*
 	 * We may not be able to get a bucket so return an actual item.
 	 */
 zalloc_item:
 	item = zone_alloc_item(zone, udata, domain, flags);
 
 	return (item);
 }
 
 /*
  * Allocate an item from a zone for a specific domain.
  *
  * Unlike uma_zalloc_arg() this does not consult the per-CPU buckets; after
  * the entry checks it calls zone_alloc_item() directly with the requested
  * domain.
  *
  * Arguments:
  *	zone    The zone to allocate from.
  *	udata   Passed through to the zone constructor.
  *	domain  The domain handed to zone_alloc_item().
  *	flags   M_* allocation flags.
  *
  * Returns:
  *	The item, or NULL if zone_alloc_item() failed.
  */
 void *
 uma_zalloc_domain(uma_zone_t zone, void *udata, int domain, int flags)
 {
 
 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
 	random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA);
 
 	/* This is the fast path allocation */
 	CTR5(KTR_UMA,
 	    "uma_zalloc_domain thread %x zone %s(%p) domain %d flags %d",
 	    curthread, zone->uz_name, zone, domain, flags);
 
 	if (flags & M_WAITOK) {
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 		    "uma_zalloc_domain: zone \"%s\"", zone->uz_name);
 	}
 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
 	    ("uma_zalloc_domain: called with spinlock or critical section held"));
 
 	return (zone_alloc_item(zone, udata, domain, flags));
 }
 
 /*
  * Locate a slab that still has free items.  A partially used slab is
  * preferred; if the domain has none, a completely free slab is moved onto
  * the partial list and returned.  Favouring partial slabs helps to reduce
  * fragmentation.
  *
  * When 'rr' is non-zero every domain is examined, beginning with 'domain'
  * and wrapping around; otherwise only 'domain' itself is examined.
  *
  * Returns the slab, or NULL if no examined domain has one.
  */
 static uma_slab_t
 keg_first_slab(uma_keg_t keg, int domain, int rr)
 {
 	uma_domain_t kdom;
 	uma_slab_t found;
 	int first;
 
 	KASSERT(domain >= 0 && domain < vm_ndomains,
 	    ("keg_first_slab: domain %d out of range", domain));
 
 	first = domain;
 	for (;;) {
 		kdom = &keg->uk_domain[domain];
 
 		found = LIST_FIRST(&kdom->ud_part_slab);
 		if (found != NULL)
 			return (found);
 
 		found = LIST_FIRST(&kdom->ud_free_slab);
 		if (found != NULL) {
 			LIST_REMOVE(found, us_link);
 			LIST_INSERT_HEAD(&kdom->ud_part_slab, found, us_link);
 			return (found);
 		}
 
 		/* Without round-robin a single domain is all we look at. */
 		if (!rr)
 			break;
 		domain = (domain + 1) % vm_ndomains;
 		if (domain == first)
 			break;
 	}
 
 	return (NULL);
 }
 
 /*
  * Obtain a slab with at least one free item from a keg, allocating a new
  * slab if none is available.  Must be called with the keg lock held.
  *
  * Arguments:
  *	keg      The keg to fetch from.
  *	zone     The zone on whose behalf the slab is fetched; used for
  *	         limit accounting and passed to keg_alloc_slab().
  *	rdomain  Requested domain, or UMA_ANYDOMAIN for round-robin.
  *	flags    M_* flags.  M_USE_RESERVE allows dipping into uk_reserve,
  *	         M_NOVM suppresses allocation of new slabs.
  *
  * Returns:
  *	A slab on the keg's partial list, or NULL on failure.
  */
 static uma_slab_t
 keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, int flags)
 {
 	uma_domain_t dom;
 	uma_slab_t slab;
 	int allocflags, domain, reserve, rr, start;
 
 	mtx_assert(&keg->uk_lock, MA_OWNED);
 	slab = NULL;
 	reserve = 0;
 	allocflags = flags;
 	if ((flags & M_USE_RESERVE) == 0)
 		reserve = keg->uk_reserve;
 
 	/*
 	 * Round-robin for non first-touch zones when there is more than one
 	 * domain.
 	 */
 	if (vm_ndomains == 1)
 		rdomain = 0;
 	rr = rdomain == UMA_ANYDOMAIN;
 	if (rr) {
 		keg->uk_cursor = (keg->uk_cursor + 1) % vm_ndomains;
 		domain = start = keg->uk_cursor;
 		/* Only block on the second pass. */
 		if ((flags & (M_WAITOK | M_NOVM)) == M_WAITOK)
 			allocflags = (allocflags & ~M_WAITOK) | M_NOWAIT;
 	} else
 		domain = start = rdomain;
 
 again:
 	do {
 		/* Existing slabs are only usable above the reserve level. */
 		if (keg->uk_free > reserve &&
 		    (slab = keg_first_slab(keg, domain, rr)) != NULL) {
 			MPASS(slab->us_keg == keg);
 			return (slab);
 		}
 
 		/*
 		 * M_NOVM means don't ask at all!
 		 */
 		if (flags & M_NOVM)
 			break;
 
 		if (keg->uk_maxpages && keg->uk_pages >= keg->uk_maxpages) {
 			keg->uk_flags |= UMA_ZFLAG_FULL;
 			/*
 			 * If this is not a multi-zone, set the FULL bit.
 			 * Otherwise slab_multi() takes care of it.
 			 */
 			if ((zone->uz_flags & UMA_ZFLAG_MULTI) == 0) {
 				zone->uz_flags |= UMA_ZFLAG_FULL;
 				zone_log_warning(zone);
 				zone_maxaction(zone);
 			}
 			if (flags & M_NOWAIT)
 				return (NULL);
 			zone->uz_sleeps++;
 			msleep(keg, &keg->uk_lock, PVM, "keglimit", 0);
 			/*
 			 * Note that 'continue' in a do/while transfers to
 			 * the loop condition, so the scan is only repeated
 			 * here when domain != start.
 			 */
 			continue;
 		}
 		slab = keg_alloc_slab(keg, zone, domain, allocflags);
 		/*
 		 * If we got a slab here it's safe to mark it partially used
 		 * and return.  We assume that the caller is going to remove
 		 * at least one item.
 		 */
 		if (slab) {
 			MPASS(slab->us_keg == keg);
 			dom = &keg->uk_domain[slab->us_domain];
 			LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
 			return (slab);
 		}
 		if (rr) {
 			keg->uk_cursor = (keg->uk_cursor + 1) % vm_ndomains;
 			domain = keg->uk_cursor;
 		}
 	} while (domain != start);
 
 	/* Retry domain scan with blocking. */
 	if (allocflags != flags) {
 		allocflags = flags;
 		goto again;
 	}
 
 	/*
 	 * We might not have been able to get a slab but another cpu
 	 * could have while we were unlocked.  Check again before we
 	 * fail.
 	 */
 	if (keg->uk_free > reserve &&
 	    (slab = keg_first_slab(keg, domain, rr)) != NULL) {
 		MPASS(slab->us_keg == keg);
 		return (slab);
 	}
 	return (NULL);
 }
 
 /*
  * Fetch a slab for a zone from a single keg.
  *
  * If 'keg' is NULL the zone's first keg is looked up and locked here;
  * otherwise the given keg is used as passed (keg_fetch_slab() asserts that
  * its lock is owned).  keg_fetch_slab() is retried until it yields a slab,
  * unless the caller passed M_NOWAIT or M_NOVM, in which case a single
  * attempt is made.
  *
  * Returns the slab with the keg still locked, or NULL with the keg
  * unlocked.
  */
 static uma_slab_t
 zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int domain, int flags)
 {
 	uma_slab_t found;
 
 	if (keg == NULL) {
 		keg = zone_first_keg(zone);
 		KEG_LOCK(keg);
 	}
 
 	do {
 		found = keg_fetch_slab(keg, zone, domain, flags);
 		if (found != NULL)
 			return (found);
 	} while ((flags & (M_NOWAIT | M_NOVM)) == 0);
 
 	KEG_UNLOCK(keg);
 	return (NULL);
 }
 
 /*
  * zone_fetch_slab_multi:  Fetches a slab from one available keg.  Returns
  * with the keg locked.  On NULL no lock is held.
  *
  * The last pointer is used to seed the search.  It is not required.  When
  * it is non-NULL it is tried first and unlocked here if it has no slab to
  * offer.
  *
  * Arguments:
  *	zone    The multi-keg zone to fetch for.
  *	last    Optional keg to try first, or NULL.
  *	domain  Domain passed through to keg_fetch_slab().
  *	rflags  The caller's M_* flags.
  */
 static uma_slab_t
 zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int domain, int rflags)
 {
 	uma_klink_t klink;
 	uma_slab_t slab;
 	uma_keg_t keg;
 	int flags;
 	int empty;
 	int full;
 
 	/*
 	 * Don't wait on the first pass.  This will skip limit tests
 	 * as well.  We don't want to block if we can find a provider
 	 * without blocking.
 	 */
 	flags = (rflags & ~M_WAITOK) | M_NOWAIT;
 	/*
 	 * Use the last slab allocated as a hint for where to start
 	 * the search.
 	 */
 	if (last != NULL) {
 		slab = keg_fetch_slab(last, zone, domain, flags);
 		if (slab)
 			return (slab);
 		KEG_UNLOCK(last);
 	}
 	/*
 	 * Loop until we have a slab in case of transient failures
 	 * while M_WAITOK is specified.  I'm not sure this is 100%
 	 * required but we've done it for so long now.
 	 */
 	for (;;) {
 		empty = 0;
 		full = 0;
 		/*
 		 * Search the available kegs for slabs.  Be careful to hold the
 		 * correct lock while calling into the keg layer.
 		 */
 		LIST_FOREACH(klink, &zone->uz_kegs, kl_link) {
 			keg = klink->kl_keg;
 			KEG_LOCK(keg);
 			if ((keg->uk_flags & UMA_ZFLAG_FULL) == 0) {
 				slab = keg_fetch_slab(keg, zone, domain, flags);
 				if (slab)
 					return (slab);
 			}
 			/* Tally kegs at their limit versus kegs merely out of slabs. */
 			if (keg->uk_flags & UMA_ZFLAG_FULL)
 				full++;
 			else
 				empty++;
 			KEG_UNLOCK(keg);
 		}
 		if (rflags & (M_NOWAIT | M_NOVM))
 			break;
 		/* From the second pass on, honour the caller's own flags. */
 		flags = rflags;
 		/*
 		 * All kegs are full.  XXX We can't atomically check all kegs
 		 * and sleep so just sleep for a short period and retry.
 		 */
 		if (full && !empty) {
 			ZONE_LOCK(zone);
 			zone->uz_flags |= UMA_ZFLAG_FULL;
 			zone->uz_sleeps++;
 			zone_log_warning(zone);
 			zone_maxaction(zone);
 			msleep(zone, zone->uz_lockptr, PVM,
 			    "zonelimit", hz/100);
 			zone->uz_flags &= ~UMA_ZFLAG_FULL;
 			ZONE_UNLOCK(zone);
 			continue;
 		}
 	}
 	return (NULL);
 }
 
 /*
  * Take one free item out of a slab.  Must be called with the keg lock held
  * and with a slab that belongs to 'keg'.
  *
  * The first set bit in the slab's free bitmap selects the item; the bit is
  * cleared and the free counters of both slab and keg are decremented.  A
  * slab whose last free item was just handed out is moved to its domain's
  * full list.
  *
  * Returns the address of the item inside the slab's data area.
  */
 static void *
 slab_alloc_item(uma_keg_t keg, uma_slab_t slab)
 {
 	uma_domain_t kdom;
 	void *ret;
 	uint8_t idx;
 
 	MPASS(keg == slab->us_keg);
 	mtx_assert(&keg->uk_lock, MA_OWNED);
 
 	idx = BIT_FFS(SLAB_SETSIZE, &slab->us_free) - 1;
 	BIT_CLR(SLAB_SETSIZE, idx, &slab->us_free);
 	ret = slab->us_data + (keg->uk_rsize * idx);
 
 	keg->uk_free--;
 	slab->us_freecount--;
 	if (slab->us_freecount != 0)
 		return (ret);
 
 	/* Nothing left in this slab: move it to the full list. */
 	kdom = &keg->uk_domain[slab->us_domain];
 	LIST_REMOVE(slab, us_link);
 	LIST_INSERT_HEAD(&kdom->ud_full_slab, slab, us_link);
 
 	return (ret);
 }
 
 /*
  * Fill an array with up to 'max' items taken from the zone's slabs.
  *
  * Arguments:
  *	zone    The zone to import from; its uz_slab method supplies slabs.
  *	bucket  Array receiving the item pointers.
  *	max     Maximum number of items to store.
  *	domain  Domain passed through to uz_slab.
  *	flags   M_* flags.  After the first pass over a slab the flags are
  *	        downgraded to M_NOWAIT so later slab fetches do not block.
  *
  * Returns:
  *	The number of items stored in 'bucket' (possibly 0).
  *
  * The keg of the most recently returned slab is passed back into uz_slab
  * on the next iteration and is unlocked here once the loop is done.  If the
  * last uz_slab call returned NULL no unlock is performed.
  */
 static int
 zone_import(uma_zone_t zone, void **bucket, int max, int domain, int flags)
 {
 	uma_slab_t slab;
 	uma_keg_t keg;
 	int stripe;
 	int i;
 
 	slab = NULL;
 	keg = NULL;
 	/* Try to keep the buckets totally full */
 	for (i = 0; i < max; ) {
 		if ((slab = zone->uz_slab(zone, keg, domain, flags)) == NULL)
 			break;
 		keg = slab->us_keg;
 		stripe = howmany(max, vm_ndomains);
 		while (slab->us_freecount && i < max) { 
 			bucket[i++] = slab_alloc_item(keg, slab);
 			/* Stop draining once the keg is down to its reserve. */
 			if (keg->uk_free <= keg->uk_reserve)
 				break;
 #ifdef NUMA
 			/*
 			 * If the zone is striped we pick a new slab for every
 			 * N allocations.  Eliminating this conditional will
 			 * instead pick a new domain for each bucket rather
 			 * than stripe within each bucket.  The current option
 			 * produces more fragmentation and requires more cpu
 			 * time but yields better distribution.
 			 */
 			if ((zone->uz_flags & UMA_ZONE_NUMA) == 0 &&
 			    vm_ndomains > 1 && --stripe == 0)
 				break;
 #endif
 		}
 		/* Don't block if we allocated any successfully. */
 		flags &= ~M_WAITOK;
 		flags |= M_NOWAIT;
 	}
 	if (slab != NULL)
 		KEG_UNLOCK(keg);
 
 	return i;
 }
 
 /*
  * Allocate a bucket and fill it with initialized items from the zone.
  *
  * The bucket itself is allocated without sleeping.  It is filled through
  * the zone's import method with at most MIN(ub_entries, uz_count) items,
  * and each imported item is passed to the zone's init hook if there is
  * one.  When an init call fails, that item and all items after it are
  * handed back through the release method and the bucket is truncated.
  *
  * Returns the bucket, or NULL if no bucket could be allocated or it ended
  * up empty (the latter is counted in uz_fails).
  */
 static uma_bucket_t
 zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags)
 {
 	uma_bucket_t bkt;
 	int done, want;
 
 	/* Don't wait for buckets, preserve caller's NOVM setting. */
 	bkt = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM));
 	if (bkt == NULL)
 		return (NULL);
 
 	want = MIN(bkt->ub_entries, zone->uz_count);
 	bkt->ub_cnt = zone->uz_import(zone->uz_arg, bkt->ub_bucket,
 	    want, domain, flags);
 
 	/*
 	 * Initialize the memory if necessary.
 	 */
 	if (bkt->ub_cnt != 0 && zone->uz_init != NULL) {
 		done = 0;
 		while (done < bkt->ub_cnt &&
 		    zone->uz_init(bkt->ub_bucket[done], zone->uz_size,
 		    flags) == 0)
 			done++;
 
 		/*
 		 * Anything from the first failed init onwards goes back
 		 * onto the freelist.
 		 */
 		if (done != bkt->ub_cnt) {
 			zone->uz_release(zone->uz_arg, &bkt->ub_bucket[done],
 			    bkt->ub_cnt - done);
 #ifdef INVARIANTS
 			bzero(&bkt->ub_bucket[done],
 			    sizeof(void *) * (bkt->ub_cnt - done));
 #endif
 			bkt->ub_cnt = done;
 		}
 	}
 
 	if (bkt->ub_cnt == 0) {
 		bucket_free(zone, bkt, udata);
 		atomic_add_long(&zone->uz_fails, 1);
 		return (NULL);
 	}
 
 	return (bkt);
 }
 
 /*
  * Allocates a single item from a zone, bypassing the per-CPU buckets.
  *
  * Arguments
  *	zone   The zone to alloc for.
  *	udata  The data to be passed to the constructor.
  *	domain The domain to allocate from or UMA_ANYDOMAIN.
  *	flags  M_WAITOK, M_NOWAIT, M_ZERO.
  *
  * Returns
  *	NULL if the import failed (e.g. there is no memory and M_NOWAIT is
  *	set) or if the zone's init or ctor rejected the item
  *	An item if successful
  *
  * Every failure is counted in uz_fails.
  */
 
 static void *
 zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags)
 {
 	void *item;
 #ifdef INVARIANTS
 	bool skipdbg;
 #endif
 
 	item = NULL;
 
 	if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1)
 		goto fail;
 	atomic_add_long(&zone->uz_allocs, 1);
 
 #ifdef INVARIANTS
 	skipdbg = uma_dbg_zskip(zone, item);
 #endif
 	/*
 	 * We have to call both the zone's init (not the keg's init)
 	 * and the zone's ctor.  This is because the item is going from
 	 * a keg slab directly to the user, and the user is expecting it
 	 * to be both zone-init'd as well as zone-ctor'd.
 	 */
 	if (zone->uz_init != NULL) {
 		if (zone->uz_init(item, zone->uz_size, flags) != 0) {
 			/* Init failed, so neither dtor nor fini must run. */
 			zone_free_item(zone, item, udata, SKIP_FINI);
 			goto fail;
 		}
 	}
 	if (zone->uz_ctor != NULL &&
 #ifdef INVARIANTS
 	    (!skipdbg || zone->uz_ctor != trash_ctor ||
 	    zone->uz_dtor != trash_dtor) &&
 #endif
 	    zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
 		/* Ctor failed: skip the dtor but still run fini. */
 		zone_free_item(zone, item, udata, SKIP_DTOR);
 		goto fail;
 	}
 #ifdef INVARIANTS
 	if (!skipdbg)
 		uma_dbg_alloc(zone, NULL, item);
 #endif
 	if (flags & M_ZERO)
 		uma_zero_item(item, zone);
 
 	CTR3(KTR_UMA, "zone_alloc_item item %p from %s(%p)", item,
 	    zone->uz_name, zone);
 
 	return (item);
 
 fail:
 	CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)",
 	    zone->uz_name, zone);
 	atomic_add_long(&zone->uz_fails, 1);
 	return (NULL);
 }
 
 /*
  * See uma.h
  *
  * Free an item to a zone.
  *
  * Arguments:
  *	zone   The zone the item belongs to.
  *	item   The item to free; NULL is accepted and ignored.
  *	udata  Passed through to the zone destructor.
  *
  * The destructor is run first.  The item is then placed in the current
  * CPU's alloc or free bucket if one has room.  Otherwise a full free bucket
  * is moved to the zone (or drained for UMA_ZONE_NOBUCKETCACHE zones), a new
  * bucket is allocated and the attempt is repeated.  If buckets are
  * unavailable, or the zone is flagged UMA_ZFLAG_FULL, the item is freed
  * directly through zone_free_item() with SKIP_DTOR.
  */
 void
 uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
 {
 	uma_cache_t cache;
 	uma_bucket_t bucket;
 	uma_zone_domain_t zdom;
 	int cpu, domain, lockfail;
 #ifdef INVARIANTS
 	bool skipdbg;
 #endif
 
 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
 	random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA);
 
 	CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread,
 	    zone->uz_name);
 
 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
 	    ("uma_zfree_arg: called with spinlock or critical section held"));
 
         /* uma_zfree(..., NULL) does nothing, to match free(9). */
         if (item == NULL)
                 return;
 #ifdef DEBUG_MEMGUARD
 	/* Memguard-backed items bypass the buckets and slabs entirely. */
 	if (is_memguard_addr(item)) {
 		if (zone->uz_dtor != NULL)
 			zone->uz_dtor(item, zone->uz_size, udata);
 		if (zone->uz_fini != NULL)
 			zone->uz_fini(item, zone->uz_size);
 		memguard_free(item);
 		return;
 	}
 #endif
 #ifdef INVARIANTS
 	skipdbg = uma_dbg_zskip(zone, item);
 	if (skipdbg == false) {
 		if (zone->uz_flags & UMA_ZONE_MALLOC)
 			uma_dbg_free(zone, udata, item);
 		else
 			uma_dbg_free(zone, NULL, item);
 	}
 	if (zone->uz_dtor != NULL && (!skipdbg ||
 	    zone->uz_dtor != trash_dtor || zone->uz_ctor != trash_ctor))
 #else
 	if (zone->uz_dtor != NULL)
 #endif
 		zone->uz_dtor(item, zone->uz_size, udata);
 
 	/*
 	 * The race here is acceptable.  If we miss it we'll just have to wait
 	 * a little longer for the limits to be reset.
 	 */
 	if (zone->uz_flags & UMA_ZFLAG_FULL)
 		goto zfree_item;
 
 	/*
 	 * If possible, free to the per-CPU cache.  There are two
 	 * requirements for safe access to the per-CPU cache: (1) the thread
 	 * accessing the cache must not be preempted or yield during access,
 	 * and (2) the thread must not migrate CPUs without switching which
 	 * cache it accesses.  We rely on a critical section to prevent
 	 * preemption and migration.  We release the critical section in
 	 * order to acquire the zone mutex if we are unable to free to the
 	 * current cache; when we re-acquire the critical section, we must
 	 * detect and handle migration if it has occurred.
 	 */
 zfree_restart:
 	critical_enter();
 	cpu = curcpu;
 	cache = &zone->uz_cpu[cpu];
 
 zfree_start:
 	/*
 	 * Try to free into the allocbucket first to give LIFO ordering
 	 * for cache-hot datastructures.  Spill over into the freebucket
 	 * if necessary.  Alloc will swap them if one runs dry.
 	 */
 	bucket = cache->uc_allocbucket;
 	if (bucket == NULL || bucket->ub_cnt >= bucket->ub_entries)
 		bucket = cache->uc_freebucket;
 	if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
 		KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL,
 		    ("uma_zfree: Freeing to non free bucket index."));
 		bucket->ub_bucket[bucket->ub_cnt] = item;
 		bucket->ub_cnt++;
 		cache->uc_frees++;
 		critical_exit();
 		return;
 	}
 
 	/*
 	 * We must go back to the zone, which requires acquiring the zone
 	 * lock, which in turn means we must release and re-acquire the
 	 * critical section.  Since the critical section is released, we may
 	 * be preempted or migrate.  As such, make sure not to maintain any
 	 * thread-local state specific to the cache from prior to releasing
 	 * the critical section.
 	 */
 	critical_exit();
 	if (zone->uz_count == 0 || bucketdisable)
 		goto zfree_item;
 
 	lockfail = 0;
 	if (ZONE_TRYLOCK(zone) == 0) {
 		/* Record contention to size the buckets. */
 		ZONE_LOCK(zone);
 		lockfail = 1;
 	}
 	critical_enter();
 	cpu = curcpu;
 	cache = &zone->uz_cpu[cpu];
 
 	/*
 	 * Since we have locked the zone we may as well send back our stats.
 	 */
 	atomic_add_long(&zone->uz_allocs, cache->uc_allocs);
 	atomic_add_long(&zone->uz_frees, cache->uc_frees);
 	cache->uc_allocs = 0;
 	cache->uc_frees = 0;
 
 	bucket = cache->uc_freebucket;
 	if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
 		ZONE_UNLOCK(zone);
 		goto zfree_start;
 	}
 	cache->uc_freebucket = NULL;
 	/* We are no longer associated with this CPU. */
 	critical_exit();
 
 	/*
 	 * NOTE(review): 'domain' is only used further down to detect
 	 * migration for NUMA zones; the full bucket is always queued on
 	 * uz_domain[0], whereas uma_zalloc_arg() searches
 	 * uz_domain[domain] for NUMA zones.  Confirm this is intended.
 	 */
 	if ((zone->uz_flags & UMA_ZONE_NUMA) != 0)
 		domain = PCPU_GET(domain);
 	else 
 		domain = 0;
 	zdom = &zone->uz_domain[0];
 
 	/* Can we throw this on the zone full list? */
 	if (bucket != NULL) {
 		CTR3(KTR_UMA,
 		    "uma_zfree: zone %s(%p) putting bucket %p on free list",
 		    zone->uz_name, zone, bucket);
 		/* ub_cnt is pointing to the last free item */
 		KASSERT(bucket->ub_cnt != 0,
 		    ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n"));
 		if ((zone->uz_flags & UMA_ZONE_NOBUCKETCACHE) != 0) {
 			ZONE_UNLOCK(zone);
 			bucket_drain(zone, bucket);
 			bucket_free(zone, bucket, udata);
 			goto zfree_restart;
 		} else
 			LIST_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link);
 	}
 
 	/*
 	 * We bump the uz count when the cache size is insufficient to
 	 * handle the working set.
 	 */
 	if (lockfail && zone->uz_count < BUCKET_MAX)
 		zone->uz_count++;
 	ZONE_UNLOCK(zone);
 
 	bucket = bucket_alloc(zone, udata, M_NOWAIT);
 	CTR3(KTR_UMA, "uma_zfree: zone %s(%p) allocated bucket %p",
 	    zone->uz_name, zone, bucket);
 	if (bucket) {
 		critical_enter();
 		cpu = curcpu;
 		cache = &zone->uz_cpu[cpu];
 		if (cache->uc_freebucket == NULL &&
 		    ((zone->uz_flags & UMA_ZONE_NUMA) == 0 ||
 		    domain == PCPU_GET(domain))) {
 			cache->uc_freebucket = bucket;
 			goto zfree_start;
 		}
 		/*
 		 * We lost the race, start over.  We have to drop our
 		 * critical section to free the bucket.
 		 */
 		critical_exit();
 		bucket_free(zone, bucket, udata);
 		goto zfree_restart;
 	}
 
 	/*
 	 * If nothing else caught this, we'll just do an internal free.
 	 * The destructor has already run above, hence SKIP_DTOR.
 	 */
 zfree_item:
 	zone_free_item(zone, item, udata, SKIP_DTOR);
 
 	return;
 }
 
 /*
  * Free an item directly to its zone without going through the per-CPU
  * buckets.  After the entry checks the item is handed to zone_free_item()
  * with SKIP_NONE, so the zone's dtor and fini hooks are both run there.
  * A NULL item is ignored.
  */
 void
 uma_zfree_domain(uma_zone_t zone, void *item, void *udata)
 {
 
 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
 	random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA);
 
 	CTR2(KTR_UMA, "uma_zfree_domain thread %x zone %s", curthread,
 	    zone->uz_name);
 
 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
 	    ("uma_zfree_domain: called with spinlock or critical section held"));
 
         /* uma_zfree(..., NULL) does nothing, to match free(9). */
         if (item == NULL)
                 return;
 	zone_free_item(zone, item, udata, SKIP_NONE);
 }
 
 /*
  * Return one item to its slab.  Must be called with the keg lock held and
  * with a slab that belongs to 'keg'.
  *
  * Based on the slab's free count before the item is returned, the slab is
  * moved to the free list (it is about to become entirely free) or to the
  * partial list (it was completely full).  The item's bit is then set in
  * the free bitmap and the free counters of slab and keg are incremented.
  */
 static void
 slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item)
 {
 	uma_domain_t kdom;
 	uint8_t idx;
 
 	mtx_assert(&keg->uk_lock, MA_OWNED);
 	MPASS(keg == slab->us_keg);
 
 	kdom = &keg->uk_domain[slab->us_domain];
 
 	/* Index of the item within the slab's data area. */
 	idx = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
 
 	/* Does the slab have to change lists? */
 	if (slab->us_freecount + 1 == keg->uk_ipers) {
 		LIST_REMOVE(slab, us_link);
 		LIST_INSERT_HEAD(&kdom->ud_free_slab, slab, us_link);
 	} else if (slab->us_freecount == 0) {
 		LIST_REMOVE(slab, us_link);
 		LIST_INSERT_HEAD(&kdom->ud_part_slab, slab, us_link);
 	}
 
 	/* Slab bookkeeping followed by keg statistics. */
 	BIT_SET(SLAB_SETSIZE, idx, &slab->us_free);
 	slab->us_freecount++;
 	keg->uk_free++;
 }
 
 /*
  * Return an array of items to the slabs they came from.
  *
  * Arguments:
  *	zone    The zone the items belong to.
  *	bucket  Array of item pointers.
  *	cnt     Number of entries in 'bucket'.
  *
  * The slab of each item is located either from the item's address (via the
  * keg's hash or the slab header offset) or, for UMA_ZONE_VTOSLAB zones, via
  * vtoslab(); in the latter case the keg lock is switched if the slab
  * belongs to a different keg.  If a keg flagged UMA_ZFLAG_FULL has dropped
  * below its page limit, the flag is cleared on the keg and, after the keg
  * lock is released, on the zone as well; sleepers on both are woken.
  */
 static void
 zone_release(uma_zone_t zone, void **bucket, int cnt)
 {
 	void *item;
 	uma_slab_t slab;
 	uma_keg_t keg;
 	uint8_t *mem;
 	int clearfull;
 	int i;
 
 	clearfull = 0;
 	keg = zone_first_keg(zone);
 	KEG_LOCK(keg);
 	for (i = 0; i < cnt; i++) {
 		item = bucket[i];
 		if (!(zone->uz_flags & UMA_ZONE_VTOSLAB)) {
 			/* Mask the item address down to the start of its slab. */
 			mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
 			if (zone->uz_flags & UMA_ZONE_HASH) {
 				slab = hash_sfind(&keg->uk_hash, mem);
 			} else {
 				mem += keg->uk_pgoff;
 				slab = (uma_slab_t)mem;
 			}
 		} else {
 			slab = vtoslab((vm_offset_t)item);
 			if (slab->us_keg != keg) {
 				KEG_UNLOCK(keg);
 				keg = slab->us_keg;
 				KEG_LOCK(keg);
 			}
 		}
 		slab_free_item(keg, slab, item);
 		if (keg->uk_flags & UMA_ZFLAG_FULL) {
 			if (keg->uk_pages < keg->uk_maxpages) {
 				keg->uk_flags &= ~UMA_ZFLAG_FULL;
 				clearfull = 1;
 			}
 
 			/*
 			 * We can handle one more allocation. Since we're
 			 * clearing ZFLAG_FULL, wake up all procs blocked
 			 * on pages. This should be uncommon, so keeping this
 			 * simple for now (rather than adding count of blocked
 			 * threads etc).
 			 */
 			wakeup(keg);
 		}
 	}
 	KEG_UNLOCK(keg);
 	if (clearfull) {
 		ZONE_LOCK(zone);
 		zone->uz_flags &= ~UMA_ZFLAG_FULL;
 		wakeup(zone);
 		ZONE_UNLOCK(zone);
 	}
 
 }
 
 /*
  * Frees a single item to any zone.
  *
  * Arguments:
  *	zone   The zone to free to
  *	item   The item we're freeing
  *	udata  User supplied data for the dtor
  *	skip   Skip dtors and finis: the dtor runs only when skip is below
  *	       SKIP_DTOR, the fini only when skip is below SKIP_FINI
  *
  * The item is always handed to the zone's release method, and uz_frees is
  * always incremented, regardless of 'skip'.
  */
 static void
 zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip)
 {
 #ifdef INVARIANTS
 	bool skipdbg;
 
 	skipdbg = uma_dbg_zskip(zone, item);
 	if (skip == SKIP_NONE && !skipdbg) {
 		if (zone->uz_flags & UMA_ZONE_MALLOC)
 			uma_dbg_free(zone, udata, item);
 		else
 			uma_dbg_free(zone, NULL, item);
 	}
 
 	/* Both branches of this #ifdef guard the single dtor call below. */
 	if (skip < SKIP_DTOR && zone->uz_dtor != NULL &&
 	    (!skipdbg || zone->uz_dtor != trash_dtor ||
 	    zone->uz_ctor != trash_ctor))
 #else
 	if (skip < SKIP_DTOR && zone->uz_dtor != NULL)
 #endif
 		zone->uz_dtor(item, zone->uz_size, udata);
 
 	if (skip < SKIP_FINI && zone->uz_fini)
 		zone->uz_fini(item, zone->uz_size);
 
 	atomic_add_long(&zone->uz_frees, 1);
 	zone->uz_release(zone->uz_arg, &item, 1);
 }
 
 /*
  * See uma.h
  *
  * Sets the item limit of the zone's first keg.  The request is rounded up
  * to a whole number of slabs and the resulting effective limit is
  * returned; 0 is returned when the zone has no keg.
  */
 int
 uma_zone_set_max(uma_zone_t zone, int nitems)
 {
 	uma_keg_t keg;
 
 	keg = zone_first_keg(zone);
 	if (keg == NULL)
 		return (0);
 	KEG_LOCK(keg);
 	keg->uk_maxpages = (nitems / keg->uk_ipers) * keg->uk_ppera;
 	/*
 	 * Round up when the whole slabs computed above hold fewer than
 	 * nitems.  Capacity is slabs * uk_ipers, so convert pages back to
 	 * slabs first; multiplying pages by uk_ipers overstates capacity
 	 * when a slab spans more than one page.
 	 */
 	if ((keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers < nitems)
 		keg->uk_maxpages += keg->uk_ppera;
 	nitems = (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers;
 	KEG_UNLOCK(keg);
 
 	return (nitems);
 }
 
 /*
  * See uma.h
  *
  * Reports the item limit of the zone's first keg, derived from its page
  * limit, or 0 when the zone has no keg.
  */
 int
 uma_zone_get_max(uma_zone_t zone)
 {
 	uma_keg_t first;
 	int limit;
 
 	first = zone_first_keg(zone);
 	if (first != NULL) {
 		KEG_LOCK(first);
 		limit = (first->uk_maxpages / first->uk_ppera) *
 		    first->uk_ipers;
 		KEG_UNLOCK(first);
 	} else
 		limit = 0;
 
 	return (limit);
 }
 
 /*
  * See uma.h
  *
  * Stores the warning string pointer in the zone under the zone lock.  Only
  * the pointer is saved; the string itself is not copied.
  */
 void
 uma_zone_set_warning(uma_zone_t zone, const char *warning)
 {
 
 	ZONE_LOCK(zone);
 	zone->uz_warning = warning;
 	ZONE_UNLOCK(zone);
 }
 
 /*
  * See uma.h
  *
  * Initializes the zone's uz_maxaction task, under the zone lock, with
  * maxaction as its handler and the zone itself as the handler's argument.
  */
 void
 uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction)
 {
 
 	ZONE_LOCK(zone);
 	TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone);
 	ZONE_UNLOCK(zone);
 }
 
 /*
  * See uma.h
  *
  * Returns allocations minus frees, summed over the zone counters and every
  * per-CPU cache.  A negative total is reported as 0.
  */
 int
 uma_zone_get_cur(uma_zone_t zone)
 {
 	int64_t outstanding;
 	u_int cpu;
 
 	ZONE_LOCK(zone);
 	outstanding = zone->uz_allocs - zone->uz_frees;
 	/*
 	 * See the comment in sysctl_vm_zone_stats() regarding the safety of
 	 * accessing the per-cpu caches. With the zone lock held, it is safe,
 	 * but can potentially result in stale data.
 	 */
 	CPU_FOREACH(cpu) {
 		outstanding += zone->uz_cpu[cpu].uc_allocs -
 		    zone->uz_cpu[cpu].uc_frees;
 	}
 	ZONE_UNLOCK(zone);
 
 	if (outstanding < 0)
 		outstanding = 0;
 	return (outstanding);
 }
 
 /*
  * See uma.h
  *
  * Replaces the init callback of the zone's first keg.  Asserts that the
  * zone has a keg and that the keg holds no pages yet.
  */
 void
 uma_zone_set_init(uma_zone_t zone, uma_init uminit)
 {
 	uma_keg_t keg;
 
 	keg = zone_first_keg(zone);
 	KASSERT(keg != NULL, ("uma_zone_set_init: Invalid zone type"));
 	KEG_LOCK(keg);
 	KASSERT(keg->uk_pages == 0,
 	    ("uma_zone_set_init on non-empty keg"));
 	keg->uk_init = uminit;
 	KEG_UNLOCK(keg);
 }
 
 /*
  * See uma.h
  *
  * Replaces the fini callback of the zone's first keg.  Asserts that the
  * zone has a keg and that the keg holds no pages yet.
  */
 void
 uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
 {
 	uma_keg_t keg;
 
 	keg = zone_first_keg(zone);
 	KASSERT(keg != NULL, ("uma_zone_set_fini: Invalid zone type"));
 	KEG_LOCK(keg);
 	KASSERT(keg->uk_pages == 0,
 	    ("uma_zone_set_fini on non-empty keg"));
 	keg->uk_fini = fini;
 	KEG_UNLOCK(keg);
 }
 
 /*
  * See uma.h
  *
  * Replaces the zone-level init callback under the zone lock.  Asserts that
  * the zone's first keg holds no pages yet.
  * NOTE(review): the assertion dereferences zone_first_keg() without a NULL
  * check, unlike uma_zone_set_init() -- confirm callers never pass a zone
  * without a keg.
  */
 void
 uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
 {
 
 	ZONE_LOCK(zone);
 	KASSERT(zone_first_keg(zone)->uk_pages == 0,
 	    ("uma_zone_set_zinit on non-empty keg"));
 	zone->uz_init = zinit;
 	ZONE_UNLOCK(zone);
 }
 
 /*
  * See uma.h
  *
  * Replaces the zone-level fini callback under the zone lock.  Asserts that
  * the zone's first keg holds no pages yet.
  * NOTE(review): the assertion dereferences zone_first_keg() without a NULL
  * check, unlike uma_zone_set_fini() -- confirm callers never pass a zone
  * without a keg.
  */
 void
 uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
 {
 
 	ZONE_LOCK(zone);
 	KASSERT(zone_first_keg(zone)->uk_pages == 0,
 	    ("uma_zone_set_zfini on non-empty keg"));
 	zone->uz_fini = zfini;
 	ZONE_UNLOCK(zone);
 }
 
 /*
  * See uma.h
  *
  * Replaces the page-free callback (uk_freef) of the zone's first keg while
  * holding the keg lock.  Asserts that the zone has a keg.
  */
 /* XXX uk_freef is not actually used with the zone locked */
 void
 uma_zone_set_freef(uma_zone_t zone, uma_free freef)
 {
 	uma_keg_t keg;
 
 	keg = zone_first_keg(zone);
 	KASSERT(keg != NULL, ("uma_zone_set_freef: Invalid zone type"));
 	KEG_LOCK(keg);
 	keg->uk_freef = freef;
 	KEG_UNLOCK(keg);
 }
 
 /*
  * See uma.h
  *
  * Replaces the page-allocation callback (uk_allocf) of the zone's first keg
  * while holding the keg lock.  Asserts that the zone has a keg, matching
  * uma_zone_set_freef().
  */
 /* XXX uk_allocf is not actually used with the zone locked */
 void
 uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
 {
 	uma_keg_t keg;
 
 	keg = zone_first_keg(zone);
 	KASSERT(keg != NULL, ("uma_zone_set_allocf: Invalid zone type"));
 	KEG_LOCK(keg);
 	keg->uk_allocf = allocf;
 	KEG_UNLOCK(keg);
 }
 
 /*
  * See uma.h
  *
  * Records items as the reserve (uk_reserve) of the zone's first keg.  Does
  * nothing when the zone has no keg.
  */
 void
 uma_zone_reserve(uma_zone_t zone, int items)
 {
 	uma_keg_t first;
 
 	first = zone_first_keg(zone);
 	if (first != NULL) {
 		KEG_LOCK(first);
 		first->uk_reserve = items;
 		KEG_UNLOCK(first);
 	}
 }
 
 /*
  * See uma.h
  *
  * Sizes the zone's first keg for count items: computes the pages needed,
  * allocates kernel virtual address space for them where required, caps the
  * keg at that many pages, switches its page allocator and marks it
  * UMA_ZONE_NOFREE.
  *
  * Returns 1 on success, or 0 when the zone has no keg or the KVA
  * allocation fails.
  */
 int
 uma_zone_reserve_kva(uma_zone_t zone, int count)
 {
 	uma_keg_t keg;
 	vm_offset_t kva;
 	u_int pages;
 
 	keg = zone_first_keg(zone);
 	if (keg == NULL)
 		return (0);
 	/* Slabs needed for count items, rounded up, then converted to pages. */
 	pages = count / keg->uk_ipers;
 
 	if (pages * keg->uk_ipers < count)
 		pages++;
 	pages *= keg->uk_ppera;
 
 	/*
 	 * With UMA_MD_SMALL_ALLOC, KVA is only reserved for multi-page slabs;
 	 * otherwise it is always reserved.
 	 */
 #ifdef UMA_MD_SMALL_ALLOC
 	if (keg->uk_ppera > 1) {
 #else
 	if (1) {
 #endif
 		kva = kva_alloc((vm_size_t)pages * PAGE_SIZE);
 		if (kva == 0)
 			return (0);
 	} else
 		kva = 0;
 	KEG_LOCK(keg);
 	keg->uk_kva = kva;
 	keg->uk_offset = 0;
 	keg->uk_maxpages = pages;
 	/* Pick the page allocator that matches the choice made above. */
 #ifdef UMA_MD_SMALL_ALLOC
 	keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc;
 #else
 	keg->uk_allocf = noobj_alloc;
 #endif
 	keg->uk_flags |= UMA_ZONE_NOFREE;
 	KEG_UNLOCK(keg);
 
 	return (1);
 }
 
 /*
  * See uma.h
  *
  * Allocates enough slabs to hold items and puts each one on the free-slab
  * list of the domain it landed in.  Stops early if a slab allocation
  * fails; does nothing when the zone has no keg.
  */
 void
 uma_prealloc(uma_zone_t zone, int items)
 {
 	uma_domain_t kdom;
 	uma_slab_t fresh;
 	uma_keg_t keg;
 	int next_domain, remaining;
 
 	keg = zone_first_keg(zone);
 	if (keg == NULL)
 		return;
 	KEG_LOCK(keg);
 	/* Number of slabs needed to hold items, rounded up. */
 	remaining = items / keg->uk_ipers;
 	if (remaining * keg->uk_ipers < items)
 		remaining++;
 	/* Request each successive slab from the next domain in turn. */
 	for (next_domain = 0; remaining > 0; remaining--) {
 		fresh = keg_alloc_slab(keg, zone, next_domain, M_WAITOK);
 		if (fresh == NULL)
 			break;
 		MPASS(fresh->us_keg == keg);
 		kdom = &keg->uk_domain[fresh->us_domain];
 		LIST_INSERT_HEAD(&kdom->ud_free_slab, fresh, us_link);
 		next_domain = (next_domain + 1) % vm_ndomains;
 	}
 	KEG_UNLOCK(keg);
 }
 
 /*
  * Drains the zones to give memory back.  The caller must hold
  * uma_drain_lock exclusively (asserted below).
  *
  * Arguments:
  *	kmem_danger  Force the second, more aggressive pass (per-CPU caches
  *	             drained, then all zones drained again) even when
  *	             vm_page_count_min() is false
  */
 static void
 uma_reclaim_locked(bool kmem_danger)
 {
 
 	CTR0(KTR_UMA, "UMA: vm asked us to release pages!");
 	sx_assert(&uma_drain_lock, SA_XLOCKED);
 	bucket_enable();
 	zone_foreach(zone_drain);
 	if (vm_page_count_min() || kmem_danger) {
 		cache_drain_safe(NULL);
 		zone_foreach(zone_drain);
 	}
 	/*
 	 * Draining the other zones may have freed slabs, but the slab zone
 	 * is visited early in the passes above.  Drain it again so that
 	 * pages which became empty once the other zones were drained can
 	 * be freed.  We have to do the same for buckets.
 	 */
 	zone_drain(slabzone);
 	bucket_zone_drain();
 }
 
 /*
  * Runs a reclaim pass with kmem_danger false, taking uma_drain_lock
  * exclusively around it.
  */
 void
 uma_reclaim(void)
 {
 
 	sx_xlock(&uma_drain_lock);
 	uma_reclaim_locked(false);
 	sx_xunlock(&uma_drain_lock);
 }
 
 /*
  * Count of pending reclaim requests: incremented by uma_reclaim_wakeup()
  * and reset to 0 by uma_reclaim_worker() after a pass.
  */
 static volatile int uma_reclaim_needed;
 
 /*
  * Requests a reclaim pass from uma_reclaim_worker().  Only the request that
  * moves the counter off zero issues the wakeup; the address of uma_reclaim
  * serves as the sleep channel.
  */
 void
 uma_reclaim_wakeup(void)
 {
 
 	if (atomic_fetchadd_int(&uma_reclaim_needed, 1) == 0)
 		wakeup(uma_reclaim);
 }
 
 /*
  * Endless loop that waits for reclaim requests and services them: sleeps
  * until uma_reclaim_needed is non-zero, invokes the vm_lowmem event
  * handlers, runs a reclaim pass with kmem_danger true, then clears the
  * request counter.  The argument is unused.
  */
 void
 uma_reclaim_worker(void *arg __unused)
 {
 
 	for (;;) {
 		sx_xlock(&uma_drain_lock);
 		/* Re-check the counter after every wakeup or hz timeout. */
 		while (atomic_load_int(&uma_reclaim_needed) == 0)
 			sx_sleep(uma_reclaim, &uma_drain_lock, PVM, "umarcl",
 			    hz);
 		/* The lowmem handlers are invoked without the drain lock. */
 		sx_xunlock(&uma_drain_lock);
 		EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM);
 		sx_xlock(&uma_drain_lock);
 		uma_reclaim_locked(true);
 		atomic_store_int(&uma_reclaim_needed, 0);
 		sx_xunlock(&uma_drain_lock);
 		/* Don't fire more than once per-second. */
 		pause("umarclslp", hz);
 	}
 }
 
 /*
  * See uma.h
  *
  * Samples the zone's UMA_ZFLAG_FULL bit under the zone lock.  The result is
  * non-zero when the bit is set and 0 otherwise.
  */
 int
 uma_zone_exhausted(uma_zone_t zone)
 {
 	int is_full;
 
 	ZONE_LOCK(zone);
 	is_full = zone->uz_flags & UMA_ZFLAG_FULL;
 	ZONE_UNLOCK(zone);
 
 	return (is_full);
 }
 
 /*
  * Same test as uma_zone_exhausted(), but reads uz_flags without taking the
  * zone lock.
  */
 int
 uma_zone_exhausted_nolock(uma_zone_t zone)
 {
 	return (zone->uz_flags & UMA_ZFLAG_FULL);
 }
 
 /*
  * Allocates size bytes directly from kmem and tracks them with a slab
  * header taken from slabzone.
  *
  * Arguments:
  *	size    Number of bytes to allocate
  *	domain  Domain to allocate from, or UMA_ANYDOMAIN for no preference
  *	wait    Allocation flags passed on to the slab and kmem allocators
  *
  * Returns the address of the memory, or NULL if either the slab header or
  * the memory itself could not be allocated.
  */
 void *
 uma_large_malloc_domain(vm_size_t size, int domain, int wait)
 {
+	struct vmem *arena;
 	vm_offset_t addr;
 	uma_slab_t slab;
 
+#if VM_NRESERVLEVEL > 0
+	if (__predict_true((wait & M_EXEC) == 0))
+		arena = kernel_arena;
+	else
+		arena = kernel_rwx_arena;
+#else
+	arena = kernel_arena;
+#endif
+
 	slab = zone_alloc_item(slabzone, NULL, domain, wait);
 	if (slab == NULL)
 		return (NULL);
 	if (domain == UMA_ANYDOMAIN)
-		addr = kmem_malloc(kernel_arena, size, wait);
+		addr = kmem_malloc(arena, size, wait);
 	else
-		addr = kmem_malloc_domain(domain, size, wait);
+		addr = kmem_malloc_domain(arena, domain, size, wait);
 	if (addr != 0) {
 		/* Fill in the slab header describing this allocation. */
 		vsetslab(addr, slab);
 		slab->us_data = (void *)addr;
 		slab->us_flags = UMA_SLAB_KERNEL | UMA_SLAB_MALLOC;
+#if VM_NRESERVLEVEL > 0
+		if (__predict_false(arena == kernel_rwx_arena))
+			slab->us_flags |= UMA_SLAB_KRWX;
+#endif
 		slab->us_size = size;
 		slab->us_domain = vm_phys_domain(PHYS_TO_VM_PAGE(
 		    pmap_kextract(addr)));
 		uma_total_inc(size);
 	} else {
 		/* No memory: give the unused slab header back. */
 		zone_free_item(slabzone, slab, NULL, SKIP_NONE);
 	}
 
 	return ((void *)addr);
 }
 
 /*
  * Convenience wrapper: a large allocation with no domain preference.
  */
 void *
 uma_large_malloc(vm_size_t size, int wait)
 {
 
 	return (uma_large_malloc_domain(size, UMA_ANYDOMAIN, wait));
 }
 
 /*
  * Releases memory obtained from uma_large_malloc(): frees the data pages
  * described by the slab header, adjusts the UMA total, and returns the
  * header itself to slabzone.  Asserts that the slab carries
  * UMA_SLAB_KERNEL.
  */
 void
 uma_large_free(uma_slab_t slab)
 {
+	struct vmem *arena;
 
 	KASSERT((slab->us_flags & UMA_SLAB_KERNEL) != 0,
 	    ("uma_large_free:  Memory not allocated with uma_large_malloc."));
-	kmem_free(kernel_arena, (vm_offset_t)slab->us_data, slab->us_size);
+#if VM_NRESERVLEVEL > 0
+	if (__predict_true((slab->us_flags & UMA_SLAB_KRWX) == 0))
+		arena = kernel_arena;
+	else
+		arena = kernel_rwx_arena;
+#else
+	arena = kernel_arena;
+#endif
+	kmem_free(arena, (vm_offset_t)slab->us_data, slab->us_size);
 	uma_total_dec(slab->us_size);
 	zone_free_item(slabzone, slab, NULL, SKIP_NONE);
 }
 
 /*
  * Zeroes the first zone->uz_size bytes of item.
  */
 static void
 uma_zero_item(void *item, uma_zone_t zone)
 {
 
 	bzero(item, zone->uz_size);
 }
 
 /*
  * Returns the current value of uma_kmem_limit.
  */
 unsigned long
 uma_limit(void)
 {
 
 	return (uma_kmem_limit);
 }
 
 /*
  * Sets uma_kmem_limit to limit.  No lock is taken here.
  */
 void
 uma_set_limit(unsigned long limit)
 {
 
 	uma_kmem_limit = limit;
 }
 
 /*
  * Returns the current value of uma_kmem_total.
  */
 unsigned long
 uma_size(void)
 {
 
 	return (uma_kmem_total);
 }
 
 /*
  * Returns uma_kmem_limit minus uma_kmem_total as a signed value, so the
  * result is negative when the total exceeds the limit.
  */
 long
 uma_avail(void)
 {
 
 	return (uma_kmem_limit - uma_kmem_total);
 }
 
 /*
  * Prints every zone by handing uma_print_zone to zone_foreach().
  */
 void
 uma_print_stats(void)
 {
 	zone_foreach(uma_print_zone);
 }
 
 /*
  * Prints one line describing a slab: owning keg, data pointer and free
  * item count.
  */
 static void
 slab_print(uma_slab_t slab)
 {
 
 	printf("slab: keg %p, data %p, freecount %d\n", slab->us_keg,
 	    slab->us_data, slab->us_freecount);
 }
 
 /*
  * Prints the alloc and free bucket pointers of a per-CPU cache together
  * with each bucket's item count (0 when the bucket is absent).
  */
 static void
 cache_print(uma_cache_t cache)
 {
 	int alloc_cnt, free_cnt;
 
 	alloc_cnt = 0;
 	if (cache->uc_allocbucket)
 		alloc_cnt = cache->uc_allocbucket->ub_cnt;
 	free_cnt = 0;
 	if (cache->uc_freebucket)
 		free_cnt = cache->uc_freebucket->ub_cnt;
 	printf("alloc: %p(%d), free: %p(%d)\n",
 	    cache->uc_allocbucket, alloc_cnt,
 	    cache->uc_freebucket, free_cnt);
 }
 
 /*
  * Prints a summary line for the keg, followed by the partially used, free
  * and full slab lists of every domain.
  */
 static void
 uma_print_keg(uma_keg_t keg)
 {
 	uma_domain_t kdom;
 	uma_slab_t cur;
 	int d;
 
 	printf("keg: %s(%p) size %d(%d) flags %#x ipers %d ppera %d "
 	    "out %d free %d limit %d\n",
 	    keg->uk_name, keg, keg->uk_size, keg->uk_rsize, keg->uk_flags,
 	    keg->uk_ipers, keg->uk_ppera,
 	    (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free,
 	    keg->uk_free, (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers);
 	for (d = 0; d < vm_ndomains; d++) {
 		kdom = &keg->uk_domain[d];
 
 		printf("Part slabs:\n");
 		LIST_FOREACH(cur, &kdom->ud_part_slab, us_link) {
 			slab_print(cur);
 		}
 
 		printf("Free slabs:\n");
 		LIST_FOREACH(cur, &kdom->ud_free_slab, us_link) {
 			slab_print(cur);
 		}
 
 		printf("Full slabs:\n");
 		LIST_FOREACH(cur, &kdom->ud_full_slab, us_link) {
 			slab_print(cur);
 		}
 	}
 }
 
 /*
  * Prints a summary line for the zone, then every keg linked to it, then
  * the cache of each CPU.
  */
 void
 uma_print_zone(uma_zone_t zone)
 {
 	uma_klink_t link;
 	int cpu;
 
 	printf("zone: %s(%p) size %d flags %#x\n",
 	    zone->uz_name, zone, zone->uz_size, zone->uz_flags);
 	LIST_FOREACH(link, &zone->uz_kegs, kl_link) {
 		uma_print_keg(link->kl_keg);
 	}
 	CPU_FOREACH(cpu) {
 		printf("CPU %d Cache:\n", cpu);
 		cache_print(&zone->uz_cpu[cpu]);
 	}
 }
 
 #ifdef DDB
 /*
  * Sum a zone's statistics with those of its per-CPU caches.  Every output
  * pointer is optional; a statistic is stored only when its pointer is
  * non-NULL.
  *
  * Note: the zone statistics themselves are not updated, because the
  * per-CPU cache statistics cannot be safely cleared from here.
  *
  * XXXRW: Following the uc_allocbucket and uc_freebucket pointers here isn't
  * safe from off-CPU; we should modify the caches to track this information
  * directly so that we don't have to.
  */
 static void
 uma_zone_sumstat(uma_zone_t z, int *cachefreep, uint64_t *allocsp,
     uint64_t *freesp, uint64_t *sleepsp)
 {
 	uma_cache_t pcpu;
 	uint64_t nalloc, nfree, nsleep;
 	int ncached, id;
 
 	/* Start from the zone-wide counters, then fold in each CPU. */
 	nalloc = z->uz_allocs;
 	nfree = z->uz_frees;
 	nsleep = z->uz_sleeps;
 	ncached = 0;
 	CPU_FOREACH(id) {
 		pcpu = &z->uz_cpu[id];
 		if (pcpu->uc_allocbucket != NULL)
 			ncached += pcpu->uc_allocbucket->ub_cnt;
 		if (pcpu->uc_freebucket != NULL)
 			ncached += pcpu->uc_freebucket->ub_cnt;
 		nalloc += pcpu->uc_allocs;
 		nfree += pcpu->uc_frees;
 	}
 
 	if (cachefreep != NULL)
 		*cachefreep = ncached;
 	if (allocsp != NULL)
 		*allocsp = nalloc;
 	if (freesp != NULL)
 		*freesp = nfree;
 	if (sleepsp != NULL)
 		*sleepsp = nsleep;
 }
 #endif /* DDB */
 
 /*
  * Sysctl handler that reports how many zones exist, counted by walking
  * every keg's zone list under the read-locked uma_rwlock.
  */
 static int
 sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
 {
 	uma_keg_t keg;
 	uma_zone_t zone;
 	int nzones;
 
 	nzones = 0;
 	rw_rlock(&uma_rwlock);
 	LIST_FOREACH(keg, &uma_kegs, uk_link) {
 		LIST_FOREACH(zone, &keg->uk_zones, uz_link) {
 			nzones++;
 		}
 	}
 	rw_runlock(&uma_rwlock);
 
 	return (sysctl_handle_int(oidp, &nzones, 0, req));
 }
 
 /*
  * Sysctl handler that exports statistics for every zone as a binary
  * stream: one uma_stream_header, then for each zone a uma_type_header
  * followed by one uma_percpu_stat record per possible CPU id
  * (mp_maxid + 1 records).
  *
  * Returns 0 on success or the error from wiring the user buffer or
  * finishing the sbuf.
  */
 static int
 sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct uma_stream_header ush;
 	struct uma_type_header uth;
 	struct uma_percpu_stat *ups;
 	uma_bucket_t bucket;
 	uma_zone_domain_t zdom;
 	struct sbuf sbuf;
 	uma_cache_t cache;
 	uma_klink_t kl;
 	uma_keg_t kz;
 	uma_zone_t z;
 	uma_keg_t k;
 	int count, error, i;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 	sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL);
 	/* Scratch array for the per-CPU records, reused for each zone. */
 	ups = malloc((mp_maxid + 1) * sizeof(*ups), M_TEMP, M_WAITOK);
 
 	/* First pass: count the zones for the stream header. */
 	count = 0;
 	rw_rlock(&uma_rwlock);
 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
 		LIST_FOREACH(z, &kz->uk_zones, uz_link)
 			count++;
 	}
 
 	/*
 	 * Insert stream header.
 	 */
 	bzero(&ush, sizeof(ush));
 	ush.ush_version = UMA_STREAM_VERSION;
 	ush.ush_maxcpus = (mp_maxid + 1);
 	ush.ush_count = count;
 	(void)sbuf_bcat(&sbuf, &ush, sizeof(ush));
 
 	/* Second pass: emit one record set per zone. */
 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
 		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
 			bzero(&uth, sizeof(uth));
 			ZONE_LOCK(z);
 			strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
 			uth.uth_align = kz->uk_align;
 			uth.uth_size = kz->uk_size;
 			uth.uth_rsize = kz->uk_rsize;
 			/*
 			 * Page and free counts are summed over all kegs of
 			 * the zone; uth_limit is assigned, not summed, so it
 			 * ends up holding the value for the last keg.
 			 */
 			LIST_FOREACH(kl, &z->uz_kegs, kl_link) {
 				k = kl->kl_keg;
 				uth.uth_maxpages += k->uk_maxpages;
 				uth.uth_pages += k->uk_pages;
 				uth.uth_keg_free += k->uk_free;
 				uth.uth_limit = (k->uk_maxpages / k->uk_ppera)
 				    * k->uk_ipers;
 			}
 
 			/*
 			 * A zone is secondary if it is not the first entry
 			 * on the keg's zone list.
 			 */
 			if ((z->uz_flags & UMA_ZONE_SECONDARY) &&
 			    (LIST_FIRST(&kz->uk_zones) != z))
 				uth.uth_zone_flags = UTH_ZONE_SECONDARY;
 
 			/* Items sitting in the zone's per-domain buckets. */
 			for (i = 0; i < vm_ndomains; i++) {
 				zdom = &z->uz_domain[i];
 				LIST_FOREACH(bucket, &zdom->uzd_buckets,
 				    ub_link)
 					uth.uth_zone_free += bucket->ub_cnt;
 			}
 			uth.uth_allocs = z->uz_allocs;
 			uth.uth_frees = z->uz_frees;
 			uth.uth_fails = z->uz_fails;
 			uth.uth_sleeps = z->uz_sleeps;
 			/*
 			 * While it is not normally safe to access the cache
 			 * bucket pointers while not on the CPU that owns the
 			 * cache, we only allow the pointers to be exchanged
 			 * without the zone lock held, not invalidated, so
 			 * accept the possible race associated with bucket
 			 * exchange during monitoring.
 			 */
 			for (i = 0; i < mp_maxid + 1; i++) {
 				bzero(&ups[i], sizeof(*ups));
 				/*
 				 * Internal kegs and absent CPUs keep an
 				 * all-zero record.
 				 */
 				if (kz->uk_flags & UMA_ZFLAG_INTERNAL ||
 				    CPU_ABSENT(i))
 					continue;
 				cache = &z->uz_cpu[i];
 				if (cache->uc_allocbucket != NULL)
 					ups[i].ups_cache_free +=
 					    cache->uc_allocbucket->ub_cnt;
 				if (cache->uc_freebucket != NULL)
 					ups[i].ups_cache_free +=
 					    cache->uc_freebucket->ub_cnt;
 				ups[i].ups_allocs = cache->uc_allocs;
 				ups[i].ups_frees = cache->uc_frees;
 			}
 			ZONE_UNLOCK(z);
 			/* Append to the sbuf only after dropping the zone lock. */
 			(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
 			for (i = 0; i < mp_maxid + 1; i++)
 				(void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
 		}
 	}
 	rw_runlock(&uma_rwlock);
 	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
 	free(ups, M_TEMP);
 	return (error);
 }
 
 int
 sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS)
 {
 	uma_zone_t zone = *(uma_zone_t *)arg1;
 	int error, max;
 
 	max = uma_zone_get_max(zone);
 	error = sysctl_handle_int(oidp, &max, 0, req);
 	if (error || !req->newptr)
 		return (error);
 
 	uma_zone_set_max(zone, max);
 
 	return (0);
 }
 
 /*
  * Sysctl handler reporting a zone's current item count as returned by
  * uma_zone_get_cur().  arg1 points at the zone pointer.
  */
 int
 sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS)
 {
 	uma_zone_t zone;
 	int inuse;
 
 	zone = *(uma_zone_t *)arg1;
 	inuse = uma_zone_get_cur(zone);
 
 	return (sysctl_handle_int(oidp, &inuse, 0, req));
 }
 
 #ifdef INVARIANTS
 /*
  * Finds the slab header for item: through vtoslab() for VTOSLAB zones,
  * otherwise through the first keg's hash table or its in-slab header
  * offset.
  */
 static uma_slab_t
 uma_dbg_getslab(uma_zone_t zone, void *item)
 {
 	uma_slab_t found;
 	uma_keg_t first;
 	uint8_t *base;
 
 	base = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
 	if (zone->uz_flags & UMA_ZONE_VTOSLAB)
 		return (vtoslab((vm_offset_t)base));
 
 	/*
 	 * It is safe to return the slab here even though the
 	 * zone is unlocked because the item's allocation state
 	 * essentially holds a reference.
 	 */
 	ZONE_LOCK(zone);
 	first = LIST_FIRST(&zone->uz_kegs)->kl_keg;
 	if (first->uk_flags & UMA_ZONE_HASH)
 		found = hash_sfind(&first->uk_hash, base);
 	else
 		found = (uma_slab_t)(base + first->uk_pgoff);
 	ZONE_UNLOCK(zone);
 
 	return (found);
 }
 
 /*
  * Zone-level wrapper around uma_dbg_kskip().  Zones without a keg always
  * skip the debug checks.
  */
 static bool
 uma_dbg_zskip(uma_zone_t zone, void *mem)
 {
 	uma_keg_t first;
 
 	first = zone_first_keg(zone);
 	if (first == NULL)
 		return (true);
 
 	return (uma_dbg_kskip(first, mem));
 }
 
 /*
  * Decides whether the debug checks are skipped for the item at mem.
  *
  * A dbg_divisor of 0 skips every item and 1 skips none.  Otherwise an item
  * index is derived from the address and only items whose index is a
  * multiple of dbg_divisor are checked.  Either uma_skip_cnt or uma_dbg_cnt
  * is bumped to record the decision.
  *
  * Returns true when the item is to be skipped.
  */
 static bool
 uma_dbg_kskip(uma_keg_t keg, void *mem)
 {
 	uintptr_t slot;
 
 	switch (dbg_divisor) {
 	case 0:
 		return (true);
 	case 1:
 		return (false);
 	default:
 		break;
 	}
 
 	/* Page number, refined to an item number when a page holds several. */
 	slot = (uintptr_t)mem >> PAGE_SHIFT;
 	if (keg->uk_ipers > 1)
 		slot = slot * keg->uk_ipers +
 		    ((uintptr_t)mem & PAGE_MASK) / keg->uk_rsize;
 
 	if (slot % dbg_divisor == 0) {
 		counter_u64_add(uma_dbg_cnt, 1);
 		return (false);
 	}
 	counter_u64_add(uma_skip_cnt, 1);
 
 	return (true);
 }
 
 /*
  * Marks item as allocated in the slab's debug bitmap so that
  * uma_dbg_free() can later validate the free.  When slab is NULL it is
  * looked up from the item.  Panics if the slab cannot be found or if the
  * item is already marked allocated.
  */
 static void
 uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
 {
 	uma_keg_t owner;
 	int idx;
 
 	if (slab == NULL &&
 	    (slab = uma_dbg_getslab(zone, item)) == NULL)
 		panic("uma: item %p did not belong to zone %s\n",
 		    item, zone->uz_name);
 
 	/* Index of the item within its slab. */
 	owner = slab->us_keg;
 	idx = ((uintptr_t)item - (uintptr_t)slab->us_data) / owner->uk_rsize;
 
 	if (BIT_ISSET(SLAB_SETSIZE, idx, &slab->us_debugfree))
 		panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n",
 		    item, zone, zone->uz_name, slab, idx);
 	BIT_SET_ATOMIC(SLAB_SETSIZE, idx, &slab->us_debugfree);
 }
 
 /*
  * Validates an address that is being freed and clears its bit in the
  * slab's debug bitmap.  When slab is NULL it is looked up from the item.
  * Panics if the item has no slab, lies outside the slab's item range, is
  * not aligned to an item boundary, or is not currently marked allocated.
  */
 static void
 uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
 {
 	uma_keg_t owner;
 	int idx;
 
 	if (slab == NULL &&
 	    (slab = uma_dbg_getslab(zone, item)) == NULL)
 		panic("uma: Freed item %p did not belong to zone %s\n",
 		    item, zone->uz_name);
 
 	/* Index of the item within its slab. */
 	owner = slab->us_keg;
 	idx = ((uintptr_t)item - (uintptr_t)slab->us_data) / owner->uk_rsize;
 
 	if (idx >= owner->uk_ipers)
 		panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n",
 		    item, zone, zone->uz_name, slab, idx);
 
 	/* The address must be exactly the start of item number idx. */
 	if (item != ((idx * owner->uk_rsize) + slab->us_data))
 		panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n",
 		    item, zone, zone->uz_name, slab, idx);
 
 	if (!BIT_ISSET(SLAB_SETSIZE, idx, &slab->us_debugfree))
 		panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n",
 		    item, zone, zone->uz_name, slab, idx);
 
 	BIT_CLR_ATOMIC(SLAB_SETSIZE, idx, &slab->us_debugfree);
 }
 #endif /* INVARIANTS */
 
 #ifdef DDB
 /*
  * DDB "show uma" command: prints one line per zone with its item size,
  * items in use, cached free items, total requests, sleeps and bucket
  * size.  Stops early when the pager is quit.
  */
 DB_SHOW_COMMAND(uma, db_show_uma)
 {
 	uma_bucket_t bucket;
 	uma_keg_t kz;
 	uma_zone_t z;
 	uma_zone_domain_t zdom;
 	uint64_t allocs, frees, sleeps;
 	int cachefree, i;
 
 	db_printf("%18s %8s %8s %8s %12s %8s %8s\n", "Zone", "Size", "Used",
 	    "Free", "Requests", "Sleeps", "Bucket");
 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
 		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
 			/* Internal kegs are reported from zone counters only. */
 			if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
 				allocs = z->uz_allocs;
 				frees = z->uz_frees;
 				sleeps = z->uz_sleeps;
 				cachefree = 0;
 			} else
 				uma_zone_sumstat(z, &cachefree, &allocs,
 				    &frees, &sleeps);
 			/*
 			 * The keg's free items are counted for every zone
 			 * except secondary zones that are not first on the
 			 * keg's zone list.
 			 */
 			if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
 			    (LIST_FIRST(&kz->uk_zones) != z)))
 				cachefree += kz->uk_free;
 			for (i = 0; i < vm_ndomains; i++) {
 				zdom = &z->uz_domain[i];
 				LIST_FOREACH(bucket, &zdom->uzd_buckets,
 				    ub_link)
 					cachefree += bucket->ub_cnt;
 			}
 			/* Every %ju/%jd argument is cast to match its format. */
 			db_printf("%18s %8ju %8jd %8d %12ju %8ju %8u\n",
 			    z->uz_name, (uintmax_t)kz->uk_size,
 			    (intmax_t)(allocs - frees), cachefree,
 			    (uintmax_t)allocs, (uintmax_t)sleeps,
 			    z->uz_count);
 			if (db_pager_quit)
 				return;
 		}
 	}
 }
 
 /*
  * DDB "show umacache" command: prints one line per zone on the
  * uma_cachezones list with its item size, items in use, cached free
  * items, total requests and bucket size.  Stops early when the pager is
  * quit.
  */
 DB_SHOW_COMMAND(umacache, db_show_umacache)
 {
 	uma_bucket_t bucket;
 	uma_zone_t z;
 	uma_zone_domain_t zdom;
 	uint64_t allocs, frees;
 	int cachefree, i;
 
 	db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
 	    "Requests", "Bucket");
 	LIST_FOREACH(z, &uma_cachezones, uz_link) {
 		/* Sleeps are not reported here, hence the NULL sleepsp. */
 		uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL);
 		/* Add the items held in the zone's per-domain buckets. */
 		for (i = 0; i < vm_ndomains; i++) {
 			zdom = &z->uz_domain[i];
 			LIST_FOREACH(bucket, &zdom->uzd_buckets, ub_link)
 				cachefree += bucket->ub_cnt;
 		}
 		db_printf("%18s %8ju %8jd %8d %12ju %8u\n",
 		    z->uz_name, (uintmax_t)z->uz_size,
 		    (intmax_t)(allocs - frees), cachefree,
 		    (uintmax_t)allocs, z->uz_count);
 		if (db_pager_quit)
 			return;
 	}
 }
 #endif	/* DDB */
Index: head/sys/vm/vm_extern.h
===================================================================
--- head/sys/vm/vm_extern.h	(revision 335067)
+++ head/sys/vm/vm_extern.h	(revision 335068)
@@ -1,130 +1,131 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vm_extern.h	8.2 (Berkeley) 1/12/94
  * $FreeBSD$
  */
 
 #ifndef _VM_EXTERN_H_
 #define	_VM_EXTERN_H_
 
 struct pmap;
 struct proc;
 struct vmspace;
 struct vnode;
 struct vmem;
 
 #ifdef _KERNEL
 struct cdev;
 struct cdevsw;
 
 /* These operate on kernel virtual addresses only. */
 vm_offset_t kva_alloc(vm_size_t);
 void kva_free(vm_offset_t, vm_size_t);
 
 /* These operate on pageable virtual addresses. */
 vm_offset_t kmap_alloc_wait(vm_map_t, vm_size_t);
 void kmap_free_wakeup(vm_map_t, vm_offset_t, vm_size_t);
 
 /* These operate on virtual addresses backed by memory. */
 vm_offset_t kmem_alloc_attr(struct vmem *, vm_size_t size, int flags,
     vm_paddr_t low, vm_paddr_t high, vm_memattr_t memattr);
 vm_offset_t kmem_alloc_attr_domain(int domain, vm_size_t size, int flags,
     vm_paddr_t low, vm_paddr_t high, vm_memattr_t memattr);
 vm_offset_t kmem_alloc_contig(struct vmem *, vm_size_t size, int flags,
     vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
     vm_memattr_t memattr);
 vm_offset_t kmem_alloc_contig_domain(int domain, vm_size_t size, int flags,
     vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
     vm_memattr_t memattr);
 vm_offset_t kmem_malloc(struct vmem *, vm_size_t size, int flags);
-vm_offset_t kmem_malloc_domain(int domain, vm_size_t size, int flags);
+vm_offset_t kmem_malloc_domain(struct vmem *, int domain, vm_size_t size,
+    int flags);
 void kmem_free(struct vmem *, vm_offset_t, vm_size_t);
 
 /* This provides memory for previously allocated address space. */
 int kmem_back(vm_object_t, vm_offset_t, vm_size_t, int);
 int kmem_back_domain(int, vm_object_t, vm_offset_t, vm_size_t, int);
 void kmem_unback(vm_object_t, vm_offset_t, vm_size_t);
 
 /* Bootstrapping. */
 vm_map_t kmem_suballoc(vm_map_t, vm_offset_t *, vm_offset_t *, vm_size_t,
     boolean_t);
 void kmem_init(vm_offset_t, vm_offset_t);
 void kmem_init_zero_region(void);
 void kmeminit(void);
 
 int kernacc(void *, int, int);
 int useracc(void *, int, int);
 int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int);
 void vm_fault_copy_entry(vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t,
     vm_ooffset_t *);
 int vm_fault_disable_pagefaults(void);
 void vm_fault_enable_pagefaults(int save);
 int vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
     int fault_flags, vm_page_t *m_hold);
 int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
     vm_prot_t prot, vm_page_t *ma, int max_count);
 int vm_forkproc(struct thread *, struct proc *, struct thread *,
     struct vmspace *, int);
 void vm_waitproc(struct proc *);
 int vm_mmap(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int,
     objtype_t, void *, vm_ooffset_t);
 int vm_mmap_object(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t,
     vm_prot_t, int, vm_object_t, vm_ooffset_t, boolean_t, struct thread *);
 int vm_mmap_to_errno(int rv);
 int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
     int *, struct cdev *, struct cdevsw *, vm_ooffset_t *, vm_object_t *);
 int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, int *,
     struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
 void vm_set_page_size(void);
 void vm_sync_icache(vm_map_t, vm_offset_t, vm_size_t);
 typedef int (*pmap_pinit_t)(struct pmap *pmap);
 struct vmspace *vmspace_alloc(vm_offset_t, vm_offset_t, pmap_pinit_t);
 struct vmspace *vmspace_fork(struct vmspace *, vm_ooffset_t *);
 int vmspace_exec(struct proc *, vm_offset_t, vm_offset_t);
 int vmspace_unshare(struct proc *);
 void vmspace_exit(struct thread *);
 struct vmspace *vmspace_acquire_ref(struct proc *);
 void vmspace_free(struct vmspace *);
 void vmspace_exitfree(struct proc *);
 void vmspace_switch_aio(struct vmspace *);
 void vnode_pager_setsize(struct vnode *, vm_ooffset_t);
 int vslock(void *, size_t);
 void vsunlock(void *, size_t);
 struct sf_buf *vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset);
 void vm_imgact_unmap_page(struct sf_buf *sf);
 void vm_thread_dispose(struct thread *td);
 int vm_thread_new(struct thread *td, int pages);
 u_int vm_active_count(void);
 u_int vm_inactive_count(void);
 u_int vm_laundry_count(void);
 u_int vm_wait_count(void);
 #endif				/* _KERNEL */
 #endif				/* !_VM_EXTERN_H_ */
Index: head/sys/vm/vm_init.c
===================================================================
--- head/sys/vm/vm_init.c	(revision 335067)
+++ head/sys/vm/vm_init.c	(revision 335068)
@@ -1,316 +1,352 @@
 /*-
  * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
  *
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_init.c	8.1 (Berkeley) 6/11/93
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 /*
  *	Initialize the Virtual Memory subsystem.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/malloc.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/selinfo.h>
 #include <sys/smp.h>
 #include <sys/pipe.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/vmem.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_pagequeue.h>
 #include <vm/vm_map.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_extern.h>
 
 extern void	uma_startup1(void);
 extern void	uma_startup2(void);
 extern void	vm_radix_reserve_kva(void);
 
 #if VM_NRESERVLEVEL > 0
 #define	KVA_QUANTUM	(1 << (VM_LEVEL_0_ORDER + PAGE_SHIFT))
 #else
 	/* On non-superpage architectures we want large import sizes. */
 #define	KVA_QUANTUM	(PAGE_SIZE * 1024)
 #endif
 long physmem;
 
 /*
  * System initialization
  */
 static void vm_mem_init(void *);
 SYSINIT(vm_mem, SI_SUB_VM, SI_ORDER_FIRST, vm_mem_init, NULL);
 
 /*
  * vmem import callback for kernel_arena: carve "size" bytes of fresh KVA
  * out of kernel_map and report the start address through "addrp".
  * Returns 0 on success and ENOMEM when vm_map_find() cannot place the range.
  */
 static int
 kva_import(void *unused, vmem_size_t size, int flags, vmem_addr_t *addrp)
 {
 	vm_offset_t kva;
 
 	KASSERT((size % KVA_QUANTUM) == 0,
 	    ("kva_import: Size %jd is not a multiple of %d",
 	    (intmax_t)size, (int)KVA_QUANTUM));
 
 	/* Start the search at the bottom of the kernel map. */
 	kva = vm_map_min(kernel_map);
 	if (vm_map_find(kernel_map, NULL, 0, &kva, size, 0, VMFS_SUPER_SPACE,
 	    VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT) != KERN_SUCCESS)
 		return (ENOMEM);
 
 	*addrp = kva;
 	return (0);
 }
 
+#if VM_NRESERVLEVEL > 0
 /*
+ * Import a superpage from the normal kernel arena into the special
+ * arena for allocations with different permissions.
+ *
+ * This is the vmem import callback used for the rwx arenas.  "arena" is
+ * the source arena that was registered with vmem_set_import(); the request
+ * is passed on to vmem_xalloc() with KVA_QUANTUM alignment over the whole
+ * address range, and that call's result is returned unchanged.
+ */
+static int
+kernel_rwx_alloc(void *arena, vmem_size_t size, int flags, vmem_addr_t *addrp)
+{
+
+	KASSERT((size % KVA_QUANTUM) == 0,
+	    ("kernel_rwx_alloc: Size %jd is not a multiple of %d",
+	    (intmax_t)size, (int)KVA_QUANTUM));
+	return (vmem_xalloc(arena, size, KVA_QUANTUM, 0, 0, VMEM_ADDR_MIN,
+	    VMEM_ADDR_MAX, flags, addrp));
+}
+#endif
+
+/*
  *	vm_mem_init initializes the virtual memory system.
  *	This is done only by the first cpu up.
  *
  *	It is registered as a SYSINIT at SI_SUB_VM/SI_ORDER_FIRST; the
  *	"dummy" argument is not used.
  */
 /* ARGSUSED*/
 static void
 vm_mem_init(dummy)
 	void *dummy;
 {
 	int domain;
 
 	/*
 	 * Initializes resident memory structures. From here on, all physical
 	 * memory is accounted for, and we use only virtual addresses.
 	 */
 	vm_set_page_size();
 	virtual_avail = vm_page_startup(virtual_avail);
 
 #ifdef	UMA_MD_SMALL_ALLOC
 	/* Announce page availability to UMA. */
 	uma_startup1();
 #endif
 	/*
 	 * Initialize other VM packages
 	 */
 	vmem_startup();
 	vm_object_init();
 	vm_map_startup();
 	kmem_init(virtual_avail, virtual_end);
 
 	/*
 	 * Initialize the kernel_arena.  This can grow on demand.
 	 */
 	vmem_init(kernel_arena, "kernel arena", 0, 0, PAGE_SIZE, 0, 0);
 	vmem_set_import(kernel_arena, kva_import, NULL, NULL, KVA_QUANTUM);
 
+#if VM_NRESERVLEVEL > 0
+	/*
+	 * In an architecture with superpages, maintain a separate arena
+	 * for allocations with permissions that differ from the "standard"
+	 * read/write permissions used for memory in the kernel_arena.
+	 */
+	kernel_rwx_arena = vmem_create("kernel rwx arena", 0, 0, PAGE_SIZE,
+	    0, M_WAITOK);
+	vmem_set_import(kernel_rwx_arena, kernel_rwx_alloc,
+	    (vmem_release_t *)vmem_xfree, kernel_arena, KVA_QUANTUM);
+#endif
+
 	for (domain = 0; domain < vm_ndomains; domain++) {
 		vm_dom[domain].vmd_kernel_arena = vmem_create(
 		    "kernel arena domain", 0, 0, PAGE_SIZE, 0, M_WAITOK);
 		vmem_set_import(vm_dom[domain].vmd_kernel_arena,
 		    (vmem_import_t *)vmem_alloc, NULL, kernel_arena,
 		    KVA_QUANTUM);
+#if VM_NRESERVLEVEL > 0
+		vm_dom[domain].vmd_kernel_rwx_arena = vmem_create(
+		    "kernel rwx arena domain", 0, 0, PAGE_SIZE, 0, M_WAITOK);
+		vmem_set_import(vm_dom[domain].vmd_kernel_rwx_arena,
+		    kernel_rwx_alloc, (vmem_release_t *)vmem_xfree,
+		    vm_dom[domain].vmd_kernel_arena, KVA_QUANTUM);
+#endif
 	}
 
 #ifndef	UMA_MD_SMALL_ALLOC
 	/* Set up radix zone to use noobj_alloc. */
 	vm_radix_reserve_kva();
 #endif
 	/* Announce full page availability to UMA. */
 	uma_startup2();
 	kmem_init_zero_region();
 	pmap_init();
 	vm_pager_init();
 }
 
 /*
  * Allocate the tables sized by kern_vfs_bio_buffer_alloc(), lay out the
  * "clean" KVA region (buffer arena, swap buffer KVA and, optionally, the
  * transient bio arena) and create the exec and pipe submaps.  The bounds
  * of the clean and buffer regions are recorded in "kmi".  Panics if the
  * tables cannot be allocated or if the layout arithmetic is inconsistent.
  */
 void
 vm_ksubmap_init(struct kva_md_info *kmi)
 {
 	vm_offset_t firstaddr;
 	caddr_t v;
 	vm_size_t size = 0;
 	long physmem_est;
 	vm_offset_t minaddr;
 	vm_offset_t maxaddr;
 
 	/*
 	 * Allocate space for system data structures.
 	 * The first available kernel virtual address is in "v".
 	 * As pages of kernel virtual memory are allocated, "v" is incremented.
 	 * As pages of memory are allocated and cleared,
 	 * "firstaddr" is incremented.
 	 */
 
 	/*
 	 * Make two passes.  The first pass calculates how much memory is
 	 * needed and allocates it.  The second pass assigns virtual
 	 * addresses to the various data structures.
 	 */
 	firstaddr = 0;
 again:
 	v = (caddr_t)firstaddr;
 
 	/*
 	 * Discount the physical memory larger than the size of kernel_map
 	 * to avoid eating up all of KVA space.
 	 */
 	physmem_est = lmin(physmem, btoc(kernel_map->max_offset -
 	    kernel_map->min_offset));
 
 	v = kern_vfs_bio_buffer_alloc(v, physmem_est);
 
 	/*
 	 * End of first pass, size has been calculated so allocate memory
 	 */
 	if (firstaddr == 0) {
 		/* On the first pass "v" started at 0, so it now holds the size. */
 		size = (vm_size_t)v;
 #ifdef VM_FREELIST_DMA32
 		/*
 		 * Try to protect 32-bit DMAable memory from the largest
 		 * early alloc of wired mem.
 		 */
 		firstaddr = kmem_alloc_attr(kernel_arena, size,
 		    M_ZERO | M_NOWAIT, (vm_paddr_t)1 << 32,
 		    ~(vm_paddr_t)0, VM_MEMATTR_DEFAULT);
 		if (firstaddr == 0)
 #endif
 			firstaddr = kmem_malloc(kernel_arena, size,
 			    M_ZERO | M_WAITOK);
 		if (firstaddr == 0)
 			panic("startup: no room for tables");
 		goto again;
 	}
 
 	/*
 	 * End of second pass, addresses have been assigned
 	 */
 	if ((vm_size_t)((char *)v - firstaddr) != size)
 		panic("startup: table size inconsistency");
 
 	/*
 	 * Allocate the clean map to hold all of the paging and I/O virtual
 	 * memory.
 	 */
 	size = (long)nbuf * BKVASIZE + (long)nswbuf * MAXPHYS +
 	    (long)bio_transient_maxcnt * MAXPHYS;
 	kmi->clean_sva = firstaddr = kva_alloc(size);
 	kmi->clean_eva = firstaddr + size;
 
 	/*
 	 * Allocate the buffer arena.
 	 *
 	 * Enable the quantum cache if we have more than 4 cpus.  This
 	 * avoids lock contention at the expense of some fragmentation.
 	 */
 	size = (long)nbuf * BKVASIZE;
 	kmi->buffer_sva = firstaddr;
 	kmi->buffer_eva = kmi->buffer_sva + size;
 	vmem_init(buffer_arena, "buffer arena", kmi->buffer_sva, size,
 	    PAGE_SIZE, (mp_ncpus > 4) ? BKVASIZE * 8 : 0, 0);
 	firstaddr += size;
 
 	/*
 	 * Now swap kva.
 	 */
 	swapbkva = firstaddr;
 	size = (long)nswbuf * MAXPHYS;
 	firstaddr += size;
 
 	/*
 	 * And optionally transient bio space.
 	 */
 	if (bio_transient_maxcnt != 0) {
 		size = (long)bio_transient_maxcnt * MAXPHYS;
 		vmem_init(transient_arena, "transient arena",
 		    firstaddr, size, PAGE_SIZE, 0, 0);
 		firstaddr += size;
 	}
 	/* The three sub-ranges must exactly fill the clean region. */
 	if (firstaddr != kmi->clean_eva)
 		panic("Clean map calculation incorrect");
 
 	/*
 	 * Allocate the pageable submaps.  We may cache an exec map entry per
 	 * CPU, so we therefore need to reserve space for at least ncpu+1
 	 * entries to avoid deadlock.  The exec map is also used by some image
 	 * activators, so we leave a fixed number of pages for their use.
 	 */
 #ifdef __LP64__
 	exec_map_entries = 8 * mp_ncpus;
 #else
 	exec_map_entries = 2 * mp_ncpus + 4;
 #endif
 	exec_map_entry_size = round_page(PATH_MAX + ARG_MAX);
 	exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr,
 	    exec_map_entries * exec_map_entry_size + 64 * PAGE_SIZE, FALSE);
 	pipe_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, maxpipekva,
 	    FALSE);
 }
Index: head/sys/vm/vm_kern.c
===================================================================
--- head/sys/vm/vm_kern.c	(revision 335067)
+++ head/sys/vm/vm_kern.c	(revision 335068)
@@ -1,688 +1,714 @@
 /*-
  * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
  *
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_kern.c	8.3 (Berkeley) 1/12/94
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 /*
  *	Kernel memory management.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>		/* for ticks and hz */
 #include <sys/domainset.h>
 #include <sys/eventhandler.h>
 #include <sys/lock.h>
 #include <sys/proc.h>
 #include <sys/malloc.h>
 #include <sys/rwlock.h>
 #include <sys/sysctl.h>
 #include <sys/vmem.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_domainset.h>
 #include <vm/vm_kern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_pagequeue.h>
 #include <vm/vm_radix.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 vm_map_t kernel_map;
 vm_map_t exec_map;
 vm_map_t pipe_map;
 
 const void *zero_region;
 CTASSERT((ZERO_REGION_SIZE & PAGE_MASK) == 0);
 
 /* NB: Used by kernel debuggers. */
 const u_long vm_maxuser_address = VM_MAXUSER_ADDRESS;
 
 u_int exec_map_entry_size;
 u_int exec_map_entries;
 
 SYSCTL_ULONG(_vm, OID_AUTO, min_kernel_address, CTLFLAG_RD,
     SYSCTL_NULL_ULONG_PTR, VM_MIN_KERNEL_ADDRESS, "Min kernel address");
 
 SYSCTL_ULONG(_vm, OID_AUTO, max_kernel_address, CTLFLAG_RD,
 #if defined(__arm__) || defined(__sparc64__)
     &vm_max_kernel_address, 0,
 #else
     SYSCTL_NULL_ULONG_PTR, VM_MAX_KERNEL_ADDRESS,
 #endif
     "Max kernel address");
 
 /*
  *	kva_alloc:
  *
  *	Reserve a page-rounded range of kernel virtual addresses from
  *	kernel_arena.  No object backs the range and nothing is mapped, so
  *	the caller must install mappings itself (typically with
  *	pmap_qenter()) before using it; any attempt to fault a mapping in
  *	through vm_fault() will result in a panic.  The request does not
  *	sleep (M_NOWAIT); 0 is returned when it cannot be satisfied.
  */
 vm_offset_t
 kva_alloc(vm_size_t size)
 {
 	vm_offset_t kva;
 	int error;
 
 	error = vmem_alloc(kernel_arena, round_page(size),
 	    M_BESTFIT | M_NOWAIT, &kva);
 	return (error != 0 ? 0 : kva);
 }
 
 /*
  *	kva_free:
  *
  *	Hand a range of kernel virtual addresses obtained from kva_alloc()
  *	back to kernel_arena.  The size is rounded up to a page multiple
  *	first.  Only the address range is released here; this routine does
  *	not itself remove mappings or free pages.
  *
  *	This routine may not block on kernel maps.
  */
 void
 kva_free(vm_offset_t addr, vm_size_t size)
 {
 
 	vmem_free(kernel_arena, addr, round_page(size));
 }
 
 /*
  *	Allocates a region from the kernel address map and physical pages
  *	within the specified address range to the kernel object.  Creates a
  *	wired mapping from this region to these pages, and returns the
  *	region's starting virtual address.  The allocated pages are not
  *	necessarily physically contiguous.  If M_ZERO is specified through the
  *	given flags, then the pages are zeroed before they are mapped.
  *
  *	Both the address range and the pages come from the given domain.
  *	Returns 0 on failure; any pages already installed are unbacked and
  *	the address range is released before returning.
  */
 vm_offset_t
 kmem_alloc_attr_domain(int domain, vm_size_t size, int flags, vm_paddr_t low,
     vm_paddr_t high, vm_memattr_t memattr)
 {
 	vmem_t *vmem;
 	vm_object_t object = kernel_object;
 	vm_offset_t addr, i, offset;
 	vm_page_t m;
 	int pflags, tries;
 
 	size = round_page(size);
 	vmem = vm_dom[domain].vmd_kernel_arena;
 	if (vmem_alloc(vmem, size, M_BESTFIT | flags, &addr))
 		return (0);
 	offset = addr - VM_MIN_KERNEL_ADDRESS;
 	/*
 	 * The page allocator is always called with VM_ALLOC_NOWAIT; when
 	 * M_WAITOK is set, waiting is done explicitly in the retry path below.
 	 */
 	pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED;
 	pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL);
 	pflags |= VM_ALLOC_NOWAIT;
 	VM_OBJECT_WLOCK(object);
 	for (i = 0; i < size; i += PAGE_SIZE) {
 		tries = 0;
 retry:
 		m = vm_page_alloc_contig_domain(object, atop(offset + i),
 		    domain, pflags, 1, low, high, PAGE_SIZE, 0, memattr);
 		if (m == NULL) {
 			VM_OBJECT_WUNLOCK(object);
 			/* One retry for M_NOWAIT callers, three otherwise. */
 			if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) {
 				if (!vm_page_reclaim_contig_domain(domain,
 				    pflags, 1, low, high, PAGE_SIZE, 0) &&
 				    (flags & M_WAITOK) != 0)
 					vm_wait_domain(domain);
 				VM_OBJECT_WLOCK(object);
 				tries++;
 				goto retry;
 			}
 			kmem_unback(object, addr, i);
 			vmem_free(vmem, addr, size);
 			return (0);
 		}
 		KASSERT(vm_phys_domain(m) == domain,
 		    ("kmem_alloc_attr_domain: Domain mismatch %d != %d",
 		    vm_phys_domain(m), domain));
 		if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0)
 			pmap_zero_page(m);
 		m->valid = VM_PAGE_BITS_ALL;
-		pmap_enter(kernel_pmap, addr + i, m, VM_PROT_ALL,
-		    VM_PROT_ALL | PMAP_ENTER_WIRED, 0);
+		pmap_enter(kernel_pmap, addr + i, m, VM_PROT_RW,
+		    VM_PROT_RW | PMAP_ENTER_WIRED, 0);
 	}
 	VM_OBJECT_WUNLOCK(object);
 	return (addr);
 }
 
 /*
  * Domain-iterating front end for kmem_alloc_attr_domain(): try each domain
  * offered by the domainset iterator until one allocation succeeds.  Only
  * kernel_arena is accepted.  Returns the start address, or 0 if every
  * attempt failed.
  */
 vm_offset_t
 kmem_alloc_attr(vmem_t *vmem, vm_size_t size, int flags, vm_paddr_t low,
     vm_paddr_t high, vm_memattr_t memattr)
 {
 	struct vm_domainset_iter di;
 	vm_offset_t kva;
 	int domain;
 
 	KASSERT(vmem == kernel_arena,
 	    ("kmem_alloc_attr: Only kernel_arena is supported."));
 
 	vm_domainset_iter_malloc_init(&di, kernel_object, &domain, &flags);
 	for (;;) {
 		kva = kmem_alloc_attr_domain(domain, size, flags, low, high,
 		    memattr);
 		if (kva != 0 ||
 		    vm_domainset_iter_malloc(&di, &domain, &flags) != 0)
 			break;
 	}
 
 	return (kva);
 }
 
 /*
  *	Allocates a region from the kernel address map and physically
  *	contiguous pages within the specified address range to the kernel
  *	object.  Creates a wired mapping from this region to these pages, and
  *	returns the region's starting virtual address.  If M_ZERO is specified
  *	through the given flags, then the pages are zeroed before they are
  *	mapped.
  *
  *	Both the address range and the pages come from the given domain.
  *	Returns 0 on failure, after releasing the address range.
  */
 vm_offset_t
 kmem_alloc_contig_domain(int domain, vm_size_t size, int flags, vm_paddr_t low,
     vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
     vm_memattr_t memattr)
 {
 	vmem_t *vmem;
 	vm_object_t object = kernel_object;
 	vm_offset_t addr, offset, tmp;
 	vm_page_t end_m, m;
 	u_long npages;
 	int pflags, tries;
  
 	size = round_page(size);
 	vmem = vm_dom[domain].vmd_kernel_arena;
 	if (vmem_alloc(vmem, size, flags | M_BESTFIT, &addr))
 		return (0);
 	offset = addr - VM_MIN_KERNEL_ADDRESS;
 	/*
 	 * The page allocator is always called with VM_ALLOC_NOWAIT; when
 	 * M_WAITOK is set, waiting is done explicitly in the retry path below.
 	 */
 	pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED;
 	pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL);
 	pflags |= VM_ALLOC_NOWAIT;
 	npages = atop(size);
 	VM_OBJECT_WLOCK(object);
 	tries = 0;
 retry:
 	m = vm_page_alloc_contig_domain(object, atop(offset), domain, pflags,
 	    npages, low, high, alignment, boundary, memattr);
 	if (m == NULL) {
 		VM_OBJECT_WUNLOCK(object);
 		/* One retry for M_NOWAIT callers, three otherwise. */
 		if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) {
 			if (!vm_page_reclaim_contig_domain(domain, pflags,
 			    npages, low, high, alignment, boundary) &&
 			    (flags & M_WAITOK) != 0)
 				vm_wait_domain(domain);
 			VM_OBJECT_WLOCK(object);
 			tries++;
 			goto retry;
 		}
 		vmem_free(vmem, addr, size);
 		return (0);
 	}
 	KASSERT(vm_phys_domain(m) == domain,
 	    ("kmem_alloc_contig_domain: Domain mismatch %d != %d",
 	    vm_phys_domain(m), domain));
 	end_m = m + npages;
 	tmp = addr;
 	for (; m < end_m; m++) {
 		if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0)
 			pmap_zero_page(m);
 		m->valid = VM_PAGE_BITS_ALL;
-		pmap_enter(kernel_pmap, tmp, m, VM_PROT_ALL,
-		    VM_PROT_ALL | PMAP_ENTER_WIRED, 0);
+		pmap_enter(kernel_pmap, tmp, m, VM_PROT_RW,
+		    VM_PROT_RW | PMAP_ENTER_WIRED, 0);
 		tmp += PAGE_SIZE;
 	}
 	VM_OBJECT_WUNLOCK(object);
 	return (addr);
 }
 
 /*
  * Domain-iterating front end for kmem_alloc_contig_domain(): try each
  * domain offered by the domainset iterator until one allocation succeeds.
  * Only kernel_arena is accepted.  Returns the start address, or 0 if every
  * attempt failed.
  */
 vm_offset_t
 kmem_alloc_contig(struct vmem *vmem, vm_size_t size, int flags, vm_paddr_t low,
     vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
     vm_memattr_t memattr)
 {
 	struct vm_domainset_iter di;
 	vm_offset_t kva;
 	int domain;
 
 	KASSERT(vmem == kernel_arena,
 	    ("kmem_alloc_contig: Only kernel_arena is supported."));
 
 	vm_domainset_iter_malloc_init(&di, kernel_object, &domain, &flags);
 	for (;;) {
 		kva = kmem_alloc_contig_domain(domain, size, flags, low, high,
 		    alignment, boundary, memattr);
 		if (kva != 0 ||
 		    vm_domainset_iter_malloc(&di, &domain, &flags) != 0)
 			break;
 	}
 
 	return (kva);
 }
 
 /*
  *	kmem_suballoc:
  *
  *	Carve a page-rounded range out of "parent" and turn it into a
  *	submap sharing the parent's pmap.  Any failure panics, so the
  *	returned map is never NULL.
  *
  *	parent		Map to take the range from
  *	min, max	Set to the endpoints of the new submap
  *	size		Size of the range to find
  *	superpage_align	Request that min is superpage aligned
  */
 vm_map_t
 kmem_suballoc(vm_map_t parent, vm_offset_t *min, vm_offset_t *max,
     vm_size_t size, boolean_t superpage_align)
 {
 	vm_map_t submap;
 	int find_space, rv;
 
 	size = round_page(size);
 	find_space = superpage_align ? VMFS_SUPER_SPACE : VMFS_ANY_SPACE;
 
 	*min = vm_map_min(parent);
 	rv = vm_map_find(parent, NULL, 0, min, size, 0, find_space,
 	    VM_PROT_ALL, VM_PROT_ALL, MAP_ACC_NO_CHARGE);
 	if (rv != KERN_SUCCESS)
 		panic("kmem_suballoc: bad status return of %d", rv);
 	*max = *min + size;
 
 	submap = vm_map_create(vm_map_pmap(parent), *min, *max);
 	if (submap == NULL)
 		panic("kmem_suballoc: cannot create submap");
 	if (vm_map_submap(parent, *min, *max, submap) != KERN_SUCCESS)
 		panic("kmem_suballoc: unable to change range to submap");
 	return (submap);
 }
 
 /*
  *	kmem_malloc_domain:
  *
  *	Allocate wired-down pages in the kernel's address space, taking
  *	both the address range and the backing pages from the given domain.
  *	Returns the start address, or 0 if either the address range or the
  *	pages cannot be obtained.
  */
 vm_offset_t
-kmem_malloc_domain(int domain, vm_size_t size, int flags)
+kmem_malloc_domain(struct vmem *vmem, int domain, vm_size_t size, int flags)
 {
-	vmem_t *vmem;
+	vmem_t *arena;
 	vm_offset_t addr;
 	int rv;
 
-	vmem = vm_dom[domain].vmd_kernel_arena;
+#if VM_NRESERVLEVEL > 0
+	KASSERT(vmem == kernel_arena || vmem == kernel_rwx_arena,
+	    ("kmem_malloc_domain: Only kernel_arena or kernel_rwx_arena "
+	    "are supported."));
+	if (__predict_true(vmem == kernel_arena))
+		arena = vm_dom[domain].vmd_kernel_arena;
+	else
+		arena = vm_dom[domain].vmd_kernel_rwx_arena;
+#else
+	KASSERT(vmem == kernel_arena,
+	    ("kmem_malloc_domain: Only kernel_arena is supported."));
+	arena = vm_dom[domain].vmd_kernel_arena;
+#endif
 	size = round_page(size);
-	if (vmem_alloc(vmem, size, flags | M_BESTFIT, &addr))
+	if (vmem_alloc(arena, size, flags | M_BESTFIT, &addr))
 		return (0);
 
 	rv = kmem_back_domain(domain, kernel_object, addr, size, flags);
 	if (rv != KERN_SUCCESS) {
-		vmem_free(vmem, addr, size);
+		vmem_free(arena, addr, size);
 		return (0);
 	}
 	return (addr);
 }
 
 /*
  *	kmem_malloc:
  *
  *	Domain-iterating front end for kmem_malloc_domain(): tries each
  *	domain offered by the domainset iterator until one allocation
  *	succeeds.  Returns the start address, or 0 if every attempt failed.
  */
 vm_offset_t
 kmem_malloc(struct vmem *vmem, vm_size_t size, int flags)
 {
 	struct vm_domainset_iter di;
 	vm_offset_t addr;
 	int domain;
 
-	KASSERT(vmem == kernel_arena,
-	    ("kmem_malloc: Only kernel_arena is supported."));
-
 	vm_domainset_iter_malloc_init(&di, kernel_object, &domain, &flags);
 	do {
-		addr = kmem_malloc_domain(domain, size, flags);
+		addr = kmem_malloc_domain(vmem, domain, size, flags);
 		if (addr != 0)
 			break;
 	} while (vm_domainset_iter_malloc(&di, &domain, &flags) == 0);
 
 	return (addr);
 }
 
 /*
  *	kmem_back_domain:
  *
  *	Allocate physical pages from the given domain for the specified
  *	virtual address range and enter wired mappings for them.  Only
  *	kernel_object is supported.  Returns KERN_SUCCESS, or KERN_NO_SPACE
  *	after unbacking the pages already installed when a page allocation
  *	fails and M_NOWAIT is set; without M_NOWAIT the allocation is retried.
  */
 int
 kmem_back_domain(int domain, vm_object_t object, vm_offset_t addr,
     vm_size_t size, int flags)
 {
 	vm_offset_t offset, i;
 	vm_page_t m, mpred;
+	vm_prot_t prot;
 	int pflags;
 
 	KASSERT(object == kernel_object,
 	    ("kmem_back_domain: only supports kernel object."));
 
 	offset = addr - VM_MIN_KERNEL_ADDRESS;
 	pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED;
 	pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL);
 	if (flags & M_WAITOK)
 		pflags |= VM_ALLOC_WAITFAIL;
+	prot = (flags & M_EXEC) != 0 ? VM_PROT_ALL : VM_PROT_RW;
 
 	i = 0;
 	VM_OBJECT_WLOCK(object);
 retry:
 	mpred = vm_radix_lookup_le(&object->rtree, atop(offset + i));
 	for (; i < size; i += PAGE_SIZE, mpred = m) {
 		m = vm_page_alloc_domain_after(object, atop(offset + i),
 		    domain, pflags, mpred);
 
 		/*
 		 * Ran out of space, free everything up and return. Don't need
 		 * to lock page queues here as we know that the pages we got
 		 * aren't on any queues.
 		 */
 		if (m == NULL) {
 			if ((flags & M_NOWAIT) == 0)
 				goto retry;
 			VM_OBJECT_WUNLOCK(object);
 			kmem_unback(object, addr, i);
 			return (KERN_NO_SPACE);
 		}
 		KASSERT(vm_phys_domain(m) == domain,
 		    ("kmem_back_domain: Domain mismatch %d != %d",
 		    vm_phys_domain(m), domain));
 		if (flags & M_ZERO && (m->flags & PG_ZERO) == 0)
 			pmap_zero_page(m);
 		KASSERT((m->oflags & VPO_UNMANAGED) != 0,
 		    ("kmem_malloc: page %p is managed", m));
 		m->valid = VM_PAGE_BITS_ALL;
-		pmap_enter(kernel_pmap, addr + i, m, VM_PROT_ALL,
-		    VM_PROT_ALL | PMAP_ENTER_WIRED, 0);
+		pmap_enter(kernel_pmap, addr + i, m, prot,
+		    prot | PMAP_ENTER_WIRED, 0);
 	}
 	VM_OBJECT_WUNLOCK(object);
 
 	return (KERN_SUCCESS);
 }
 
 /*
  * Domain-iterating front end for kmem_back_domain(): try each domain
  * offered by the domainset iterator until the range has been backed.
  * Returns the status of the last attempt.
  */
 int
 kmem_back(vm_object_t object, vm_offset_t addr, vm_size_t size, int flags)
 {
 	struct vm_domainset_iter di;
 	int domain, rv;
 
 	KASSERT(object == kernel_object,
 	    ("kmem_back: only supports kernel object."));
 
 	vm_domainset_iter_malloc_init(&di, kernel_object, &domain, &flags);
 	for (;;) {
 		rv = kmem_back_domain(domain, object, addr, size, flags);
 		if (rv == KERN_SUCCESS ||
 		    vm_domainset_iter_malloc(&di, &domain, &flags) != 0)
 			break;
 	}
 
 	return (rv);
 }
 
 /*
  *	kmem_unback:
  *
  *	Unmap and free the physical pages underlying the specified virtual
  *	address range.
  *
  *	A physical page must exist within the specified object at each index
  *	that is being unmapped.
  *
  *	Returns the domain of the first page in the range, or 0 when size
  *	is 0.  kmem_free() uses this value to pick the per-domain arena to
  *	return the addresses to.
  */
 static int
 _kmem_unback(vm_object_t object, vm_offset_t addr, vm_size_t size)
 {
 	vm_page_t m, next;
 	vm_offset_t end, offset;
 	int domain;
 
 	KASSERT(object == kernel_object,
 	    ("kmem_unback: only supports kernel object."));
 
 	if (size == 0)
 		return (0);
 	pmap_remove(kernel_pmap, addr, addr + size);
 	offset = addr - VM_MIN_KERNEL_ADDRESS;
 	end = offset + size;
 	VM_OBJECT_WLOCK(object);
 	/* The lookup result is used unchecked; see the requirement above. */
 	m = vm_page_lookup(object, atop(offset)); 
 	domain = vm_phys_domain(m);
 	for (; offset < end; offset += PAGE_SIZE, m = next) {
 		/* Fetch the successor before the current page is freed. */
 		next = vm_page_next(m);
 		vm_page_unwire(m, PQ_NONE);
 		vm_page_free(m);
 	}
 	VM_OBJECT_WUNLOCK(object);
 
 	return (domain);
 }
 
 /*
  * Public wrapper around _kmem_unback() for callers that have no use for
  * the domain it reports.
  */
 void
 kmem_unback(vm_object_t object, vm_offset_t addr, vm_size_t size)
 {
 
 	(void)_kmem_unback(object, addr, size);
 }
 
 /*
  *	kmem_free:
  *
  *	Free memory allocated with kmem_malloc.  The size must match the
  *	original allocation.
  *
  *	The pages are unmapped and freed first; the address range is then
  *	returned to the per-domain arena of the domain reported by
  *	_kmem_unback().
  */
 void
 kmem_free(struct vmem *vmem, vm_offset_t addr, vm_size_t size)
 {
+	struct vmem *arena;
 	int domain;
 
+#if VM_NRESERVLEVEL > 0
+	KASSERT(vmem == kernel_arena || vmem == kernel_rwx_arena,
+	    ("kmem_free: Only kernel_arena or kernel_rwx_arena are supported."));
+#else
 	KASSERT(vmem == kernel_arena,
 	    ("kmem_free: Only kernel_arena is supported."));
+#endif
+
 	size = round_page(size);
 	domain = _kmem_unback(kernel_object, addr, size);
-	vmem_free(vm_dom[domain].vmd_kernel_arena, addr, size);
+#if VM_NRESERVLEVEL > 0
+	if (__predict_true(vmem == kernel_arena))
+		arena = vm_dom[domain].vmd_kernel_arena;
+	else
+		arena = vm_dom[domain].vmd_kernel_rwx_arena;
+#else
+	arena = vm_dom[domain].vmd_kernel_arena;
+#endif
+	vmem_free(arena, addr, size);
 }
 
 /*
  *	kmap_alloc_wait:
  *
  *	Allocates pageable memory from a sub-map of the kernel.  If the submap
  *	has no room, the caller sleeps waiting for more memory in the submap.
  *
  *	Returns the start address of the new range, or 0 if the swap
  *	reservation fails or the request is larger than the whole submap.
  *
  *	This routine may block.
  */
 vm_offset_t
 kmap_alloc_wait(vm_map_t map, vm_size_t size)
 {
 	vm_offset_t addr;
 
 	size = round_page(size);
 	if (!swap_reserve(size))
 		return (0);
 
 	for (;;) {
 		/*
 		 * To make this work for more than one map, use the map's lock
 		 * to lock out sleepers/wakers.
 		 */
 		vm_map_lock(map);
 		/* On success the loop exits with the map still locked. */
 		if (vm_map_findspace(map, vm_map_min(map), size, &addr) == 0)
 			break;
 		/* no space now; see if we can ever get space */
 		if (vm_map_max(map) - vm_map_min(map) < size) {
 			vm_map_unlock(map);
 			swap_release(size);
 			return (0);
 		}
 		map->needs_wakeup = TRUE;
 		vm_map_unlock_and_wait(map, 0);
 	}
 	vm_map_insert(map, NULL, 0, addr, addr + size, VM_PROT_ALL,
 	    VM_PROT_ALL, MAP_ACC_CHARGED);
 	vm_map_unlock(map);
 	return (addr);
 }
 
 /*
  *	kmap_free_wakeup:
  *
  *	Give a range back to a kernel submap and, if a thread has flagged
  *	that it is waiting for space in that map, clear the flag and wake
  *	the waiters.  The range is widened to page boundaries before it is
  *	deleted.
  */
 void
 kmap_free_wakeup(vm_map_t map, vm_offset_t addr, vm_size_t size)
 {
 	vm_offset_t start, end;
 
 	start = trunc_page(addr);
 	end = round_page(addr + size);
 
 	vm_map_lock(map);
 	(void)vm_map_delete(map, start, end);
 	if (map->needs_wakeup) {
 		map->needs_wakeup = FALSE;
 		vm_map_wakeup(map);
 	}
 	vm_map_unlock(map);
 }
 
 /*
  * Set up zero_region: a read-only window of ZERO_REGION_SIZE bytes of KVA
  * in which every page maps the same zero-filled physical page.  The
  * results of kva_alloc() and vm_page_alloc() are used without a failure
  * check.
  */
 void
 kmem_init_zero_region(void)
 {
 	vm_offset_t addr, i;
 	vm_page_t m;
 
 	/*
 	 * Map a single physical page of zeros to a larger virtual range.
 	 * This requires less looping in places that want large amounts of
 	 * zeros, while not using much more physical resources.
 	 */
 	addr = kva_alloc(ZERO_REGION_SIZE);
 	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
 	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 	/* Zero the page by hand if it did not come back marked PG_ZERO. */
 	if ((m->flags & PG_ZERO) == 0)
 		pmap_zero_page(m);
 	for (i = 0; i < ZERO_REGION_SIZE; i += PAGE_SIZE)
 		pmap_qenter(addr + i, &m, 1);
 	pmap_protect(kernel_pmap, addr, addr + ZERO_REGION_SIZE, VM_PROT_READ);
 
 	zero_region = (const void *)addr;
 }
 
 /*
  * 	kmem_init:
  *
  *	Create the kernel map; insert a mapping covering kernel text,
  *	data, bss, and all space allocated thus far (`bootstrap' data).  The
  *	new map will thus map the range between VM_MIN_KERNEL_ADDRESS and
  *	`start' as allocated, and the range between `start' and `end' as free.
  *	On amd64 the allocated entry inserted below begins at KERNBASE rather
  *	than at VM_MIN_KERNEL_ADDRESS.
  */
 void
 kmem_init(vm_offset_t start, vm_offset_t end)
 {
 	vm_map_t m;
 
 	m = vm_map_create(kernel_pmap, VM_MIN_KERNEL_ADDRESS, end);
 	m->system_map = 1;
 	vm_map_lock(m);
 	/* N.B.: cannot use kgdb to debug, starting with this assignment ... */
 	kernel_map = m;
 	(void) vm_map_insert(m, NULL, (vm_ooffset_t) 0,
 #ifdef __amd64__
 	    KERNBASE,
 #else		     
 	    VM_MIN_KERNEL_ADDRESS,
 #endif
 	    start, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
 	/* ... and ending with the completion of the above `insert' */
 	vm_map_unlock(m);
 }
 
 #ifdef DIAGNOSTIC
 /*
  * Testing hook: writing a non-zero combination of VM_LOW_KMEM and
  * VM_LOW_PAGES to debug.vm_lowmem fires the vm_lowmem event with those
  * flags.  Any other bit is rejected with EINVAL; a value of 0 does nothing.
  */
 static int
 debug_vm_lowmem(SYSCTL_HANDLER_ARGS)
 {
 	int error, lowmem_flags;
 
 	lowmem_flags = 0;
 	error = sysctl_handle_int(oidp, &lowmem_flags, 0, req);
 	if (error != 0)
 		return (error);
 
 	if ((lowmem_flags & ~(VM_LOW_KMEM | VM_LOW_PAGES)) != 0)
 		return (EINVAL);
 	if (lowmem_flags != 0)
 		EVENTHANDLER_INVOKE(vm_lowmem, lowmem_flags);
 	return (0);
 }
 
 SYSCTL_PROC(_debug, OID_AUTO, vm_lowmem, CTLTYPE_INT | CTLFLAG_RW, 0, 0,
     debug_vm_lowmem, "I", "set to trigger vm_lowmem event with given flags");
 #endif
Index: head/sys/vm/vm_kern.h
===================================================================
--- head/sys/vm/vm_kern.h	(revision 335067)
+++ head/sys/vm/vm_kern.h	(revision 335068)
@@ -1,82 +1,83 @@
 /*-
  * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
  *
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_kern.h	8.1 (Berkeley) 6/11/93
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
  * $FreeBSD$
  */
 
 #ifndef _VM_VM_KERN_H_
 #define	_VM_VM_KERN_H_
 
 /* Kernel memory management definitions. */
 extern vm_map_t kernel_map;
 extern vm_map_t exec_map;
 extern vm_map_t pipe_map;
 extern struct vmem *kernel_arena;
+extern struct vmem *kernel_rwx_arena;
 extern struct vmem *kmem_arena;
 extern struct vmem *buffer_arena;
 extern struct vmem *transient_arena;
 extern struct vmem *memguard_arena;
 extern vm_offset_t swapbkva;
 extern u_long vm_kmem_size;
 extern u_int exec_map_entries;
 extern u_int exec_map_entry_size;
 
 #endif /* _VM_VM_KERN_H_ */
Index: head/sys/vm/vm_pagequeue.h
===================================================================
--- head/sys/vm/vm_pagequeue.h	(revision 335067)
+++ head/sys/vm/vm_pagequeue.h	(revision 335068)
@@ -1,310 +1,311 @@
 /*-
  * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
  *
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_page.h	8.2 (Berkeley) 12/13/93
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
  * $FreeBSD$
  */
 
 #ifndef	_VM_PAGEQUEUE_
 #define	_VM_PAGEQUEUE_
 
 #ifdef _KERNEL
 /*
  * Per-queue state.  pq_mutex is the mutex operated on by the
  * vm_pagequeue_lock()/vm_pagequeue_trylock()/vm_pagequeue_unlock()
  * macros, and vm_pagequeue_cnt_add() asserts that it is owned before
  * adjusting pq_cnt.  The structure is aligned to CACHE_LINE_SIZE.
  */
 struct vm_pagequeue {
 	struct mtx	pq_mutex;	/* queue lock */
 	struct pglist	pq_pl;		/* NOTE(review): presumably the pages on this queue -- confirm */
 	int		pq_cnt;		/* changed via vm_pagequeue_cnt_add() */
 	const char	* const pq_name;	/* immutable name string */
 } __aligned(CACHE_LINE_SIZE);
 
 /*
  * Capacity of a vm_batchqueue.  The default applies only when
  * VM_BATCHQUEUE_SIZE has not already been defined.
  */
 #ifndef VM_BATCHQUEUE_SIZE
 #define	VM_BATCHQUEUE_SIZE	7
 #endif
 
 /*
  * Fixed-capacity array of pages used as a stack: vm_batchqueue_insert()
  * stores at index bq_cnt and vm_batchqueue_pop() returns the most
  * recently inserted entry.
  */
 struct vm_batchqueue {
 	vm_page_t	bq_pa[VM_BATCHQUEUE_SIZE];	/* slots [0, bq_cnt) are in use */
 	int		bq_cnt;		/* number of slots in use */
 } __aligned(CACHE_LINE_SIZE);
 
 #include <vm/uma.h>
 #include <sys/pidctrl.h>
 struct sysctl_oid;
 
 /*
  * One vm_domain per-numa domain.  Contains pagequeues, free page structures,
  * and accounting.
  *
  * Lock Key:
  * f	vmd_free_mtx
  * p	vmd_pageout_mtx
  * d	vm_domainset_lock
  * a	atomic
  * c	const after boot
  * q	page queue lock
  */
 struct vm_domain {
 	struct vm_pagequeue vmd_pagequeues[PQ_COUNT];
 	struct mtx_padalign vmd_free_mtx;
 	struct mtx_padalign vmd_pageout_mtx;
 	uma_zone_t vmd_pgcache;		/* (c) page free cache. */
-	struct vmem *vmd_kernel_arena;	/* (c) per-domain kva arena. */
+	struct vmem *vmd_kernel_arena;	/* (c) per-domain kva R/W arena. */
+	struct vmem *vmd_kernel_rwx_arena; /* (c) per-domain kva R/W/X arena. */
 	u_int vmd_domain;		/* (c) Domain number. */
 	u_int vmd_page_count;		/* (c) Total page count. */
 	long vmd_segs;			/* (c) bitmask of the segments */
 	/*
 	 * vmd_free_count begins a new cache line.  Together with
 	 * vmd_pageout_deficit and vmd_pad it occupies exactly
 	 * CACHE_LINE_SIZE bytes, so the fields that follow start on the
 	 * next line (presumably to avoid false sharing with the counters).
 	 */
 	u_int __aligned(CACHE_LINE_SIZE) vmd_free_count; /* (a,f) free page count */
 	u_int vmd_pageout_deficit;	/* (a) Estimated number of pages deficit */
 	uint8_t vmd_pad[CACHE_LINE_SIZE - (sizeof(u_int) * 2)];
 
 	/* Paging control variables, used within single threaded page daemon. */
 	struct pidctrl vmd_pid;		/* Pageout controller. */
 	boolean_t vmd_oom;
 	int vmd_oom_seq;
 	int vmd_last_active_scan;
 	struct vm_page vmd_markers[PQ_COUNT]; /* (q) markers for queue scans */
 	struct vm_page vmd_inacthead; /* marker for LRU-defeating insertions */
 	struct vm_page vmd_clock[2]; /* markers for active queue scan */
 
 	int vmd_pageout_wanted;		/* (a, p) pageout daemon wait channel */
 	int vmd_pageout_pages_needed;	/* (d) page daemon waiting for pages? */
 	bool vmd_minset;		/* (d) Are we in vm_min_domains? */
 	bool vmd_severeset;		/* (d) Are we in vm_severe_domains? */
 	enum {
 		VM_LAUNDRY_IDLE = 0,
 		VM_LAUNDRY_BACKGROUND,
 		VM_LAUNDRY_SHORTFALL
 	} vmd_laundry_request;
 
 	/* Paging thresholds and targets. */
 	u_int vmd_clean_pages_freed;	/* (q) accumulator for laundry thread */
 	u_int vmd_background_launder_target; /* (c) */
 	u_int vmd_free_reserved;	/* (c) pages reserved for deadlock */
 	u_int vmd_free_target;		/* (c) pages desired free */
 	u_int vmd_free_min;		/* (c) min free pages, see vm_paging_min() */
 	u_int vmd_inactive_target;	/* (c) pages desired inactive */
 	u_int vmd_pageout_free_min;	/* (c) min pages reserved for kernel */
 	u_int vmd_pageout_wakeup_thresh;/* (c) min pages to wake pagedaemon */
 	u_int vmd_interrupt_free_min;	/* (c) reserved pages for int code */
 	u_int vmd_free_severe;		/* (c) severe page depletion point */
 
 	/* Name for sysctl etc. */
 	struct sysctl_oid *vmd_oid;
 	char vmd_name[sizeof(__XSTRING(MAXMEMDOM))];
 } __aligned(CACHE_LINE_SIZE);
 
 /* Table of domains; VM_DOMAIN(n) yields a pointer to entry n. */
 extern struct vm_domain vm_dom[MAXMEMDOM];
 
 #define	VM_DOMAIN(n)	(&vm_dom[(n)])
 
 /*
  * Page queue lock operations.  Each takes a struct vm_pagequeue pointer
  * and acts on its pq_mutex.
  */
 #define	vm_pagequeue_assert_locked(pq)	mtx_assert(&(pq)->pq_mutex, MA_OWNED)
 #define	vm_pagequeue_lock(pq)		mtx_lock(&(pq)->pq_mutex)
 #define	vm_pagequeue_lockptr(pq)	(&(pq)->pq_mutex)
 #define	vm_pagequeue_trylock(pq)	mtx_trylock(&(pq)->pq_mutex)
 #define	vm_pagequeue_unlock(pq)		mtx_unlock(&(pq)->pq_mutex)
 
 /*
  * Operations on a domain's vmd_free_mtx.  The argument is a
  * struct vm_domain pointer; all of them go through
  * vm_domain_free_lockptr().
  */
 #define	vm_domain_free_assert_locked(n)					\
 	    mtx_assert(vm_domain_free_lockptr((n)), MA_OWNED)
 #define	vm_domain_free_assert_unlocked(n)				\
 	    mtx_assert(vm_domain_free_lockptr((n)), MA_NOTOWNED)
 #define	vm_domain_free_lock(d)						\
 	    mtx_lock(vm_domain_free_lockptr((d)))
 #define	vm_domain_free_lockptr(d)					\
 	    (&(d)->vmd_free_mtx)
 #define	vm_domain_free_trylock(d)					\
 	    mtx_trylock(vm_domain_free_lockptr((d)))
 #define	vm_domain_free_unlock(d)					\
 	    mtx_unlock(vm_domain_free_lockptr((d)))
 
 /*
  * Operations on a domain's vmd_pageout_mtx.  The argument is a
  * struct vm_domain pointer; all of them go through
  * vm_domain_pageout_lockptr().
  */
 #define	vm_domain_pageout_lockptr(d)					\
 	    (&(d)->vmd_pageout_mtx)
 #define	vm_domain_pageout_assert_locked(n)				\
 	    mtx_assert(vm_domain_pageout_lockptr((n)), MA_OWNED)
 #define	vm_domain_pageout_assert_unlocked(n)				\
 	    mtx_assert(vm_domain_pageout_lockptr((n)), MA_NOTOWNED)
 #define	vm_domain_pageout_lock(d)					\
 	    mtx_lock(vm_domain_pageout_lockptr((d)))
 #define	vm_domain_pageout_unlock(d)					\
 	    mtx_unlock(vm_domain_pageout_lockptr((d)))
 
 /*
  * Adjust the page count of `pq' by `addend' (negative values shrink it).
  * The page queue lock must be held; this is asserted.
  */
 static __inline void
 vm_pagequeue_cnt_add(struct vm_pagequeue *pq, int addend)
 {
 
 	vm_pagequeue_assert_locked(pq);
 	pq->pq_cnt = pq->pq_cnt + addend;
 }
 /* Single-step variants of vm_pagequeue_cnt_add(). */
 #define	vm_pagequeue_cnt_inc(pq)	vm_pagequeue_cnt_add((pq), 1)
 #define	vm_pagequeue_cnt_dec(pq)	vm_pagequeue_cnt_add((pq), -1)
 
 /*
  * Reset a batch queue to the empty state by zeroing its entry count.
  * The bq_pa slots themselves are not touched.
  */
 static inline void
 vm_batchqueue_init(struct vm_batchqueue *bq)
 {
 
 	bq->bq_cnt = 0;
 }
 
 /*
  * Append page `m' to the batch queue.  Returns true on success, or
  * false (leaving the queue unchanged) when every slot is already in use.
  */
 static inline bool
 vm_batchqueue_insert(struct vm_batchqueue *bq, vm_page_t m)
 {
 
 	/* Queue full: refuse the insertion. */
 	if (bq->bq_cnt >= nitems(bq->bq_pa))
 		return (false);
 
 	bq->bq_pa[bq->bq_cnt++] = m;
 	return (true);
 }
 
 /*
  * Remove and return the most recently inserted page, or NULL when the
  * batch queue holds no entries.
  */
 static inline vm_page_t
 vm_batchqueue_pop(struct vm_batchqueue *bq)
 {
 
 	return (bq->bq_cnt != 0 ? bq->bq_pa[--bq->bq_cnt] : NULL);
 }
 
 void vm_domain_set(struct vm_domain *vmd);
 void vm_domain_clear(struct vm_domain *vmd);
 int vm_domain_allocate(struct vm_domain *vmd, int req, int npages);
 
 /*
  *      vm_pagequeue_domain:
  *
  *      Look up the vm_domain for page `m': the entry of vm_dom[] selected
  *      by vm_phys_domain(m).
  */
 static inline struct vm_domain *
 vm_pagequeue_domain(vm_page_t m)
 {
 
 	return (&vm_dom[vm_phys_domain(m)]);
 }
 
 /*
  * Return the number of pages we need to free-up or cache
  * A positive number indicates that we do not have enough free pages.
  */
 static inline int
 vm_paging_target(struct vm_domain *vmd)
 {
 
 	return (vmd->vmd_free_target - vmd->vmd_free_count);
 }
 
 /*
  * Returns non-zero when `free_count' has dropped below the domain's
  * pageout wakeup threshold, meaning the pagedaemon should be woken.
  */
 static inline int
 vm_paging_needed(struct vm_domain *vmd, u_int free_count)
 {
 
 	return (vmd->vmd_pageout_wakeup_thresh > free_count);
 }
 
 /*
  * Returns non-zero when the domain's free page count is below
  * vmd_free_min, the minimum paging target.
  */
 static inline int
 vm_paging_min(struct vm_domain *vmd)
 {
 
 	return (vmd->vmd_free_count < vmd->vmd_free_min);
 }
 
 /*
  * Returns non-zero when the domain's free page count is below
  * vmd_free_severe, the severe paging target.
  */
 static inline int
 vm_paging_severe(struct vm_domain *vmd)
 {
 
 	return (vmd->vmd_free_count < vmd->vmd_free_severe);
 }
 
 /*
  * Return the number of pages that should be laundered.  This is
  * currently the same value as vm_paging_target(); a positive result
  * means the domain is short of clean pages.
  */
 static inline int
 vm_laundry_target(struct vm_domain *vmd)
 {
 	int shortfall;
 
 	shortfall = vm_paging_target(vmd);
 	return (shortfall);
 }
 
 void pagedaemon_wakeup(int domain);
 
 /*
  * Atomically add `adj' to the domain's free page count and, when the
  * count was below vmd_free_min beforehand and the addition carries it
  * across the free_min, free_severe or pageout_free_min threshold, call
  * vm_domain_clear().
  */
 static inline void
 vm_domain_freecnt_inc(struct vm_domain *vmd, int adj)
 {
 	u_int after, before;
 
 	before = atomic_fetchadd_int(&vmd->vmd_free_count, adj);
 	after = before + adj;
 
 	/*
 	 * Only update bitsets on transitions.  If we were already at or
 	 * above min there is nothing further to check.
 	 */
 	if (before >= vmd->vmd_free_min)
 		return;
 	if (after >= vmd->vmd_free_min ||
 	    (before < vmd->vmd_free_severe &&
 	    after >= vmd->vmd_free_severe) ||
 	    (before < vmd->vmd_pageout_free_min &&
 	    after >= vmd->vmd_pageout_free_min))
 		vm_domain_clear(vmd);
 }
 
 #endif	/* _KERNEL */
 #endif				/* !_VM_PAGEQUEUE_ */