diff --git a/lib/libc/gen/sysctl.3 b/lib/libc/gen/sysctl.3
index c4883e0aeb4d..4f805d45790a 100644
--- a/lib/libc/gen/sysctl.3
+++ b/lib/libc/gen/sysctl.3
@@ -1,845 +1,836 @@
 .\" Copyright (c) 1993
 .\"	The Regents of the University of California.  All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\" 3. Neither the name of the University nor the names of its contributors
 .\"    may be used to endorse or promote products derived from this software
 .\"    without specific prior written permission.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\"	@(#)sysctl.3	8.4 (Berkeley) 5/9/95
 .\" $FreeBSD$
 .\"
-.Dd October 18, 2021
+.Dd March 16, 2023
 .Dt SYSCTL 3
 .Os
 .Sh NAME
 .Nm sysctl ,
 .Nm sysctlbyname ,
 .Nm sysctlnametomib
 .Nd get or set system information
 .Sh LIBRARY
 .Lb libc
 .Sh SYNOPSIS
 .In sys/sysctl.h
 .Ft int
 .Fn sysctl "const int *name" "u_int namelen" "void *oldp" "size_t *oldlenp" "const void *newp" "size_t newlen"
 .Ft int
 .Fn sysctlbyname "const char *name" "void *oldp" "size_t *oldlenp" "const void *newp" "size_t newlen"
 .Ft int
 .Fn sysctlnametomib "const char *name" "int *mibp" "size_t *sizep"
 .Sh DESCRIPTION
 The
 .Fn sysctl
 function retrieves system information and allows processes with
 appropriate privileges to set system information.
 The information available from
 .Fn sysctl
 consists of integers, strings, and tables.
 Information may be retrieved and set from the command interface
 using the
 .Xr sysctl 8
 utility.
 .Pp
 Unless explicitly noted below,
 .Fn sysctl
 returns a consistent snapshot of the data requested.
 Consistency is obtained by locking the destination
 buffer into memory so that the data may be copied out without blocking.
 Calls to
 .Fn sysctl
 are serialized to avoid deadlock.
 .Pp
 The state is described using a ``Management Information Base'' (MIB)
 style name, listed in
 .Fa name ,
 which is a
 .Fa namelen
 length array of integers.
 .Pp
 The
 .Fn sysctlbyname
 function accepts an ASCII representation of the name and internally
 looks up the integer name vector.
 Apart from that, it behaves the same
 as the standard
 .Fn sysctl
 function.
 .Pp
 The information is copied into the buffer specified by
 .Fa oldp .
 The size of the buffer is given by the location specified by
 .Fa oldlenp
 before the call,
 and that location gives the amount of data copied after a successful call
 and after a call that returns with the error code
 .Er ENOMEM .
 If the amount of data available is greater
 than the size of the buffer supplied,
 the call supplies as much data as fits in the buffer provided
 and returns with the error code
 .Er ENOMEM .
 If the old value is not desired,
 .Fa oldp
 and
 .Fa oldlenp
 should be set to NULL.
 .Pp
 The size of the available data can be determined by calling
 .Fn sysctl
 with the
 .Dv NULL
 argument for
 .Fa oldp .
 The size of the available data will be returned in the location pointed to by
 .Fa oldlenp .
 For some operations, the amount of space may change often.
 For these operations,
 the system attempts to round up so that the returned size is
 large enough for a call to return the data shortly thereafter.
 .Pp
 To set a new value,
 .Fa newp
 is set to point to a buffer of length
 .Fa newlen
 from which the requested value is to be taken.
 If a new value is not to be set,
 .Fa newp
 should be set to NULL and
 .Fa newlen
 set to 0.
 .Pp
 The
 .Fn sysctlnametomib
 function accepts an ASCII representation of the name,
 looks up the integer name vector,
 and returns the numeric representation in the mib array pointed to by
 .Fa mibp .
 The number of elements in the mib array is given by the location specified by
 .Fa sizep
 before the call,
 and that location gives the number of entries copied after a successful call.
 The resulting
 .Fa mib
 and
 .Fa size
 may be used in subsequent
 .Fn sysctl
 calls to get the data associated with the requested ASCII name.
 This interface is intended for use by applications that want to
 repeatedly request the same variable (the
 .Fn sysctl
 function runs in about a third of the time of the same request made via the
 .Fn sysctlbyname
 function).
 The
 .Fn sysctlnametomib
 function is also useful for fetching mib prefixes and then adding
 a final component.
 For example, to fetch process information
 for processes with pids less than 100:
 .Pp
 .Bd -literal -offset indent -compact
 int i, mib[4];
 size_t len;
 struct kinfo_proc kp;
 
 /* Fill out the first three components of the mib */
 len = 4;
 sysctlnametomib("kern.proc.pid", mib, &len);
 
 /* Fetch and print entries for pid's < 100 */
 for (i = 0; i < 100; i++) {
 	mib[3] = i;
 	len = sizeof(kp);
 	if (sysctl(mib, 4, &kp, &len, NULL, 0) == -1)
 		perror("sysctl");
 	else if (len > 0)
 		printkproc(&kp);
 }
 .Ed
 .Pp
 The top level names are defined with a CTL_ prefix in
 .In sys/sysctl.h ,
 and are as follows.
 The next and subsequent levels down are found in the include files
 listed here, and described in separate sections below.
 .Bl -column CTLXMACHDEPXXX "Next Level NamesXXXXXX" -offset indent
 .It Sy Name Ta Sy Next Level Names Ta Sy Description
 .It Dv CTL_DEBUG Ta In sys/sysctl.h Ta Debugging
 .It Dv CTL_VFS Ta In sys/mount.h Ta File system
 .It Dv CTL_HW Ta In sys/sysctl.h Ta Generic CPU, I/O
 .It Dv CTL_KERN Ta In sys/sysctl.h Ta High kernel limits
 .It Dv CTL_MACHDEP Ta In sys/sysctl.h Ta Machine dependent
 .It Dv CTL_NET Ta In sys/socket.h Ta Networking
 .It Dv CTL_USER Ta In sys/sysctl.h Ta User-level
 .It Dv CTL_VM Ta In vm/vm_param.h Ta Virtual memory
 .El
 .Pp
 For example, the following retrieves the maximum number of processes allowed
 in the system:
 .Pp
 .Bd -literal -offset indent -compact
 int mib[2], maxproc;
 size_t len;
 
 mib[0] = CTL_KERN;
 mib[1] = KERN_MAXPROC;
 len = sizeof(maxproc);
 sysctl(mib, 2, &maxproc, &len, NULL, 0);
 .Ed
 .Pp
 To retrieve the standard search path for the system utilities:
 .Pp
 .Bd -literal -offset indent -compact
 int mib[2];
 size_t len;
 char *p;
 
 mib[0] = CTL_USER;
 mib[1] = USER_CS_PATH;
 sysctl(mib, 2, NULL, &len, NULL, 0);
 p = malloc(len);
 sysctl(mib, 2, p, &len, NULL, 0);
 .Ed
 .Ss CTL_DEBUG
 The debugging variables vary from system to system.
 A debugging variable may be added or deleted without need to recompile
 .Fn sysctl
 to know about it.
 Each time it runs,
 .Fn sysctl
 gets the list of debugging variables from the kernel and
 displays their current values.
 The system defines twenty
 .Pq Vt "struct ctldebug"
 variables named
 .Va debug0
 through
 .Va debug19 .
 They are declared as separate variables so that they can be
 individually initialized at the location of their associated variable.
 The loader prevents multiple use of the same variable by issuing errors
 if a variable is initialized in more than one place.
 For example, to export the variable
 .Va dospecialcheck
 as a debugging variable, the following declaration would be used:
 .Pp
 .Bd -literal -offset indent -compact
 int dospecialcheck = 1;
 struct ctldebug debug5 = { "dospecialcheck", &dospecialcheck };
 .Ed
 .Ss CTL_VFS
 A distinguished second level name, VFS_GENERIC,
 is used to get general information about all file systems.
 One of its third level identifiers is VFS_MAXTYPENUM
 that gives the highest valid file system type number.
 Its other third level identifier is VFS_CONF that
 returns configuration information about the file system
 type given as a fourth level identifier (see
 .Xr getvfsbyname 3
 as an example of its use).
 The remaining second level identifiers are the
 file system type number returned by a
 .Xr statfs 2
 call or from VFS_CONF.
 The third level identifiers available for each file system
 are given in the header file that defines the mount
 argument structure for that file system.
 .Ss CTL_HW
 The string and integer information available for the CTL_HW level
 is detailed below.
 The changeable column shows whether a process with appropriate
 privilege may change the value.
 .Bl -column "Second Level Name" integerXXX Changeable -offset indent
 .It Sy Second Level Name Ta Sy Type Ta Sy Changeable
 .It Dv HW_MACHINE Ta string Ta no
 .It Dv HW_MODEL Ta string Ta no
 .It Dv HW_NCPU Ta integer Ta no
 .It Dv HW_BYTEORDER Ta integer Ta no
 .It Dv HW_PHYSMEM Ta integer Ta no
 .It Dv HW_USERMEM Ta integer Ta no
 .It Dv HW_PAGESIZE Ta integer Ta no
 .\".It Dv HW_DISKNAMES Ta integer Ta no
 .\".It Dv HW_DISKSTATS Ta integer Ta no
 .It Dv HW_FLOATINGPT Ta integer Ta no
 .It Dv HW_MACHINE_ARCH Ta string Ta no
 .It Dv HW_REALMEM Ta integer Ta no
 .It Dv HW_AVAILPAGES Ta integer Ta no
 .El
 .Bl -tag -width 6n
 .It Li HW_MACHINE
 The machine class.
 .It Li HW_MODEL
 The machine model.
 .It Li HW_NCPU
 The number of CPUs.
 .It Li HW_BYTEORDER
 The byte order (4321 or 1234).
 .It Li HW_PHYSMEM
 Amount of physical memory (in bytes), minus the amount used by the kernel,
 pre-loaded modules, and (on x86) the dcons buffer.
 .It Li HW_USERMEM
 Amount of memory (in bytes) which is not wired.
 .It Li HW_PAGESIZE
 The software page size.
 .\".It Fa HW_DISKNAMES
 .\".It Fa HW_DISKSTATS
 .It Li HW_FLOATINGPT
 Nonzero if the floating point support is in hardware.
 .It Li HW_MACHINE_ARCH
 The machine dependent architecture type.
 .It Li HW_REALMEM
 Amount of memory (in bytes) reported by the firmware.
 That value is sometimes not sane; in that case, the kernel reports the max
 memory address instead.
 .It Li HW_AVAILPAGES
 The same value as
 .Li HW_PHYSMEM ,
 measured in pages rather than bytes.
 .El
 .Ss CTL_KERN
 The string and integer information available for the CTL_KERN level
 is detailed below.
 The changeable column shows whether a process with appropriate
 privilege may change the value.
 The types of data currently available are process information,
 system vnodes, the open file entries, routing table entries,
 virtual memory statistics, load average history, and clock rate
 information.
 .Bl -column "KERNXMAXFILESPERPROCXXX" "struct clockrateXXX" -offset indent
 .It Sy Second Level Name Ta Sy Type Ta Sy Changeable
 .It Dv KERN_ARGMAX Ta integer Ta no
 .It Dv KERN_BOOTFILE Ta string Ta yes
 .It Dv KERN_BOOTTIME Ta struct timeval Ta no
 .It Dv KERN_CLOCKRATE Ta struct clockinfo Ta no
 .It Dv KERN_FILE Ta struct xfile Ta no
 .It Dv KERN_HOSTID Ta integer Ta yes
 .It Dv KERN_HOSTUUID Ta string Ta yes
 .It Dv KERN_HOSTNAME Ta string Ta yes
 .It Dv KERN_JOB_CONTROL Ta integer Ta no
 .It Dv KERN_MAXFILES Ta integer Ta yes
 .It Dv KERN_MAXFILESPERPROC Ta integer Ta yes
 .It Dv KERN_MAXPROC Ta integer Ta no
 .It Dv KERN_MAXPROCPERUID Ta integer Ta yes
 .It Dv KERN_MAXVNODES Ta integer Ta yes
 .It Dv KERN_NGROUPS Ta integer Ta no
 .It Dv KERN_NISDOMAINNAME Ta string Ta yes
 .It Dv KERN_OSRELDATE Ta integer Ta no
 .It Dv KERN_OSRELEASE Ta string Ta no
 .It Dv KERN_OSREV Ta integer Ta no
 .It Dv KERN_OSTYPE Ta string Ta no
 .It Dv KERN_POSIX1 Ta integer Ta no
 .It Dv KERN_PROC Ta node Ta not applicable
 .It Dv KERN_QUANTUM Ta integer Ta yes
 .It Dv KERN_SAVED_IDS Ta integer Ta no
 .It Dv KERN_SECURELVL Ta integer Ta raise only
 .It Dv KERN_UPDATEINTERVAL Ta integer Ta no
 .It Dv KERN_VERSION Ta string Ta no
-.It Dv KERN_VNODE Ta struct xvnode Ta no
 .El
 .Bl -tag -width 6n
 .It Li KERN_ARGMAX
 The maximum bytes of argument to
 .Xr execve 2 .
 .It Li KERN_BOOTFILE
 The full pathname of the file from which the kernel was loaded.
 .It Li KERN_BOOTTIME
 A
 .Va struct timeval
 structure is returned.
 This structure contains the time that the system was booted.
 .It Li KERN_CLOCKRATE
 A
 .Va struct clockinfo
 structure is returned.
 This structure contains the clock, statistics clock and profiling clock
 frequencies, the number of microseconds per hz tick and the skew rate.
 .It Li KERN_FILE
 Return the entire file table.
 The returned data consists of an array of
 .Va struct xfile ,
 whose size depends on the current number of such objects in the system.
 .It Li KERN_HOSTID
 Get or set the host ID.
 .It Li KERN_HOSTUUID
 Get or set the host's universally unique identifier (UUID).
 .It Li KERN_HOSTNAME
 Get or set the hostname.
 .It Li KERN_JOB_CONTROL
 Return 1 if job control is available on this system, otherwise 0.
 .It Li KERN_MAXFILES
 The maximum number of files that may be open in the system.
 .It Li KERN_MAXFILESPERPROC
 The maximum number of files that may be open for a single process.
 This limit only applies to processes with a nonzero effective uid
 at the time of the open request.
 Files that have already been opened are not affected if the limit
 or the effective uid is changed.
 .It Li KERN_MAXPROC
 The maximum number of concurrent processes the system will allow.
 .It Li KERN_MAXPROCPERUID
 The maximum number of concurrent processes the system will allow
 for a single effective uid.
 This limit only applies to processes with a nonzero effective uid
 at the time of a fork request.
 Processes that have already been started are not affected if the limit
 is changed.
 .It Li KERN_MAXVNODES
 The maximum number of vnodes available on the system.
 .It Li KERN_NGROUPS
 The maximum number of supplemental groups.
 .It Li KERN_NISDOMAINNAME
 The name of the current YP/NIS domain.
 .It Li KERN_OSRELDATE
 The kernel release version in the format
 .Ar M Ns Ar mm Ns Ar R Ns Ar xx ,
 where
 .Ar M
 is the major version,
 .Ar mm
 is the two digit minor version,
 .Ar R
 is 0 if release branch, otherwise 1,
 and
 .Ar xx
 is updated when the available APIs change.
 .Pp
 The userland release version is available from
 .In osreldate.h ;
 parse this file if you need to get the release version of
 the currently installed userland.
 .It Li KERN_OSRELEASE
 The system release string.
 .It Li KERN_OSREV
 The system revision number.
 .It Li KERN_OSTYPE
 The system type string.
 .It Li KERN_POSIX1
 The version of
 .St -p1003.1
 with which the system
 attempts to comply.
 .It Li KERN_PROC
 Return selected information about specific running processes.
 .Pp
 For the following names, an array of
 .Va struct kinfo_proc
 structures is returned,
 whose size depends on the current number of such objects in the system.
 .Bl -column "Third Level NameXXXXXX" "Fourth LevelXXXXXX" -offset indent
 .It Sy Third Level Name Ta Sy Fourth Level
 .It Dv KERN_PROC_ALL Ta None
 .It Dv KERN_PROC_PID Ta A process ID
 .It Dv KERN_PROC_PGRP Ta A process group
 .It Dv KERN_PROC_TTY Ta A tty device
 .It Dv KERN_PROC_UID Ta A user ID
 .It Dv KERN_PROC_RUID Ta A real user ID
 .El
 .Pp
 If the third level name is
 .Dv KERN_PROC_ARGS
 then the command line argument
 array is returned in a flattened form, i.e., zero-terminated arguments
 follow each other.
 The total size of the array is returned.
 It is also possible for a process to set its own process title this way.
 If the third level name is
 .Dv KERN_PROC_PATHNAME ,
 the path of the
 process' text file is stored.
 For
 .Dv KERN_PROC_PATHNAME ,
 a process ID of
 .Li \-1
 implies the current process.
 .Bl -column "Third Level NameXXXXXX" "Fourth LevelXXXXXX" -offset indent
 .It Sy Third Level Name Ta Sy Fourth Level
 .It Dv KERN_PROC_ARGS Ta "A process ID"
 .It Dv KERN_PROC_PATHNAME Ta "A process ID"
 .El
 .It Li KERN_QUANTUM
 The maximum period of time, in microseconds, for which a process is allowed
 to run without being preempted if other processes are in the run queue.
 .It Li KERN_SAVED_IDS
 Returns 1 if saved set-group and saved set-user IDs are available.
 .It Li KERN_SECURELVL
 The system security level.
 This level may be raised by processes with appropriate privilege.
 It may not be lowered.
 .It Li KERN_VERSION
 The system version string.
-.It Li KERN_VNODE
-Return the entire vnode table.
-Note, the vnode table is not necessarily a consistent snapshot of
-the system.
-The returned data consists of an array whose size depends on the
-current number of such objects in the system.
-Each element of the array consists of a
-.Va struct xvnode .
 .El
 .Ss CTL_NET
 The string and integer information available for the CTL_NET level
 is detailed below.
 The changeable column shows whether a process with appropriate
 privilege may change the value.
 .Bl -column "Second Level NameXXXXXX" "routing messagesXXX" -offset indent
 .It Sy Second Level Name Ta Sy Type Ta Sy Changeable
 .It Dv PF_ROUTE Ta routing messages Ta no
 .It Dv PF_INET Ta IPv4 values Ta yes
 .It Dv PF_INET6 Ta IPv6 values Ta yes
 .El
 .Bl -tag -width 6n
 .It Li PF_ROUTE
 Return the entire routing table or a subset of it.
 The data is returned as a sequence of routing messages (see
 .Xr route 4
 for the header file, format and meaning).
 The length of each message is contained in the message header.
 .Pp
 The third level name is a protocol number, which is currently always 0.
 The fourth level name is an address family, which may be set to 0 to
 select all address families.
 The fifth, sixth, and seventh level names are as follows:
 .Bl -column -offset indent "Fifth Level" "Sixth Level" "Seventh Level"
 .It Sy Fifth level Ta Sy Sixth Level Ta Sy Seventh Level
 .It Dv NET_RT_FLAGS Ta rtflags Ta None
 .It Dv NET_RT_DUMP Ta None Ta None or fib number
 .It Dv NET_RT_IFLIST Ta 0 or if_index Ta None
 .It Dv NET_RT_IFMALIST Ta 0 or if_index Ta None
 .It Dv NET_RT_IFLISTL Ta 0 or if_index Ta None
 .It Dv NET_RT_NHOPS Ta None Ta fib number
 .El
 .Pp
 The
 .Dv NET_RT_IFMALIST
 name returns information about multicast group memberships on all interfaces
 if 0 is specified, or for the interface specified by
 .Va if_index .
 .Pp
 The
 .Dv NET_RT_IFLISTL
 is like
 .Dv NET_RT_IFLIST ,
 just returning message header structs with additional fields allowing the
 interface to be extended without breaking binary compatibility.
 The
 .Dv NET_RT_IFLISTL
 uses 'l' versions of the message header structures:
 .Va struct if_msghdrl
 and
 .Va struct ifa_msghdrl .
 .Pp
 .Dv NET_RT_NHOPS
 returns all nexthops for the specified address family in the given fib.
 .It Li PF_INET
 Get or set various global information about IPv4
 (Internet Protocol version 4).
 The third level name is the protocol.
 The fourth level name is the variable name.
 The currently defined protocols and names are:
 .Bl -column ProtocolXX VariableXX TypeXX ChangeableXX
 .It Sy Protocol Ta Sy Variable Ta Sy Type Ta Sy Changeable
 .It icmp Ta bmcastecho Ta integer Ta yes
 .It icmp Ta maskrepl Ta integer Ta yes
 .It ip Ta forwarding Ta integer Ta yes
 .It ip Ta redirect Ta integer Ta yes
 .It ip Ta ttl Ta integer Ta yes
 .It udp Ta checksum Ta integer Ta yes
 .El
 .Pp
 The variables are as follows:
 .Bl -tag -width 6n
 .It Li icmp.bmcastecho
 Returns 1 if an ICMP echo request to a broadcast or multicast address is
 to be answered.
 .It Li icmp.maskrepl
 Returns 1 if ICMP network mask requests are to be answered.
 .It Li ip.forwarding
 Returns 1 when IP forwarding is enabled for the host,
 meaning that the host is acting as a router.
 .It Li ip.redirect
 Returns 1 when ICMP redirects may be sent by the host.
 This option is ignored unless the host is routing IP packets,
 and should normally be enabled on all systems.
 .It Li ip.ttl
 The maximum time-to-live (hop count) value for an IP packet sourced by
 the system.
 This value applies to normal transport protocols, not to ICMP.
 .It Li udp.checksum
 Returns 1 when UDP checksums are being computed and checked.
 Disabling UDP checksums is strongly discouraged.
 .Pp
 For variables net.inet.*.ipsec, please refer to
 .Xr ipsec 4 .
 .El
 .It Li PF_INET6
 Get or set various global information about IPv6
 (Internet Protocol version 6).
 The third level name is the protocol.
 The fourth level name is the variable name.
 .Pp
 For variables net.inet6.* please refer to
 .Xr inet6 4 .
 For variables net.inet6.*.ipsec6, please refer to
 .Xr ipsec 4 .
 .El
 .Ss CTL_USER
 The string and integer information available for the CTL_USER level
 is detailed below.
 The changeable column shows whether a process with appropriate
 privilege may change the value.
 .Bl -column "USER_COLL_WEIGHTS_MAXXXX" "integerXXX" -offset indent
 .It Sy Second Level Name Ta Sy Type Ta Sy Changeable
 .It Dv USER_BC_BASE_MAX Ta integer Ta no
 .It Dv USER_BC_DIM_MAX Ta integer Ta no
 .It Dv USER_BC_SCALE_MAX Ta integer Ta no
 .It Dv USER_BC_STRING_MAX Ta integer Ta no
 .It Dv USER_COLL_WEIGHTS_MAX Ta integer Ta no
 .It Dv USER_CS_PATH Ta string Ta no
 .It Dv USER_EXPR_NEST_MAX Ta integer Ta no
 .It Dv USER_LINE_MAX Ta integer Ta no
 .It Dv USER_LOCALBASE Ta string Ta no
 .It Dv USER_POSIX2_CHAR_TERM Ta integer Ta no
 .It Dv USER_POSIX2_C_BIND Ta integer Ta no
 .It Dv USER_POSIX2_C_DEV Ta integer Ta no
 .It Dv USER_POSIX2_FORT_DEV Ta integer Ta no
 .It Dv USER_POSIX2_FORT_RUN Ta integer Ta no
 .It Dv USER_POSIX2_LOCALEDEF Ta integer Ta no
 .It Dv USER_POSIX2_SW_DEV Ta integer Ta no
 .It Dv USER_POSIX2_UPE Ta integer Ta no
 .It Dv USER_POSIX2_VERSION Ta integer Ta no
 .It Dv USER_RE_DUP_MAX Ta integer Ta no
 .It Dv USER_STREAM_MAX Ta integer Ta no
 .It Dv USER_TZNAME_MAX Ta integer Ta no
 .El
 .Bl -tag -width 6n
 .It Li USER_BC_BASE_MAX
 The maximum ibase/obase values in the
 .Xr bc 1
 utility.
 .It Li USER_BC_DIM_MAX
 The maximum array size in the
 .Xr bc 1
 utility.
 .It Li USER_BC_SCALE_MAX
 The maximum scale value in the
 .Xr bc 1
 utility.
 .It Li USER_BC_STRING_MAX
 The maximum string length in the
 .Xr bc 1
 utility.
 .It Li USER_COLL_WEIGHTS_MAX
 The maximum number of weights that can be assigned to any entry of
 the LC_COLLATE order keyword in the locale definition file.
 .It Li USER_CS_PATH
 Return a value for the
 .Ev PATH
 environment variable that finds all the standard utilities.
 .It Li USER_EXPR_NEST_MAX
 The maximum number of expressions that can be nested within
 parentheses by the
 .Xr expr 1
 utility.
 .It Li USER_LINE_MAX
 The maximum length in bytes of a text-processing utility's input
 line.
 .It Li USER_LOCALBASE
 Return the value of localbase that has been compiled into system utilities
 that need to have access to resources provided by a port or package.
 .It Li USER_POSIX2_CHAR_TERM
 Return 1 if the system supports at least one terminal type capable of
 all operations described in
 .St -p1003.2 ,
 otherwise 0.
 .It Li USER_POSIX2_C_BIND
 Return 1 if the system's C-language development facilities support the
 C-Language Bindings Option, otherwise 0.
 .It Li USER_POSIX2_C_DEV
 Return 1 if the system supports the C-Language Development Utilities Option,
 otherwise 0.
 .It Li USER_POSIX2_FORT_DEV
 Return 1 if the system supports the FORTRAN Development Utilities Option,
 otherwise 0.
 .It Li USER_POSIX2_FORT_RUN
 Return 1 if the system supports the FORTRAN Runtime Utilities Option,
 otherwise 0.
 .It Li USER_POSIX2_LOCALEDEF
 Return 1 if the system supports the creation of locales, otherwise 0.
 .It Li USER_POSIX2_SW_DEV
 Return 1 if the system supports the Software Development Utilities Option,
 otherwise 0.
 .It Li USER_POSIX2_UPE
 Return 1 if the system supports the User Portability Utilities Option,
 otherwise 0.
 .It Li USER_POSIX2_VERSION
 The version of
 .St -p1003.2
 with which the system attempts to comply.
 .It Li USER_RE_DUP_MAX
 The maximum number of repeated occurrences of a regular expression
 permitted when using interval notation.
 .It Li USER_STREAM_MAX
 The minimum maximum number of streams that a process may have open
 at any one time.
 .It Li USER_TZNAME_MAX
 The minimum maximum number of types supported for the name of a
 timezone.
 .El
 .Ss CTL_VM
 The string and integer information available for the CTL_VM level
 is detailed below.
 The changeable column shows whether a process with appropriate
 privilege may change the value.
 .Bl -column "Second Level NameXXXXXX" "struct loadavgXXX" -offset indent
 .It Sy Second Level Name Ta Sy Type Ta Sy Changeable
 .It Dv VM_LOADAVG Ta struct loadavg Ta no
 .It Dv VM_TOTAL Ta struct vmtotal Ta no
 .It Dv VM_SWAPPING_ENABLED Ta integer Ta maybe
 .It Dv VM_V_FREE_MIN Ta integer Ta yes
 .It Dv VM_V_FREE_RESERVED Ta integer Ta yes
 .It Dv VM_V_FREE_TARGET Ta integer Ta yes
 .It Dv VM_V_INACTIVE_TARGET Ta integer Ta yes
 .It Dv VM_V_PAGEOUT_FREE_MIN Ta integer Ta yes
 .It Dv VM_OVERCOMMIT Ta integer Ta yes
 .El
 .Bl -tag -width 6n
 .It Li VM_LOADAVG
 Return the load average history.
 The returned data consists of a
 .Va struct loadavg .
 .It Li VM_TOTAL
 Return the system wide virtual memory statistics.
 The returned data consists of a
 .Va struct vmtotal .
 .It Li VM_SWAPPING_ENABLED
 1 if process swapping is enabled or 0 if disabled.
 This variable is
 permanently set to 0 if the kernel was built with swapping disabled.
 .It Li VM_V_FREE_MIN
 Minimum amount of memory (cache memory plus free memory)
 required to be available before a process waiting on memory will be
 awakened.
 .It Li VM_V_FREE_RESERVED
 Processes will awaken the pageout daemon and wait for memory if the
 number of free and cached pages drops below this value.
 .It Li VM_V_FREE_TARGET
 The total amount of free memory (including cache memory) that the
 pageout daemon tries to maintain.
 .It Li VM_V_INACTIVE_TARGET
 The desired number of inactive pages that the pageout daemon should
 achieve when it runs.
 Inactive pages can be quickly inserted into
 process address space when needed.
 .It Li VM_V_PAGEOUT_FREE_MIN
 If the amount of free and cache memory falls below this value, the
 pageout daemon will enter "memory conserving mode" to avoid deadlock.
 .It Li VM_OVERCOMMIT
 Overcommit behaviour, as described in
 .Xr tuning 7 .
 .El
 .Sh RETURN VALUES
 .Rv -std
 .Sh FILES
 .Bl -tag -width <netinet/icmpXvar.h> -compact
 .It In sys/sysctl.h
 definitions for top level identifiers, second level kernel and hardware
 identifiers, and user level identifiers
 .It In sys/socket.h
 definitions for second level network identifiers
 .It In sys/gmon.h
 definitions for third level profiling identifiers
 .It In vm/vm_param.h
 definitions for second level virtual memory identifiers
 .It In netinet/in.h
 definitions for third level IPv4/IPv6 identifiers and
 fourth level IPv4/v6 identifiers
 .It In netinet/icmp_var.h
 definitions for fourth level ICMP identifiers
 .It In netinet/icmp6.h
 definitions for fourth level ICMPv6 identifiers
 .It In netinet/udp_var.h
 definitions for fourth level UDP identifiers
 .El
 .Sh ERRORS
 The following errors may be reported:
 .Bl -tag -width Er
 .It Bq Er EFAULT
 The buffer
 .Fa name ,
 .Fa oldp ,
 .Fa newp ,
 or length pointer
 .Fa oldlenp
 contains an invalid address.
 .It Bq Er EINVAL
 The
 .Fa name
 array length is less than two or greater than CTL_MAXNAME.
 .It Bq Er EINVAL
 A non-null
 .Fa newp
 is given and its specified length in
 .Fa newlen
 is too large or too small.
 .It Bq Er ENOMEM
 The length pointed to by
 .Fa oldlenp
 is too short to hold the requested value.
 .It Bq Er ENOMEM
 The smaller of either the length pointed to by
 .Fa oldlenp
 or the estimated size of the returned data exceeds the
 system limit on locked memory.
 .It Bq Er ENOMEM
 Locking the buffer
 .Fa oldp ,
 or a portion of the buffer if the estimated size of the data
 to be returned is smaller,
 would cause the process to exceed its per-process locked memory limit.
 .It Bq Er ENOTDIR
 The
 .Fa name
 array specifies an intermediate rather than terminal name.
 .It Bq Er EISDIR
 The
 .Fa name
 array specifies a terminal name, but the actual name is not terminal.
 .It Bq Er ENOENT
 The
 .Fa name
 array specifies a value that is unknown.
 .It Bq Er EPERM
 An attempt is made to set a read-only value.
 .It Bq Er EPERM
 A process without appropriate privilege attempts to set a value.
 .El
 .Sh SEE ALSO
 .Xr confstr 3 ,
 .Xr kvm 3 ,
 .Xr sysconf 3 ,
 .Xr sysctl 8
 .Sh HISTORY
 The
 .Fn sysctl
 function first appeared in
 .Bx 4.4 .
diff --git a/sys/kern/kern_xxx.c b/sys/kern/kern_xxx.c
index f95246f1cf38..b0026777a389 100644
--- a/sys/kern/kern_xxx.c
+++ b/sys/kern/kern_xxx.c
@@ -1,430 +1,423 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_xxx.c	8.2 (Berkeley) 11/14/93
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/kernel.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/utsname.h>
 
 #include <vm/vm_param.h>
 
 #if defined(COMPAT_43)
 
 /*
  * Old gethostname() entry point: forwards to userland_sysctl() on the
  * { CTL_KERN, KERN_HOSTNAME } node, with uap->hostname as the output
  * buffer and uap->len as its length.
  */
 int
 ogethostname(struct thread *td, struct ogethostname_args *uap)
 {
 	int mib[2] = { CTL_KERN, KERN_HOSTNAME };
 	size_t buflen;
 
 	buflen = uap->len;
 	return (userland_sysctl(td, mib, 2, uap->hostname, &buflen,
 	    1, 0, 0, 0, 0));
 }
 
 /*
  * Old sethostname() entry point: forwards to userland_sysctl() on the
  * { CTL_KERN, KERN_HOSTNAME } node, passing uap->hostname / uap->len as
  * the new value and supplying no output buffer.
  */
 int
 osethostname(struct thread *td, struct osethostname_args *uap)
 {
 	int mib[2] = { CTL_KERN, KERN_HOSTNAME };
 
 	return (userland_sysctl(td, mib, 2, 0, 0, 0, uap->hostname,
 	    uap->len, 0, 0));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ogethostid_args {
 	int	dummy;
 };
 #endif
 /*
  * Old gethostid() entry point: reads the { CTL_KERN, KERN_HOSTID } node
  * through kernel_sysctl(), storing sizeof(long) bytes directly into
  * td->td_retval.  The uap argument is not used.
  */
 /* ARGSUSED */
 int
 ogethostid(struct thread *td, struct ogethostid_args *uap)
 {
 	int mib[2] = { CTL_KERN, KERN_HOSTID };
 	size_t len;
 
 	len = sizeof(long);
 	return (kernel_sysctl(td, mib, 2, (long *)td->td_retval, &len,
 	    NULL, 0, NULL, 0));
 }
 
 /*
  * Old sethostid() entry point: writes uap->hostid to the
  * { CTL_KERN, KERN_HOSTID } node through kernel_sysctl(); no old value
  * is requested.
  */
 int
 osethostid(struct thread *td, struct osethostid_args *uap)
 {
 	int mib[2] = { CTL_KERN, KERN_HOSTID };
 
 	return (kernel_sysctl(td, mib, 2, NULL, NULL, &uap->hostid,
 	    sizeof(uap->hostid), NULL, 0));
 }
 
 /*
  * Old quota() entry point.  Not implemented: neither argument is
  * examined and the call always fails with ENOSYS.
  */
 int
 oquota(struct thread *td, struct oquota_args *uap)
 {
 
 	return (ENOSYS);
 }
 
 /*
  * Operation selectors for ogetkerninfo().  Each value occupies bits 8-15
  * of the "op" argument; the handler dispatches on (op & 0xff00).
  */
 #define	KINFO_PROC		(0<<8)
 #define	KINFO_RT		(1<<8)
-#define	KINFO_VNODE		(2<<8)
+/* UNUSED, was KINFO_VNODE (2<<8) */
 #define	KINFO_FILE		(3<<8)
 #define	KINFO_METER		(4<<8)
 #define	KINFO_LOADAVG		(5<<8)
 #define	KINFO_CLOCKRATE		(6<<8)
 
 /* Non-standard BSDI extension - only present on their 4.3 net-2 releases */
 #define	KINFO_BSDI_SYSINFO	(101<<8)
 
 /*
  * XXX this is bloat, but I hope it's better here than on the potentially
  * limited kernel stack...  -Peter
  */
 
 /*
  * Reply image for KINFO_BSDI_SYSINFO.  The bsdi_* members hold byte
  * offsets, measured from the start of this structure, of strings that
  * are copied out immediately after it.  ogetkerninfo() zeroes the whole
  * structure on every request and then fills in bsdi_ostype,
  * bsdi_osrelease and bsdi_machine only.
  */
 static struct {
 	int	bsdi_machine;		/* "i386" on BSD/386 */
 /*      ^^^ this is an offset to the string, relative to the struct start */
 	char	*pad0;
 	long	pad1;
 	long	pad2;
 	long	pad3;
 	u_long	pad4;
 	u_long	pad5;
 	u_long	pad6;
 
 	int	bsdi_ostype;		/* "BSD/386" on BSD/386 */
 	int	bsdi_osrelease;		/* "1.1" on BSD/386 */
 	long	pad7;
 	long	pad8;
 	char	*pad9;
 
 	long	pad10;
 	long	pad11;
 	int	pad12;
 	long	pad13;
 	quad_t	pad14;
 	long	pad15;
 
 	struct	timeval pad16;
 	/* we don't set this, because BSDI's uname used gethostname() instead */
 	int	bsdi_hostname;		/* hostname on BSD/386 */
 
 	/* the actual string data is appended here */
 
 } bsdi_si;
 
 /*
  * this data is appended to the end of the bsdi_si structure during copyout.
  * The "char *" offsets are relative to the base of the bsdi_si struct.
  * This contains "FreeBSD\02.0-BUILT-nnnnnn\0i386\0", and these strings
  * should not exceed the length of the buffer here... (or else!! :-)
  */
 static char bsdi_strings[80];	/* It had better be less than this! */
 
 int
 ogetkerninfo(struct thread *td, struct ogetkerninfo_args *uap)
 {
 	int error, name[6];
 	size_t size;
 	u_int needed = 0;
 
 	switch (uap->op & 0xff00) {
 	case KINFO_RT:
 		name[0] = CTL_NET;
 		name[1] = PF_ROUTE;
 		name[2] = 0;
 		name[3] = (uap->op & 0xff0000) >> 16;
 		name[4] = uap->op & 0xff;
 		name[5] = uap->arg;
 		error = userland_sysctl(td, name, 6, uap->where, uap->size,
 			0, 0, 0, &size, 0);
 		break;
 
-	case KINFO_VNODE:
-		name[0] = CTL_KERN;
-		name[1] = KERN_VNODE;
-		error = userland_sysctl(td, name, 2, uap->where, uap->size,
-			0, 0, 0, &size, 0);
-		break;
-
 	case KINFO_PROC:
 		name[0] = CTL_KERN;
 		name[1] = KERN_PROC;
 		name[2] = uap->op & 0xff;
 		name[3] = uap->arg;
 		error = userland_sysctl(td, name, 4, uap->where, uap->size,
 			0, 0, 0, &size, 0);
 		break;
 
 	case KINFO_FILE:
 		name[0] = CTL_KERN;
 		name[1] = KERN_FILE;
 		error = userland_sysctl(td, name, 2, uap->where, uap->size,
 			0, 0, 0, &size, 0);
 		break;
 
 	case KINFO_METER:
 		name[0] = CTL_VM;
 		name[1] = VM_TOTAL;
 		error = userland_sysctl(td, name, 2, uap->where, uap->size,
 			0, 0, 0, &size, 0);
 		break;
 
 	case KINFO_LOADAVG:
 		name[0] = CTL_VM;
 		name[1] = VM_LOADAVG;
 		error = userland_sysctl(td, name, 2, uap->where, uap->size,
 			0, 0, 0, &size, 0);
 		break;
 
 	case KINFO_CLOCKRATE:
 		name[0] = CTL_KERN;
 		name[1] = KERN_CLOCKRATE;
 		error = userland_sysctl(td, name, 2, uap->where, uap->size,
 			0, 0, 0, &size, 0);
 		break;
 
 	case KINFO_BSDI_SYSINFO: {
 		/*
 		 * this is pretty crude, but it's just enough for uname()
 		 * from BSDI's 1.x libc to work.
 		 *
 		 * *size gives the size of the buffer before the call, and
 		 * the amount of data copied after a successful call.
 		 * If successful, the return value is the amount of data
 		 * available, which can be larger than *size.
 		 *
 		 * BSDI's 2.x product apparently fails with ENOMEM if *size
 		 * is too small.
 		 */
 
 		u_int left;
 		char *s;
 
 		bzero((char *)&bsdi_si, sizeof(bsdi_si));
 		bzero(bsdi_strings, sizeof(bsdi_strings));
 
 		s = bsdi_strings;
 
 		bsdi_si.bsdi_ostype = (s - bsdi_strings) + sizeof(bsdi_si);
 		strcpy(s, ostype);
 		s += strlen(s) + 1;
 
 		bsdi_si.bsdi_osrelease = (s - bsdi_strings) + sizeof(bsdi_si);
 		strcpy(s, osrelease);
 		s += strlen(s) + 1;
 
 		bsdi_si.bsdi_machine = (s - bsdi_strings) + sizeof(bsdi_si);
 		strcpy(s, machine);
 		s += strlen(s) + 1;
 
 		needed = sizeof(bsdi_si) + (s - bsdi_strings);
 
 		if ((uap->where == NULL) || (uap->size == NULL)) {
 			/* process is asking how much buffer to supply.. */
 			size = needed;
 			error = 0;
 			break;
 		}
 
 		if ((error = copyin(uap->size, &size, sizeof(size))) != 0)
 			break;
 
 		/* if too much buffer supplied, trim it down */
 		if (size > needed)
 			size = needed;
 
 		/* how much of the buffer is remaining */
 		left = size;
 
 		if ((error = copyout((char *)&bsdi_si, uap->where, left)) != 0)
 			break;
 
 		/* is there any point in continuing? */
 		if (left > sizeof(bsdi_si)) {
 			left -= sizeof(bsdi_si);
 			error = copyout(&bsdi_strings,
 					uap->where + sizeof(bsdi_si), left);
 		}
 		break;
 	}
 
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 	if (error == 0) {
 		td->td_retval[0] = needed ? needed : size;
 		if (uap->size) {
 			error = copyout(&size, uap->size, sizeof(size));
 		}
 	}
 	return (error);
 }
 #endif /* COMPAT_43 */
 
 #ifdef COMPAT_FREEBSD4
 /*
  * This is the FreeBSD-1.1 compatible uname(2) interface.  These days it is
  * done in libc as a wrapper around a bunch of sysctl's.  This must maintain
  * the old 1.1 binary ABI.
  */
 #if SYS_NMLN != 32
 #error "FreeBSD-1.1 uname syscall has been broken"
 #endif
 #ifndef _SYS_SYSPROTO_H_
 struct uname_args {
 	struct utsname  *name;
 };
 #endif
 /* ARGSUSED */
 int
 freebsd4_uname(struct thread *td, struct freebsd4_uname_args *uap)
 {
 	int name[2], error;
 	size_t len;
 	char *s, *us;
 
 	name[0] = CTL_KERN;
 	name[1] = KERN_OSTYPE;
 	len = sizeof (uap->name->sysname);
 	error = userland_sysctl(td, name, 2, uap->name->sysname, &len, 
 		1, 0, 0, 0, 0);
 	if (error)
 		return (error);
 	subyte( uap->name->sysname + sizeof(uap->name->sysname) - 1, 0);
 
 	name[1] = KERN_HOSTNAME;
 	len = sizeof uap->name->nodename;
 	error = userland_sysctl(td, name, 2, uap->name->nodename, &len, 
 		1, 0, 0, 0, 0);
 	if (error)
 		return (error);
 	subyte( uap->name->nodename + sizeof(uap->name->nodename) - 1, 0);
 
 	name[1] = KERN_OSRELEASE;
 	len = sizeof uap->name->release;
 	error = userland_sysctl(td, name, 2, uap->name->release, &len, 
 		1, 0, 0, 0, 0);
 	if (error)
 		return (error);
 	subyte( uap->name->release + sizeof(uap->name->release) - 1, 0);
 
 /*
 	name = KERN_VERSION;
 	len = sizeof uap->name->version;
 	error = userland_sysctl(td, name, 2, uap->name->version, &len, 
 		1, 0, 0, 0, 0);
 	if (error)
 		return (error);
 	subyte( uap->name->version + sizeof(uap->name->version) - 1, 0);
 */
 
 /*
  * this stupid hackery to make the version field look like FreeBSD 1.1
  */
 	for(s = version; *s && *s != '#'; s++);
 
 	for(us = uap->name->version; *s && *s != ':'; s++) {
 		error = subyte( us++, *s);
 		if (error)
 			return (error);
 	}
 	error = subyte( us++, 0);
 	if (error)
 		return (error);
 
 	name[0] = CTL_HW;
 	name[1] = HW_MACHINE;
 	len = sizeof uap->name->machine;
 	error = userland_sysctl(td, name, 2, uap->name->machine, &len, 
 		1, 0, 0, 0, 0);
 	if (error)
 		return (error);
 	subyte( uap->name->machine + sizeof(uap->name->machine) - 1, 0);
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct getdomainname_args {
 	char    *domainname;
 	int     len;
 };
 #endif
 /* ARGSUSED */
 /*
  * FreeBSD 4 compatible getdomainname(): forwards to userland_sysctl() on
  * the { CTL_KERN, KERN_NISDOMAINNAME } node, with uap->domainname as the
  * output buffer and uap->len as its length.
  */
 int
 freebsd4_getdomainname(struct thread *td,
     struct freebsd4_getdomainname_args *uap)
 {
 	int mib[2] = { CTL_KERN, KERN_NISDOMAINNAME };
 	size_t buflen;
 
 	buflen = uap->len;
 	return (userland_sysctl(td, mib, 2, uap->domainname, &buflen,
 	    1, 0, 0, 0, 0));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct setdomainname_args {
 	char    *domainname;
 	int     len;
 };
 #endif
 /* ARGSUSED */
 /*
  * FreeBSD 4 compatible setdomainname(): forwards to userland_sysctl() on
  * the { CTL_KERN, KERN_NISDOMAINNAME } node, passing uap->domainname /
  * uap->len as the new value and supplying no output buffer.
  */
 int
 freebsd4_setdomainname(struct thread *td,
     struct freebsd4_setdomainname_args *uap)
 {
 	int mib[2] = { CTL_KERN, KERN_NISDOMAINNAME };
 
 	return (userland_sysctl(td, mib, 2, 0, 0, 0, uap->domainname,
 	    uap->len, 0, 0));
 }
 #endif /* COMPAT_FREEBSD4 */
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 0b9d7ca9778f..c117ada017c9 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -1,7195 +1,7098 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
  */
 
 /*
  * External virtual filesystem routines
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_watchdog.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/asan.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/capsicum.h>
 #include <sys/condvar.h>
 #include <sys/conf.h>
 #include <sys/counter.h>
 #include <sys/dirent.h>
 #include <sys/event.h>
 #include <sys/eventhandler.h>
 #include <sys/extattr.h>
 #include <sys/file.h>
 #include <sys/fcntl.h>
 #include <sys/jail.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/ktr.h>
 #include <sys/limits.h>
 #include <sys/lockf.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/pctrie.h>
 #include <sys/priv.h>
 #include <sys/reboot.h>
 #include <sys/refcount.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/sleepqueue.h>
 #include <sys/smr.h>
 #include <sys/smp.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 #include <sys/watchdog.h>
 
 #include <machine/stdarg.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_kern.h>
 #include <vm/uma.h>
 
 #if defined(DEBUG_VFS_LOCKS) && (!defined(INVARIANTS) || !defined(WITNESS))
 #error DEBUG_VFS_LOCKS requires INVARIANTS and WITNESS
 #endif
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 static void	delmntque(struct vnode *vp);
 static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
 		    int slpflag, int slptimeo);
 static void	syncer_shutdown(void *arg, int howto);
 static int	vtryrecycle(struct vnode *vp);
 static void	v_init_counters(struct vnode *);
 static void	vn_seqc_init(struct vnode *);
 static void	vn_seqc_write_end_free(struct vnode *vp);
 static void	vgonel(struct vnode *);
 static bool	vhold_recycle_free(struct vnode *);
 static void	vdropl_recycle(struct vnode *vp);
 static void	vdrop_recycle(struct vnode *vp);
 static void	vfs_knllock(void *arg);
 static void	vfs_knlunlock(void *arg);
 static void	vfs_knl_assert_lock(void *arg, int what);
 static void	destroy_vpollinfo(struct vpollinfo *vi);
 static int	v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
 		    daddr_t startlbn, daddr_t endlbn);
 static void	vnlru_recalc(void);
 
 /*
  * Number of vnodes in existence.  Increased whenever getnewvnode()
  * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode.
  */
 static u_long __exclusive_cache_line numvnodes;
 
 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
     "Number of vnodes in existence");
 
 static counter_u64_t vnodes_created;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
     "Number of vnodes created by getnewvnode");
 
 /*
  * Conversion tables for conversion from vnode types to inode formats
  * and back.
  */
 enum vtype iftovt_tab[16] = {
 	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
 	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
 };
 int vttoif_tab[10] = {
 	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
 	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
 };
 
 /*
  * List of allocated vnodes in the system.
  */
 static TAILQ_HEAD(freelst, vnode) vnode_list;
 static struct vnode *vnode_list_free_marker;
 static struct vnode *vnode_list_reclaim_marker;
 
 /*
  * "Free" vnode target.  Free vnodes are rarely completely free, but are
  * just ones that are cheap to recycle.  Usually they are for files which
  * have been stat'd but not read; these usually have inode and namecache
  * data attached to them.  This target is the preferred minimum size of a
  * sub-cache consisting mostly of such files. The system balances the size
  * of this sub-cache with its complement to try to prevent either from
  * thrashing while the other is relatively inactive.  The targets express
  * a preference for the best balance.
  *
  * "Above" this target there are 2 further targets (watermarks) related
  * to recycling of free vnodes.  In the best-operating case, the cache is
  * exactly full, the free list has size between vlowat and vhiwat above the
  * free target, and recycling from it and normal use maintains this state.
  * Sometimes the free list is below vlowat or even empty, but this state
  * is even better for immediate use provided the cache is not full.
  * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
  * ones) to reach one of these states.  The watermarks are currently hard-
  * coded as 4% and 9% of the available space higher.  These and the default
  * of 25% for wantfreevnodes are too large if the memory size is large.
  * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
  * whenever vnlru_proc() becomes active.
  */
 static long wantfreevnodes;
 static long __exclusive_cache_line freevnodes;
 SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
     &freevnodes, 0, "Number of \"free\" vnodes");
 static long freevnodes_old;
 
 static counter_u64_t recycles_count;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count,
     "Number of vnodes recycled to meet vnode cache targets");
 
 static counter_u64_t recycles_free_count;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD, &recycles_free_count,
     "Number of free vnodes recycled to meet vnode cache targets");
 
 static counter_u64_t deferred_inact;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD, &deferred_inact,
     "Number of times inactive processing was deferred");
 
 /* To keep more than one thread at a time from running vfs_getnewfsid */
 static struct mtx mntid_mtx;
 
 /*
  * Lock for any access to the following:
  *	vnode_list
  *	numvnodes
  *	freevnodes
  */
 static struct mtx __exclusive_cache_line vnode_list_mtx;
 
 /* Publicly exported FS */
 struct nfs_public nfs_pub;
 
 static uma_zone_t buf_trie_zone;
 static smr_t buf_trie_smr;
 
 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
 static uma_zone_t vnode_zone;
 MALLOC_DEFINE(M_VNODEPOLL, "VN POLL", "vnode poll");
 
 __read_frequently smr_t vfs_smr;
 
 /*
  * The workitem queue.
  *
  * It is useful to delay writes of file data and filesystem metadata
  * for tens of seconds so that quickly created and deleted files need
  * not waste disk bandwidth being created and removed. To realize this,
  * we append vnodes to a "workitem" queue. When running with a soft
  * updates implementation, most pending metadata dependencies should
  * not wait for more than a few seconds. Thus, updates to mounted block
  * devices are delayed only about half the time that file data is delayed.
  * Similarly, directory updates are more critical, so are only delayed
  * about a third the time that file data is delayed. Thus, there are
  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
  * one each second (driven off the filesystem syncer process). The
  * syncer_delayno variable indicates the next queue that is to be processed.
  * Items that need to be processed soon are placed in this queue:
  *
  *	syncer_workitem_pending[syncer_delayno]
  *
  * A delay of fifteen seconds is done by placing the request fifteen
  * entries later in the queue:
  *
  *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
  *
  */
 static int syncer_delayno;
 static long syncer_mask;
 LIST_HEAD(synclist, bufobj);
 static struct synclist *syncer_workitem_pending;
 /*
  * The sync_mtx protects:
  *	bo->bo_synclist
  *	sync_vnode_count
  *	syncer_delayno
  *	syncer_state
  *	syncer_workitem_pending
  *	syncer_worklist_len
  *	rushjob
  */
 static struct mtx sync_mtx;
 static struct cv sync_wakeup;
 
 #define SYNCER_MAXDELAY		32
 static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
 static int syncdelay = 30;		/* max time to delay syncing data */
 static int filedelay = 30;		/* time to delay syncing files */
 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
     "Time to delay syncing files (in seconds)");
 static int dirdelay = 29;		/* time to delay syncing directories */
 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
     "Time to delay syncing directories (in seconds)");
 static int metadelay = 28;		/* time to delay syncing metadata */
 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
     "Time to delay syncing metadata (in seconds)");
 static int rushjob;		/* number of slots to run ASAP */
 static int stat_rush_requests;	/* number of times I/O speeded up */
 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
     "Number of times I/O speeded up (rush requests)");
 
 #define	VDBATCH_SIZE 8
 struct vdbatch {
 	u_int index;
 	long freevnodes;
 	struct mtx lock;
 	struct vnode *tab[VDBATCH_SIZE];
 };
 DPCPU_DEFINE_STATIC(struct vdbatch, vd);
 
 static void	vdbatch_dequeue(struct vnode *vp);
 
 /*
  * When shutting down the syncer, run it at four times normal speed.
  */
 #define SYNCER_SHUTDOWN_SPEEDUP		4
 static int sync_vnode_count;
 static int syncer_worklist_len;
 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
     syncer_state;
 
 /* Target for maximum number of vnodes. */
 u_long desiredvnodes;
 static u_long gapvnodes;		/* gap between wanted and desired */
 static u_long vhiwat;		/* enough extras after expansion */
 static u_long vlowat;		/* minimal extras before expansion */
 static u_long vstir;		/* nonzero to stir non-free vnodes */
 static volatile int vsmalltrigger = 8;	/* pref to keep if > this many pages */
 
 static u_long vnlru_read_freevnodes(void);
 
 /*
  * Note that no attempt is made to sanitize these parameters.
  */
 static int
 sysctl_maxvnodes(SYSCTL_HANDLER_ARGS)
 {
 	u_long newmax;
 	int rc;
 
 	/* Run the generic long handler on a private copy of the target. */
 	newmax = desiredvnodes;
 	rc = sysctl_handle_long(oidp, &newmax, 0, req);
 	if (rc != 0)
 		return (rc);
 
 	/* Nothing to do for a pure read or an unchanged value. */
 	if (req->newptr == NULL || newmax == desiredvnodes)
 		return (0);
 
 	/* Install the new target and rederive the values that depend on it. */
 	mtx_lock(&vnode_list_mtx);
 	desiredvnodes = newmax;
 	wantfreevnodes = desiredvnodes / 4;
 	vnlru_recalc();
 	mtx_unlock(&vnode_list_mtx);
 
 	/*
 	 * XXX Concurrent writers of desiredvnodes are not serialized against
 	 * each other.  The lock taken above only helps vnlru and
 	 * getnewvnode.
 	 */
 	vfs_hash_changesize(desiredvnodes);
 	cache_changesize(desiredvnodes);
 	return (0);
 }
 
 SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
     CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes,
     "LU", "Target for maximum number of vnodes");
 
 /*
  * Handler for vfs.wantfreevnodes: runs the generic long handler on a copy
  * of wantfreevnodes and, when a different value was written, stores it
  * and calls vnlru_recalc() under vnode_list_mtx.
  */
 static int
 sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS)
 {
 	u_long newwant;
 	int rc;
 
 	newwant = wantfreevnodes;
 	rc = sysctl_handle_long(oidp, &newwant, 0, req);
 	if (rc != 0)
 		return (rc);
 
 	/* Nothing to do for a pure read or an unchanged value. */
 	if (req->newptr == NULL || newwant == wantfreevnodes)
 		return (0);
 
 	mtx_lock(&vnode_list_mtx);
 	wantfreevnodes = newwant;
 	vnlru_recalc();
 	mtx_unlock(&vnode_list_mtx);
 	return (0);
 }
 
 SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes,
     CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes,
     "LU", "Target for minimum number of \"free\" vnodes");
 
 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
     &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)");
 static int vnlru_nowhere;
 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW | CTLFLAG_STATS,
     &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
 
 /*
  * Handler for debug.try_reclaim_vnode: the written value is a pathname;
  * the vnode it resolves to is passed to vgone().
  *
  * Fails with EINVAL when no new value is supplied, with E2BIG when the
  * value is PATH_MAX bytes or longer, with the error from SYSCTL_IN() or
  * namei(), or with EAGAIN when the vnode found is already doomed.
  */
 static int
 sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS)
 {
 	struct vnode *vp;
 	struct nameidata nd;
 	char *buf;
 	unsigned long ndflags;
 	int error;
 
 	if (req->newptr == NULL)
 		return (EINVAL);
 	if (req->newlen >= PATH_MAX)
 		return (E2BIG);
 
 	buf = malloc(PATH_MAX, M_TEMP, M_WAITOK);
 	error = SYSCTL_IN(req, buf, req->newlen);
 	if (error != 0)
 		goto out;
 
 	/* newlen < PATH_MAX was checked above, so this index is in bounds. */
 	buf[req->newlen] = '\0';
 
 	ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1;
 	NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf);
 	if ((error = namei(&nd)) != 0)
 		goto out;
 	vp = nd.ni_vp;
 
 	if (VN_IS_DOOMED(vp)) {
 		/*
 		 * This vnode is being recycled.  Return != 0 to let the caller
 		 * know that the sysctl had no effect.  Return EAGAIN because a
 		 * subsequent call will likely succeed (since namei will create
 		 * a new vnode if necessary)
 		 */
 		error = EAGAIN;
 		goto putvnode;
 	}
 
 	counter_u64_add(recycles_count, 1);
 	vgone(vp);
 putvnode:
 	/* Reached on both the reclaim and the EAGAIN path. */
 	vput(vp);
 	NDFREE_PNBUF(&nd);
 out:
 	free(buf, M_TEMP);
 	return (error);
 }
 
 static int
 sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS)
 {
 	struct thread *td = curthread;
 	struct vnode *vp;
 	struct file *fp;
 	int error;
 	int fd;
 
 	if (req->newptr == NULL)
 		return (EBADF);
 
         error = sysctl_handle_int(oidp, &fd, 0, req);
         if (error != 0)
                 return (error);
 	error = getvnode(curthread, fd, &cap_fcntl_rights, &fp);
 	if (error != 0)
 		return (error);
 	vp = fp->f_vnode;
 
 	error = vn_lock(vp, LK_EXCLUSIVE);
 	if (error != 0)
 		goto drop;
 
 	counter_u64_add(recycles_count, 1);
 	vgone(vp);
 	VOP_UNLOCK(vp);
 drop:
 	fdrop(fp, td);
 	return (error);
 }
 
 SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode,
     CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
     sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname");
 SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode,
     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
     sysctl_ftry_reclaim_vnode, "I",
     "Try to reclaim a vnode by its file descriptor");
 
 /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
 #define vnsz2log 8
 #ifndef DEBUG_LOCKS
 _Static_assert(sizeof(struct vnode) >= 1UL << vnsz2log &&
     sizeof(struct vnode) < 1UL << (vnsz2log + 1),
     "vnsz2log needs to be updated");
 #endif
 
 /*
  * Support for the bufobj clean & dirty pctrie.
  */
 /*
  * Node allocator for the BUF pctrie: takes a node from buf_trie_zone via
  * uma_zalloc_smr() with M_NOWAIT.  The ptree argument is not used.
  */
 static void *
 buf_trie_alloc(struct pctrie *ptree)
 {
 	return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT));
 }
 
 /*
  * Node release routine for the BUF pctrie: returns the node to
  * buf_trie_zone via uma_zfree_smr().  The ptree argument is not used.
  */
 static void
 buf_trie_free(struct pctrie *ptree, void *node)
 {
 	uma_zfree_smr(buf_trie_zone, node);
 }
 /* Instantiate the BUF pctrie, keyed on b_lblkno, with the hooks above. */
 PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free,
     buf_trie_smr);
 
 /*
  * Initialize the vnode management data structures.
  *
  * Reevaluate the following cap on the number of vnodes after the physical
  * memory size exceeds 512GB.  In the limit, as the physical memory size
  * grows, the ratio of the memory size in KB to vnodes approaches 64:1.
  */
 #ifndef	MAXVNODES_MAX
 #define	MAXVNODES_MAX	(512UL * 1024 * 1024 / 64)	/* 8M */
 #endif
 
 static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
 
 /*
  * Allocate a zeroed marker vnode (v_type VMARKER) from M_VNODE_MARKER and
  * associate it with mount point mp.
  */
 static struct vnode *
 vn_alloc_marker(struct mount *mp)
 {
 	struct vnode *marker;
 
 	marker = malloc(sizeof(*marker), M_VNODE_MARKER, M_WAITOK | M_ZERO);
 	marker->v_mount = mp;
 	marker->v_type = VMARKER;
 
 	return (marker);
 }
 
 /*
  * Release a marker vnode back to M_VNODE_MARKER.  Asserts (MPASS) that vp
  * really is a VMARKER vnode.
  */
 static void
 vn_free_marker(struct vnode *vp)
 {
 
 	MPASS(vp->v_type == VMARKER);
 	free(vp, M_VNODE_MARKER);
 }
 
 #ifdef KASAN
 /*
  * Vnode constructor, compiled only under KASAN.  Calls kasan_mark() over
  * the object with code 0, passing size rounded up to a multiple of
  * UMA_ALIGN_PTR + 1 as the third argument, and always returns 0.
  * NOTE(review): presumably this makes the whole vnode addressable again
  * after vnode_dtor() poisoned most of it -- confirm against kasan_mark().
  */
 static int
 vnode_ctor(void *mem, int size, void *arg __unused, int flags __unused)
 {
 	kasan_mark(mem, size, roundup2(size, UMA_ALIGN_PTR + 1), 0);
 	return (0);
 }
 
 /*
  * Vnode destructor, compiled only under KASAN.  Marks the freed vnode
  * with KASAN_UMA_FREED in three pieces, leaving the v_vnodelist and
  * v_dbatchcpu fields (widened to KASAN_SHADOW_SCALE boundaries)
  * unmarked.
  *
  * mem is the vnode, size its size in bytes; arg is unused.
  */
 static void
 vnode_dtor(void *mem, int size, void *arg __unused)
 {
 	size_t end1, end2, off1, off2;
 
 	/* The three-piece split below relies on this field order. */
 	_Static_assert(offsetof(struct vnode, v_vnodelist) <
 	    offsetof(struct vnode, v_dbatchcpu),
 	    "KASAN marks require updating");
 
 	off1 = offsetof(struct vnode, v_vnodelist);
 	off2 = offsetof(struct vnode, v_dbatchcpu);
 	end1 = off1 + sizeof(((struct vnode *)NULL)->v_vnodelist);
 	end2 = off2 + sizeof(((struct vnode *)NULL)->v_dbatchcpu);
 
 	/*
 	 * Access to the v_vnodelist and v_dbatchcpu fields are permitted even
 	 * after the vnode has been freed.  Try to get some KASAN coverage by
 	 * marking everything except those two fields as invalid.  Because
 	 * KASAN's tracking is not byte-granular, any preceding fields sharing
 	 * the same 8-byte aligned word must also be marked valid.
 	 */
 
 	/* Handle the area from the start until v_vnodelist... */
 	off1 = rounddown2(off1, KASAN_SHADOW_SCALE);
 	kasan_mark(mem, off1, off1, KASAN_UMA_FREED);
 
 	/* ... then the area between v_vnodelist and v_dbatchcpu ... */
 	off1 = roundup2(end1, KASAN_SHADOW_SCALE);
 	off2 = rounddown2(off2, KASAN_SHADOW_SCALE);
 	/* Skip the middle piece when rounding leaves no gap between them. */
 	if (off2 > off1)
 		kasan_mark((void *)((char *)mem + off1), off2 - off1,
 		    off2 - off1, KASAN_UMA_FREED);
 
 	/* ... and finally the area from v_dbatchcpu to the end. */
 	off2 = roundup2(end2, KASAN_SHADOW_SCALE);
 	kasan_mark((void *)((char *)mem + off2), size - off2, size - off2,
 	    KASAN_UMA_FREED);
 }
 #endif /* KASAN */
 
 /*
  * Initialize a vnode as it first enters the zone.
  *
  * Zeroes the object, sets up its interlock, vnode lock, bufobj, namecache
  * state and rangelock, puts it in the VSTATE_DEAD / VNON state with
  * v_holdcnt set to VHOLD_NO_SMR, and links it into vnode_list in front of
  * vnode_list_free_marker.  The flags argument is not used.  Always
  * returns 0.
  */
 static int
 vnode_init(void *mem, int size, int flags)
 {
 	struct vnode *vp;
 
 	vp = mem;
 	bzero(vp, size);
 	/*
 	 * Setup locks.
 	 */
 	vp->v_vnlock = &vp->v_lock;
 	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
 	/*
 	 * By default, don't allow shared locks unless filesystems opt-in.
 	 */
 	lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT,
 	    LK_NOSHARE | LK_IS_VNODE);
 	/*
 	 * Initialize bufobj.
 	 */
 	bufobj_init(&vp->v_bufobj, vp);
 	/*
 	 * Initialize namecache.
 	 */
 	cache_vnode_init(vp);
 	/*
 	 * Initialize rangelocks.
 	 */
 	rangelock_init(&vp->v_rl);
 
 	vp->v_dbatchcpu = NOCPU;
 
 	vp->v_state = VSTATE_DEAD;
 
 	/*
 	 * Check vhold_recycle_free for an explanation.
 	 */
 	vp->v_holdcnt = VHOLD_NO_SMR;
 	vp->v_type = VNON;
 	/* Insert into the global list under its mutex. */
 	mtx_lock(&vnode_list_mtx);
 	TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist);
 	mtx_unlock(&vnode_list_mtx);
 	return (0);
 }
 
 /*
  * Free a vnode when it is cleared from the zone.
  *
  * Calls vdbatch_dequeue() on the vnode, unlinks it from vnode_list under
  * vnode_list_mtx, and destroys the rangelock, vnode lock, interlock and
  * bufobj lock that vnode_init() set up.  Finishes by calling
  * kasan_mark() over the whole object with code 0.
  */
 static void
 vnode_fini(void *mem, int size)
 {
 	struct vnode *vp;
 	struct bufobj *bo;
 
 	vp = mem;
 	vdbatch_dequeue(vp);
 	mtx_lock(&vnode_list_mtx);
 	TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
 	mtx_unlock(&vnode_list_mtx);
 	rangelock_destroy(&vp->v_rl);
 	lockdestroy(vp->v_vnlock);
 	mtx_destroy(&vp->v_interlock);
 	bo = &vp->v_bufobj;
 	rw_destroy(BO_LOCKPTR(bo));
 
 	/* NOTE(review): presumably undoes the vnode_dtor() marks -- confirm. */
 	kasan_mark(mem, size, size, 0);
 }
 
 /*
  * Provide the size of NFS nclnode and NFS fh for calculation of the
  * vnode memory consumption.  The size is specified directly to
  * eliminate dependency on NFS-private header.
  *
  * Other filesystems may use bigger or smaller (like UFS and ZFS)
  * private inode data, but the NFS-based estimation is ample enough.
  * Still, we care about differences in the size between 64- and 32-bit
  * platforms.
  *
  * Namecache structure size is heuristically
  * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1.
  */
 #ifdef _LP64
 /* Size estimates used when _LP64 is defined (64-bit platforms). */
 #define	NFS_NCLNODE_SZ	(528 + 64)
 #define	NC_SZ		148
 #else
 /* Size estimates used otherwise (32-bit platforms). */
 #define	NFS_NCLNODE_SZ	(360 + 32)
 #define	NC_SZ		92
 #endif
 
 /*
  * One-time VFS bootstrap, run through the SYSINIT that follows the
  * function.  In order it:
  *  - computes desiredvnodes (capped at MAXVNODES_MAX) and wantfreevnodes;
  *  - initializes the mntid and vnode-list mutexes and the vnode list,
  *    recomputes the vnlru watermarks and inserts the two list markers;
  *  - creates the "VNODE" and "BUF TRIE" UMA zones;
  *  - allocates the statistics counters;
  *  - sets up the syncer hash, mutex and condition variable;
  *  - zeroes and lock-initializes each CPU's vdbatch.
  */
 static void
 vntblinit(void *dummy __unused)
 {
 	struct vdbatch *vd;
 	uma_ctor ctor;
 	uma_dtor dtor;
 	int cpu, physvnodes, virtvnodes;
 
 	/*
 	 * Desiredvnodes is a function of the physical memory size and the
 	 * kernel's heap size.  Generally speaking, it scales with the
 	 * physical memory size.  The ratio of desiredvnodes to the physical
 	 * memory size is 1:16 until desiredvnodes exceeds 98,304.
 	 * Thereafter, the
 	 * marginal ratio of desiredvnodes to the physical memory size is
 	 * 1:64.  However, desiredvnodes is limited by the kernel's heap
 	 * size.  The memory required by desiredvnodes vnodes and vm objects
 	 * must not exceed 1/10th of the kernel's heap size.
 	 */
 	physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 +
 	    3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64;
 	virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) +
 	    sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ));
 	desiredvnodes = min(physvnodes, virtvnodes);
 	if (desiredvnodes > MAXVNODES_MAX) {
 		if (bootverbose)
 			printf("Reducing kern.maxvnodes %lu -> %lu\n",
 			    desiredvnodes, MAXVNODES_MAX);
 		desiredvnodes = MAXVNODES_MAX;
 	}
 	/* The free-vnode target is a quarter of desiredvnodes. */
 	wantfreevnodes = desiredvnodes / 4;
 	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
 	TAILQ_INIT(&vnode_list);
 	mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF);
 	/*
 	 * The lock is taken to appease WITNESS.
 	 */
 	mtx_lock(&vnode_list_mtx);
 	vnlru_recalc();
 	mtx_unlock(&vnode_list_mtx);
 	/*
 	 * Both markers go in at the head; the reclaim marker is inserted
 	 * last and therefore ends up in front of the free marker.
 	 */
 	vnode_list_free_marker = vn_alloc_marker(NULL);
 	TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist);
 	vnode_list_reclaim_marker = vn_alloc_marker(NULL);
 	TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist);
 
 	/* The zone gets ctor/dtor hooks only in KASAN kernels. */
 #ifdef KASAN
 	ctor = vnode_ctor;
 	dtor = vnode_dtor;
 #else
 	ctor = NULL;
 	dtor = NULL;
 #endif
 	vnode_zone = uma_zcreate("VNODE", sizeof(struct vnode), ctor, dtor,
 	    vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_NOKASAN);
 	uma_zone_set_smr(vnode_zone, vfs_smr);
 
 	/*
 	 * Preallocate enough nodes to support one-per buf so that
 	 * we can not fail an insert.  reassignbuf() callers can not
 	 * tolerate the insertion failure.
 	 */
 	buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
 	    NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, 
 	    UMA_ZONE_NOFREE | UMA_ZONE_SMR);
 	buf_trie_smr = uma_zone_get_smr(buf_trie_zone);
 	uma_prealloc(buf_trie_zone, nbuf);
 
 	/* Statistics counters; M_WAITOK, so these allocations may sleep. */
 	vnodes_created = counter_u64_alloc(M_WAITOK);
 	recycles_count = counter_u64_alloc(M_WAITOK);
 	recycles_free_count = counter_u64_alloc(M_WAITOK);
 	deferred_inact = counter_u64_alloc(M_WAITOK);
 
 	/*
 	 * Initialize the filesystem syncer.
 	 *
 	 * hashinit() is given syncer_maxdelay as the requested size and
 	 * stores the table mask in syncer_mask; syncer_maxdelay is then
 	 * reset to mask + 1.
 	 */
 	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
 	    &syncer_mask);
 	syncer_maxdelay = syncer_mask + 1;
 	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
 	cv_init(&sync_wakeup, "syncer");
 
 	/* Per-CPU vdbatch state: zero it and initialize its mutex. */
 	CPU_FOREACH(cpu) {
 		vd = DPCPU_ID_PTR((cpu), vd);
 		bzero(vd, sizeof(*vd));
 		mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF);
 	}
 }
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
 
 /*
  * Mark a mount point as busy. Used to synchronize access and to delay
  * unmounting. Eventually, mountlist_mtx is not released on failure.
  *
  * vfs_busy() is a custom lock, it can block the caller.
  * vfs_busy() only sleeps if the unmount is active on the mount point.
  * For a mountpoint mp, vfs_busy-enforced lock is before lock of any
  * vnode belonging to mp.
  *
  * Lookup uses vfs_busy() to traverse mount points.
  * root fs			var fs
  * / vnode lock		A	/ vnode lock (/var)		D
  * /var vnode lock	B	/log vnode lock(/var/log)	E
  * vfs_busy lock	C	vfs_busy lock			F
  *
  * Within each file system, the lock order is C->A->B and F->D->E.
  *
  * When traversing across mounts, the system follows that lock order:
  *
  *        C->A->B
  *              |
  *              +->F->D->E
  *
  * The lookup() process for namei("/var") illustrates the process:
  *  1. VOP_LOOKUP() obtains B while A is held
  *  2. vfs_busy() obtains a shared lock on F while A and B are held
  *  3. vput() releases lock on B
  *  4. vput() releases lock on A
  *  5. VFS_ROOT() obtains lock on D while shared lock on F is held
  *  6. vfs_unbusy() releases shared lock on F
  *  7. vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
  *     Attempt to lock A (instead of vp_crossmp) while D is held would
  *     violate the global order, causing deadlocks.
  *
  * dounmount() locks B while F is drained.  Note that for stacked
  * filesystems, D and B in the example above may be the same lock,
  * which introduces a potential lock order reversal deadlock between
  * dounmount() and step 5 above.  These filesystems may avoid the LOR
  * by setting VV_CROSSLOCK on the covered vnode so that lock B will
  * remain held until after step 5.
  */
 int
 vfs_busy(struct mount *mp, int flags)
 {
 	struct mount_pcpu *mpcpu;
 
 	/*
 	 * Contract, as implemented below:
 	 *  - flags may only contain MBF_MASK bits.
 	 *  - Returns 0 with one reference and one lockref taken on mp; if
 	 *    MBF_MNTLSTLOCK was passed, mountlist_mtx has been dropped.
 	 *  - Returns ENOENT, with the reference released again, when an
 	 *    unmount is pending and either MBF_NOWAIT was passed or
 	 *    MNTK_REFEXPIRE is set.  mountlist_mtx is not dropped on this
 	 *    path.
 	 */
 	MPASS((flags & ~MBF_MASK) == 0);
 	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
 
 	/*
 	 * Fast path: when vfs_op_thread_enter() succeeds, both counts are
 	 * bumped in the per-CPU structure without taking the mount
 	 * interlock.
 	 */
 	if (vfs_op_thread_enter(mp, mpcpu)) {
 		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
 		MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0);
 		MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0);
 		vfs_mp_count_add_pcpu(mpcpu, ref, 1);
 		vfs_mp_count_add_pcpu(mpcpu, lockref, 1);
 		vfs_op_thread_exit(mp, mpcpu);
 		if (flags & MBF_MNTLSTLOCK)
 			mtx_unlock(&mountlist_mtx);
 		return (0);
 	}
 
 	/* Slow path: everything below runs under the mount interlock. */
 	MNT_ILOCK(mp);
 	vfs_assert_mount_counters(mp);
 	MNT_REF(mp);
 	/*
 	 * If mount point is currently being unmounted, sleep until the
 	 * mount point fate is decided.  If thread doing the unmounting fails,
 	 * it will clear MNTK_UNMOUNT flag before waking us up, indicating
 	 * that this mount point has survived the unmount attempt and vfs_busy
 	 * should retry.  Otherwise the unmounter thread will set MNTK_REFEXPIRE
 	 * flag in addition to MNTK_UNMOUNT, indicating that mount point is
 	 * about to be really destroyed.  vfs_busy needs to release its
 	 * reference on the mount point in this case and return with ENOENT,
 	 * telling the caller the mount it tried to busy is no longer valid.
 	 */
 	while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
 		KASSERT(TAILQ_EMPTY(&mp->mnt_uppers),
 		    ("%s: non-empty upper mount list with pending unmount",
 		    __func__));
 		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
 			/* Undo the MNT_REF above; mountlist_mtx stays held. */
 			MNT_REL(mp);
 			MNT_IUNLOCK(mp);
 			CTR1(KTR_VFS, "%s: failed busying before sleeping",
 			    __func__);
 			return (ENOENT);
 		}
 		/* Do not sleep while holding mountlist_mtx. */
 		if (flags & MBF_MNTLSTLOCK)
 			mtx_unlock(&mountlist_mtx);
 		mp->mnt_kern_flag |= MNTK_MWAIT;
 		/*
 		 * PDROP: msleep() returns with the interlock released, so
 		 * both locks are retaken here, mountlist_mtx first.
 		 */
 		msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
 		if (flags & MBF_MNTLSTLOCK)
 			mtx_lock(&mountlist_mtx);
 		MNT_ILOCK(mp);
 	}
 	if (flags & MBF_MNTLSTLOCK)
 		mtx_unlock(&mountlist_mtx);
 	mp->mnt_lockref++;
 	MNT_IUNLOCK(mp);
 	return (0);
 }
 
 /*
  * Free a busy filesystem.
  */
 void
 vfs_unbusy(struct mount *mp)
 {
 	struct mount_pcpu *mpcpu;
 	int c;
 
 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
 
 	if (vfs_op_thread_enter(mp, mpcpu)) {
 		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
 		vfs_mp_count_sub_pcpu(mpcpu, lockref, 1);
 		vfs_mp_count_sub_pcpu(mpcpu, ref, 1);
 		vfs_op_thread_exit(mp, mpcpu);
 		return;
 	}
 
 	MNT_ILOCK(mp);
 	vfs_assert_mount_counters(mp);
 	MNT_REL(mp);
 	c = --mp->mnt_lockref;
 	if (mp->mnt_vfs_ops == 0) {
 		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
 		MNT_IUNLOCK(mp);
 		return;
 	}
 	if (c < 0)
 		vfs_dump_mount_counters(mp);
 	if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
 		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
 		CTR1(KTR_VFS, "%s: waking up waiters", __func__);
 		mp->mnt_kern_flag &= ~MNTK_DRAINING;
 		wakeup(&mp->mnt_lockref);
 	}
 	MNT_IUNLOCK(mp);
 }
 
 /*
  * Find the mounted filesystem whose fsid equals *fsid.
  *
  * Returns the mount with a reference taken through vfs_ref(), or a null
  * pointer when nothing on the mount list matches.
  */
 struct mount *
 vfs_getvfs(fsid_t *fsid)
 {
 	struct mount *mp;
 
 	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
 
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0)
 			continue;
 		/* Take the reference before the list lock is dropped. */
 		vfs_ref(mp);
 		break;
 	}
 	mtx_unlock(&mountlist_mtx);
 
 	/* The walk leaves mp null when it ran off the end of the list. */
 	if (mp == NULL) {
 		CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
 	}
 	return (mp);
 }
 
 /*
  * Lookup a mount point by filesystem identifier, busying it before
  * returning.
  *
  * To avoid congestion on mountlist_mtx, implement simple direct-mapped
  * cache for popular filesystem identifiers.  The cache is lockless, using
  * the fact that struct mount's are never freed.  In worst case we may
  * get pointer to unmounted or even different filesystem, so we have to
  * check what we got, and go slow way if so.
  */
 struct mount *
 vfs_busyfs(fsid_t *fsid)
 {
 #define	FSID_CACHE_SIZE	256
 	typedef struct mount * volatile vmp_t;
 	static vmp_t cache[FSID_CACHE_SIZE];
 	struct mount *mp;
 	uint32_t hash;
 
 	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
 
 	/* Fold both fsid words down to a cache slot index. */
 	hash = fsid->val[0] ^ fsid->val[1];
 	hash = ((hash >> 16) ^ hash) & (FSID_CACHE_SIZE - 1);
 
 	/*
 	 * Cached candidate: usable only if its fsid matches both before
 	 * and after it has been busied.  A slot whose mount cannot be
 	 * busied is cleared.
 	 */
 	mp = cache[hash];
 	if (mp != NULL && fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) {
 		if (vfs_busy(mp, 0) == 0) {
 			if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0)
 				return (mp);
 			vfs_unbusy(mp);
 		} else {
 			cache[hash] = NULL;
 		}
 	}
 
 	/* Slow way: walk the mount list under mountlist_mtx. */
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0)
 			continue;
 		/*
 		 * With MBF_MNTLSTLOCK vfs_busy() drops mountlist_mtx on
 		 * success only, so it is released here on failure.
 		 */
 		if (vfs_busy(mp, MBF_MNTLSTLOCK) != 0) {
 			cache[hash] = NULL;
 			mtx_unlock(&mountlist_mtx);
 			return (NULL);
 		}
 		cache[hash] = mp;
 		return (mp);
 	}
 	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
 	mtx_unlock(&mountlist_mtx);
 	return (NULL);
 }
 
 /*
  * Decide whether the thread may use privileged mount options on mp.
  *
  * Returns 0 when access is allowed, EPERM when a jail restriction denies
  * it, or the error from priv_check() when the caller is not the mounting
  * user and lacks PRIV_VFS_MOUNT_OWNER.
  */
 int
 vfs_suser(struct mount *mp, struct thread *td)
 {
 
 	/*
 	 * A jailed caller is denied immediately if its jail lacks
 	 * permission for this type of file system, or if the file system
 	 * was mounted outside the caller's jail.
 	 */
 	if (jailed(td->td_ucred) &&
 	    (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag) ||
 	    prison_check(td->td_ucred, mp->mnt_cred) != 0))
 		return (EPERM);
 
 	/*
 	 * File systems supporting delegated administration verify the
 	 * caller themselves, so PRIV_VFS_MOUNT_OWNER is not checked here.
 	 */
 	if ((mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) != 0)
 		return (0);
 
 	/* The user who did the original mount needs no privilege. */
 	if (mp->mnt_cred->cr_uid == td->td_ucred->cr_uid)
 		return (0);
 
 	return (priv_check(td, PRIV_VFS_MOUNT_OWNER));
 }
 
 /*
  * Get a new unique fsid.  Try to make its val[0] unique, since this value
  * will be used to create fake device numbers for stat().  Also try (but
  * not so hard) make its val[0] unique mod 2^16, since some emulators only
  * support 16-bit device numbers.  We end up with unique val[0]'s for the
  * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
  *
  * Keep in mind that several mounts may be running in parallel.  Starting
  * the search one past where the previous search terminated is both a
  * micro-optimization and a defense against returning the same fsid to
  * different mounts.
  */
 void
 vfs_getnewfsid(struct mount *mp)
 {
 	static uint16_t mntid_base;
 	struct mount *nmp;
 	fsid_t tfsid;
 	int mtype;
 
 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
 	mtx_lock(&mntid_mtx);
 
 	/* val[1] is the type number; its low byte also seeds val[0]. */
 	mtype = mp->mnt_vfc->vfc_typenum;
 	tfsid.val[1] = mtype;
 	mtype = (mtype & 0xFF) << 24;
 
 	/*
 	 * Advance mntid_base until vfs_getvfs() finds no mount that
 	 * already uses the candidate.  A mount that is found comes back
 	 * referenced, so the reference is released before the next try.
 	 */
 	do {
 		tfsid.val[0] = makedev(255,
 		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
 		mntid_base++;
 		nmp = vfs_getvfs(&tfsid);
 		if (nmp != NULL)
 			vfs_rel(nmp);
 	} while (nmp != NULL);
 
 	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
 	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
 	mtx_unlock(&mntid_mtx);
 }
 
 /*
  * Knob to control the precision of file timestamps:
  *
  *   0 = seconds only; nanoseconds zeroed.
  *   1 = seconds and nanoseconds, accurate within 1/HZ.
  *   2 = seconds and nanoseconds, truncated to microseconds.
  * >=3 = seconds and nanoseconds, maximum precision.
  */
 /* Enumerators take the values 0..3 in the order of the knob settings. */
 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
 
 /* Current setting, read by vfs_timestamp(); defaults to TSP_USEC. */
 static int timestamp_precision = TSP_USEC;
 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
     &timestamp_precision, 0, "File timestamp precision (0: seconds, "
     "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
     "3+: sec + ns (max. precision))");
 
 /*
  * Fill *tsp with the current time at the precision selected by the
  * timestamp_precision knob.
  */
 void
 vfs_timestamp(struct timespec *tsp)
 {
 	struct timeval tv;
 	int precision;
 
 	/* The knob is read once and the decision is made on that copy. */
 	precision = timestamp_precision;
 
 	if (precision == TSP_SEC) {
 		tsp->tv_sec = time_second;
 		tsp->tv_nsec = 0;
 	} else if (precision == TSP_HZ) {
 		getnanotime(tsp);
 	} else if (precision == TSP_USEC) {
 		microtime(&tv);
 		TIMEVAL_TO_TIMESPEC(&tv, tsp);
 	} else {
 		/* TSP_NSEC and every other setting. */
 		nanotime(tsp);
 	}
 }
 
 /*
  * Reset a vattr: the type becomes VNON, va_vaflags becomes 0 and every
  * other attribute is set to VNOVAL.
  */
 void
 vattr_null(struct vattr *vap)
 {
 
 	/* The two fields that do not get VNOVAL. */
 	vap->va_type = VNON;
 	vap->va_vaflags = 0;
 
 	/* Timestamps. */
 	vap->va_atime.tv_sec = VNOVAL;
 	vap->va_atime.tv_nsec = VNOVAL;
 	vap->va_mtime.tv_sec = VNOVAL;
 	vap->va_mtime.tv_nsec = VNOVAL;
 	vap->va_ctime.tv_sec = VNOVAL;
 	vap->va_ctime.tv_nsec = VNOVAL;
 	vap->va_birthtime.tv_sec = VNOVAL;
 	vap->va_birthtime.tv_nsec = VNOVAL;
 
 	/* Ownership and mode. */
 	vap->va_mode = VNOVAL;
 	vap->va_uid = VNOVAL;
 	vap->va_gid = VNOVAL;
 	vap->va_flags = VNOVAL;
 
 	/* Identity. */
 	vap->va_fsid = VNOVAL;
 	vap->va_fileid = VNOVAL;
 	vap->va_nlink = VNOVAL;
 	vap->va_rdev = VNOVAL;
 	vap->va_gen = VNOVAL;
 
 	/* Sizes. */
 	vap->va_size = VNOVAL;
 	vap->va_bytes = VNOVAL;
 	vap->va_blocksize = VNOVAL;
 }
 
 /*
  * Try to reduce the total number of vnodes.
  *
  * This routine (and its user) are buggy in at least the following ways:
  * - all parameters were picked years ago when RAM sizes were significantly
  *   smaller
  * - it can pick vnodes based on pages used by the vm object, but filesystems
  *   like ZFS don't use it making the pick broken
  * - since ZFS has its own aging policy it gets partially combated by this one
  * - a dedicated method should be provided for filesystems to let them decide
  *   whether the vnode should be recycled
  *
  * This routine is called when we have too many vnodes.  It attempts
  * to free <count> vnodes and will potentially free vnodes that still
  * have VM backing store (VM backing store is typically the cause
  * of a vnode blowout so we want to do this).  Therefore, this operation
  * is not considered cheap.
  *
  * A number of conditions may prevent a vnode from being reclaimed.
  * the buffer cache may have references on the vnode, a directory
  * vnode may still have references due to the namei cache representing
  * underlying files, or the vnode may be in active use.   It is not
  * desirable to reuse such vnodes.  These conditions may cause the
  * number of vnodes to reach some minimum value regardless of what
  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
  *
  * @param reclaim_nc_src Only reclaim directories with outgoing namecache
  * 			 entries if this argument is true.
  * @param trigger	 Only reclaim vnodes with fewer than this many resident
  *			 pages.
  * @param target	 How many vnodes to reclaim.
  * @return		 The number of vnodes that were reclaimed.
  */
 static int
 vlrureclaim(bool reclaim_nc_src, int trigger, u_long target)
 {
 	struct vnode *vp, *mvp;
 	struct mount *mp;
 	struct vm_object *object;
 	u_long done;
 	bool retried;
 
 	/*
 	 * Entered and left with vnode_list_mtx held; the lock is dropped
 	 * and retaken around every reclaim attempt and every yield.
 	 */
 	mtx_assert(&vnode_list_mtx, MA_OWNED);
 
 	retried = false;
 	done = 0;
 
 	/* The scan always resumes right after the reclaim marker. */
 	mvp = vnode_list_reclaim_marker;
 restart:
 	vp = mvp;
 	while (done < target) {
 		vp = TAILQ_NEXT(vp, v_vnodelist);
 		if (__predict_false(vp == NULL))
 			break;
 
 		if (__predict_false(vp->v_type == VMARKER))
 			continue;
 
 		/*
 		 * If it's been deconstructed already, it's still
 		 * referenced, or it exceeds the trigger, skip it.
 		 * Also skip free vnodes.  We are trying to make space
 		 * to expand the free list, not reduce it.
 		 *
 		 * These first checks are made without the vnode interlock
 		 * and are repeated below once locks are held.
 		 */
 		if (vp->v_usecount > 0 || vp->v_holdcnt == 0 ||
 		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)))
 			goto next_iter;
 
 		if (vp->v_type == VBAD || vp->v_type == VNON)
 			goto next_iter;
 
 		object = atomic_load_ptr(&vp->v_object);
 		if (object == NULL || object->resident_page_count > trigger) {
 			goto next_iter;
 		}
 
 		/*
 		 * Handle races against vnode allocation. Filesystems lock the
 		 * vnode some time after it gets returned from getnewvnode,
 		 * despite type and hold count being manipulated earlier.
 		 * Resorting to checking v_mount restores guarantees present
 		 * before the global list was reworked to contain all vnodes.
 		 */
 		if (!VI_TRYLOCK(vp))
 			goto next_iter;
 		if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) {
 			VI_UNLOCK(vp);
 			goto next_iter;
 		}
 		if (vp->v_mount == NULL) {
 			VI_UNLOCK(vp);
 			goto next_iter;
 		}
 		/*
 		 * Hold the vnode, park the marker right after it and drop
 		 * the list lock for the reclaim attempt.
 		 */
 		vholdl(vp);
 		VI_UNLOCK(vp);
 		TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
 		TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
 		mtx_unlock(&vnode_list_mtx);
 
 		/* Each bail-out below undoes exactly what was acquired. */
 		if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 			vdrop_recycle(vp);
 			goto next_iter_unlocked;
 		}
 		if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) {
 			vdrop_recycle(vp);
 			vn_finished_write(mp);
 			goto next_iter_unlocked;
 		}
 
 		/* Re-check eligibility with the vnode lock and interlock held. */
 		VI_LOCK(vp);
 		if (vp->v_usecount > 0 ||
 		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
 		    (vp->v_object != NULL && vp->v_object->handle == vp &&
 		    vp->v_object->resident_page_count > trigger)) {
 			VOP_UNLOCK(vp);
 			vdropl_recycle(vp);
 			vn_finished_write(mp);
 			goto next_iter_unlocked;
 		}
 		counter_u64_add(recycles_count, 1);
 		vgonel(vp);
 		VOP_UNLOCK(vp);
 		vdropl_recycle(vp);
 		vn_finished_write(mp);
 		done++;
 next_iter_unlocked:
 		/* The list lock was dropped: retake it, resume from the marker. */
 		maybe_yield();
 		mtx_lock(&vnode_list_mtx);
 		goto restart;
 next_iter:
 		/*
 		 * vp was skipped with the list lock still held.  Keep
 		 * walking unless it is time to yield; in that case park the
 		 * marker after vp first so the scan can resume there.
 		 */
 		MPASS(vp->v_type != VMARKER);
 		if (!should_yield())
 			continue;
 		TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
 		TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
 		mtx_unlock(&vnode_list_mtx);
 		kern_yield(PRI_USER);
 		mtx_lock(&vnode_list_mtx);
 		goto restart;
 	}
 	/*
 	 * Nothing reclaimed on the first scan: move the marker to the head
 	 * of the list and rescan once.
 	 */
 	if (done == 0 && !retried) {
 		TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
 		TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist);
 		retried = true;
 		goto restart;
 	}
 	/* NOTE(review): done is u_long but the return type is int. */
 	return (done);
 }
 
 /*
  * Upper bound that vnlru_free_impl() applies to the count of a single
  * request; writable at run time as debug.max_vnlru_free.
  */
 static int max_vnlru_free = 10000; /* limit on vnode free requests per call */
 SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free,
     0,
     "limit on vnode free requests per call to the vnlru_free routine");
 
 /*
  * Attempt to reduce the free list by the requested amount.
  *
  * count   Number of vnodes to try to recycle; clamped to max_vnlru_free.
  * mnt_op  When non-NULL, vnodes whose mount uses a different vfsops are
  *         skipped (vnodes with a NULL v_mount are still considered).
  * mvp     Marker on the vnode list; the scan starts after it, and the
  *         marker is moved along as vnodes are picked.
  *
  * Returns the number of recycle attempts made.  That is not necessarily
  * the number of vnodes recycled; see the FIXME in the body.
  *
  * Called and returns with vnode_list_mtx held; the lock is dropped around
  * each vtryrecycle() call.
  */
 static int
 vnlru_free_impl(int count, struct vfsops *mnt_op, struct vnode *mvp)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	int ocount;
 
 	mtx_assert(&vnode_list_mtx, MA_OWNED);
 	if (count > max_vnlru_free)
 		count = max_vnlru_free;
 	ocount = count;
 	vp = mvp;
 	for (;;) {
 		if (count == 0) {
 			break;
 		}
 		vp = TAILQ_NEXT(vp, v_vnodelist);
 		if (__predict_false(vp == NULL)) {
 			/* End of list: leave the marker at the tail. */
 			TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
 			TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist);
 			break;
 		}
 		if (__predict_false(vp->v_type == VMARKER))
 			continue;
 		/* Only vnodes with a zero hold count are candidates. */
 		if (vp->v_holdcnt > 0)
 			continue;
 		/*
 		 * Don't recycle if our vnode is from different type
 		 * of mount point.  Note that mp is type-safe, the
 		 * check does not reach unmapped address even if
 		 * vnode is reclaimed.
 		 */
 		if (mnt_op != NULL && (mp = vp->v_mount) != NULL &&
 		    mp->mnt_op != mnt_op) {
 			continue;
 		}
 		if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) {
 			continue;
 		}
 		if (!vhold_recycle_free(vp))
 			continue;
 		/*
 		 * The vnode is now held.  Park the marker after it and drop
 		 * the list lock for the recycle attempt.
 		 */
 		TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
 		TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
 		mtx_unlock(&vnode_list_mtx);
 		/*
 		 * FIXME: ignores the return value, meaning it may be nothing
 		 * got recycled but it claims otherwise to the caller.
 		 *
 		 * Originally the value started being ignored in 2005 with
 		 * 114a1006a8204aa156e1f9ad6476cdff89cada7f .
 		 *
 		 * Respecting the value can run into significant stalls if most
 		 * vnodes belong to one file system and it has writes
 		 * suspended.  In presence of many threads and millions of
 		 * vnodes they keep contending on the vnode_list_mtx lock only
 		 * to find vnodes they can't recycle.
 		 *
 		 * The solution would be to pre-check if the vnode is likely to
 		 * be recycle-able, but it needs to happen with the
 		 * vnode_list_mtx lock held. This runs into a problem where
 		 * VOP_GETWRITEMOUNT (currently needed to find out about if
 		 * writes are frozen) can take locks which LOR against it.
 		 *
 		 * Check nullfs for one example (null_getwritemount).
 		 */
 		vtryrecycle(vp);
 		count--;
 		mtx_lock(&vnode_list_mtx);
 		/* Resume the walk from the marker. */
 		vp = mvp;
 	}
 	return (ocount - count);
 }
 
 /*
  * Free up to count vnodes from the global free list, starting at the
  * global free marker and without any filesystem-type filter.  The caller
  * must hold vnode_list_mtx.  Returns what vnlru_free_impl() reports.
  */
 static int
 vnlru_free_locked(int count)
 {
 	int freed;
 
 	mtx_assert(&vnode_list_mtx, MA_OWNED);
 	freed = vnlru_free_impl(count, NULL, vnode_list_free_marker);
 	return (freed);
 }
 
 /*
  * Free up to count vnodes that belong to mounts using mnt_op, scanning
  * from the caller-supplied marker mvp.  Takes and releases vnode_list_mtx
  * itself; the count reported by vnlru_free_impl() is discarded.
  */
 void
 vnlru_free_vfsops(int count, struct vfsops *mnt_op, struct vnode *mvp)
 {
 
 	/* Both arguments are mandatory and mvp must be a marker vnode. */
 	MPASS(mnt_op != NULL);
 	MPASS(mvp != NULL);
 	VNPASS(mvp->v_type == VMARKER, mvp);
 
 	mtx_lock(&vnode_list_mtx);
 	(void)vnlru_free_impl(count, mnt_op, mvp);
 	mtx_unlock(&vnode_list_mtx);
 }
 
 /*
  * Allocate a marker vnode and insert it into the global vnode list
  * directly before the global free marker.  Returns the new marker.
  */
 struct vnode *
 vnlru_alloc_marker(void)
 {
 	struct vnode *marker;
 
 	marker = vn_alloc_marker(NULL);
 
 	mtx_lock(&vnode_list_mtx);
 	TAILQ_INSERT_BEFORE(vnode_list_free_marker, marker, v_vnodelist);
 	mtx_unlock(&vnode_list_mtx);
 
 	return (marker);
 }
 
 /*
  * Unlink a marker vnode from the global vnode list, then release it
  * with vn_free_marker().
  */
 void
 vnlru_free_marker(struct vnode *mvp)
 {
 
 	/* Unlink under the list lock; free after it has been dropped. */
 	mtx_lock(&vnode_list_mtx);
 	TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
 	mtx_unlock(&vnode_list_mtx);
 
 	vn_free_marker(mvp);
 }
 
 /*
  * Recompute gapvnodes and the vhiwat/vlowat watermarks from desiredvnodes
  * and wantfreevnodes.  Must be called with vnode_list_mtx held.
  */
 static void
 vnlru_recalc(void)
 {
 
 	mtx_assert(&vnode_list_mtx, MA_OWNED);
 	/* The gap is never allowed below 100. */
 	gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
 	vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
 	/* The low watermark is half of the high one. */
 	vlowat = vhiwat / 2;
 }
 
 /*
  * Attempt to recycle vnodes in a context that is always safe to block.
  * Calling vlrureclaim() from the bowels of filesystem code has some
  * interesting deadlock problems.
  */
 /* The vnlru kernel process; also the channel it sleeps on. */
 static struct proc *vnlruproc;
 /*
  * Set to 1 by vnlru_kick() and cleared by vnlru_proc() before it sleeps.
  * Its address is the channel on which vn_alloc_hard() waits for space.
  */
 static int vnlruproc_sig;
 
 /*
  * The main freevnodes counter is only updated when threads requeue their vnode
  * batches. CPUs are conditionally walked to compute a more accurate total.
  *
  * Limit how much slop we are willing to tolerate.  Note: the actual value
  * at any given moment can still exceed the slop, but it should not be by a
  * significant margin in practice.
  */
 #define VNLRU_FREEVNODES_SLOP 128
 
 static __inline void
 vfs_freevnodes_inc(void)
 {
 	struct vdbatch *vd;
 
 	critical_enter();
 	vd = DPCPU_PTR(vd);
 	vd->freevnodes++;
 	critical_exit();
 }
 
 static __inline void
 vfs_freevnodes_dec(void)
 {
 	struct vdbatch *vd;
 
 	critical_enter();
 	vd = DPCPU_PTR(vd);
 	vd->freevnodes--;
 	critical_exit();
 }
 
 /*
  * Return an estimate of the number of free vnodes, never negative.
  *
  * While the global counter is within VNLRU_FREEVNODES_SLOP of the last
  * computed total, the global counter alone is used.  Otherwise the
  * per-CPU deltas are added to it, and the sum is both returned and
  * remembered in freevnodes_old.  Requires vnode_list_mtx.
  */
 static u_long
 vnlru_read_freevnodes(void)
 {
 	struct vdbatch *vd;
 	long slop;
 	int cpu;
 
 	mtx_assert(&vnode_list_mtx, MA_OWNED);
 
 	/* Distance between the global counter and the last full total. */
 	slop = freevnodes > freevnodes_old ?
 	    freevnodes - freevnodes_old : freevnodes_old - freevnodes;
 
 	if (slop >= VNLRU_FREEVNODES_SLOP) {
 		/* Too far apart: recompute by walking every CPU. */
 		freevnodes_old = freevnodes;
 		CPU_FOREACH(cpu) {
 			vd = DPCPU_ID_PTR((cpu), vd);
 			freevnodes_old += vd->freevnodes;
 		}
 		return (freevnodes_old >= 0 ? freevnodes_old : 0);
 	}
 
 	return (freevnodes >= 0 ? freevnodes : 0);
 }
 
 /*
  * Report whether the room left for new vnodes is below limit.
  *
  * Room is desiredvnodes - rnumvnodes, plus any free vnodes beyond
  * wantfreevnodes.  The free count is only consulted when the first term
  * alone is already below limit.  Being over desiredvnodes always counts
  * as "under".  Reads the free count through vnlru_read_freevnodes(),
  * which requires vnode_list_mtx.
  */
 static bool
 vnlru_under(u_long rnumvnodes, u_long limit)
 {
 	u_long rfreevnodes, space;
 
 	if (__predict_false(rnumvnodes > desiredvnodes))
 		return (true);
 
 	space = desiredvnodes - rnumvnodes;
 	if (space >= limit)
 		return (false);
 
 	/* Credit the surplus of free vnodes, if there is one. */
 	rfreevnodes = vnlru_read_freevnodes();
 	if (rfreevnodes > wantfreevnodes)
 		space += rfreevnodes - wantfreevnodes;
 
 	return (space < limit);
 }
 
 /*
  * Variant of vnlru_under() that does not go through
  * vnlru_read_freevnodes(): the free count is a single atomic load of the
  * global freevnodes counter, so the per-CPU deltas are not included.
  */
 static bool
 vnlru_under_unlocked(u_long rnumvnodes, u_long limit)
 {
 	long rfreevnodes, space;
 
 	if (__predict_false(rnumvnodes > desiredvnodes))
 		return (true);
 
 	space = desiredvnodes - rnumvnodes;
 	if (space >= limit)
 		return (false);
 
 	/* Credit the surplus of free vnodes, if there is one. */
 	rfreevnodes = atomic_load_long(&freevnodes);
 	if (rfreevnodes > wantfreevnodes)
 		space += rfreevnodes - wantfreevnodes;
 
 	return (space < limit);
 }
 
 /*
  * Wake the vnlru process unless a kick is already pending.  The caller
  * must hold vnode_list_mtx.
  */
 static void
 vnlru_kick(void)
 {
 
 	mtx_assert(&vnode_list_mtx, MA_OWNED);
 
 	/* A kick is already pending; nothing to do. */
 	if (vnlruproc_sig != 0)
 		return;
 
 	vnlruproc_sig = 1;
 	wakeup(vnlruproc);
 }
 
 /*
  * Main loop of the vnlru kernel process; never returns.
  *
  * Each pass first trims the free list if numvnodes exceeds desiredvnodes,
  * then either sleeps (no forced pass pending and vnlru_under() reports
  * enough room at vlowat) or calls vlrureclaim().  When a pass reclaims
  * nothing, "force" escalates:
  *   0 or 1 -> 2: use the computed page trigger instead of vsmalltrigger;
  *   2 -> 3:      also reclaim directories with outgoing namecache entries;
  *   3:           give up for now, bump vnlru_nowhere, sleep up to 3 seconds.
  */
 static void
 vnlru_proc(void)
 {
 	u_long rnumvnodes, rfreevnodes, target;
 	unsigned long onumvnodes;
 	int done, force, trigger, usevnodes;
 	bool reclaim_nc_src, want_reread;
 
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
 	    SHUTDOWN_PRI_FIRST);
 
 	force = 0;
 	want_reread = false;
 	for (;;) {
 		kproc_suspend_check(vnlruproc);
 		mtx_lock(&vnode_list_mtx);
 		rnumvnodes = atomic_load_long(&numvnodes);
 
 		/*
 		 * After a pass that ran vlrureclaim(), decide afresh whether
 		 * the next pass must be forced.
 		 *
 		 * NOTE(review): this reads numvnodes directly rather than
 		 * the rnumvnodes snapshot taken just above.
 		 */
 		if (want_reread) {
 			force = vnlru_under(numvnodes, vhiwat) ? 1 : 0;
 			want_reread = false;
 		}
 
 		/*
 		 * If numvnodes is too large (due to desiredvnodes being
 		 * adjusted using its sysctl, or emergency growth), first
 		 * try to reduce it by discarding from the free list.
 		 */
 		if (rnumvnodes > desiredvnodes) {
 			vnlru_free_locked(rnumvnodes - desiredvnodes);
 			rnumvnodes = atomic_load_long(&numvnodes);
 		}
 		/*
 		 * Sleep if the vnode cache is in a good state.  This is
 		 * when it is not over-full and has space for about a 4%
 		 * or 9% expansion (by growing its size or inexcessively
 		 * reducing its free list).  Otherwise, try to reclaim
 		 * space for a 10% expansion.
 		 */
 		/* A pending vstir request forces one pass and is consumed. */
 		if (vstir && force == 0) {
 			force = 1;
 			vstir = 0;
 		}
 		if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) {
 			/*
 			 * Clear the kick flag and wake threads sleeping on
 			 * it, then sleep for up to a second.  PDROP: the
 			 * list lock is not held when msleep() returns.
 			 */
 			vnlruproc_sig = 0;
 			wakeup(&vnlruproc_sig);
 			msleep(vnlruproc, &vnode_list_mtx,
 			    PVFS|PDROP, "vlruwt", hz);
 			continue;
 		}
 		rfreevnodes = vnlru_read_freevnodes();
 
 		onumvnodes = rnumvnodes;
 		/*
 		 * Calculate parameters for recycling.  These are the same
 		 * throughout the loop to give some semblance of fairness.
 		 * The trigger point is to avoid recycling vnodes with lots
 		 * of resident pages.  We aren't trying to free memory; we
 		 * are trying to recycle or at least free vnodes.
 		 */
 		if (rnumvnodes <= desiredvnodes)
 			usevnodes = rnumvnodes - rfreevnodes;
 		else
 			usevnodes = rnumvnodes;
 		/* Keep the divisor below strictly positive. */
 		if (usevnodes <= 0)
 			usevnodes = 1;
 		/*
 		 * The trigger value is chosen to give a conservatively
 		 * large value to ensure that it alone doesn't prevent
 		 * making progress.  The value can easily be so large that
 		 * it is effectively infinite in some congested and
 		 * misconfigured cases, and this is necessary.  Normally
 		 * it is about 8 to 100 (pages), which is quite large.
 		 */
 		trigger = vm_cnt.v_page_count * 2 / usevnodes;
 		/* Below force level 2 the small trigger is used instead. */
 		if (force < 2)
 			trigger = vsmalltrigger;
 		reclaim_nc_src = force >= 3;
 		/*
 		 * Target: one tenth of rnumvnodes scaled by
 		 * gapvnodes / desiredvnodes, plus one.
 		 */
 		target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1);
 		target = target / 10 + 1;
 		done = vlrureclaim(reclaim_nc_src, trigger, target);
 		mtx_unlock(&vnode_list_mtx);
 		/*
 		 * Drain UMA when this pass brought the count from above
 		 * desiredvnodes back to or below it.
 		 *
 		 * NOTE(review): numvnodes is read here without the list
 		 * lock and without an atomic load.
 		 */
 		if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
 			uma_reclaim(UMA_RECLAIM_DRAIN);
 		if (done == 0) {
 			/* Nothing reclaimed: escalate, or back off at 3. */
 			if (force == 0 || force == 1) {
 				force = 2;
 				continue;
 			}
 			if (force == 2) {
 				force = 3;
 				continue;
 			}
 			want_reread = true;
 			force = 0;
 			vnlru_nowhere++;
 			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
 		} else {
 			want_reread = true;
 			kern_yield(PRI_USER);
 		}
 	}
 }
 
 /*
  * Descriptor for the "vnlru" kernel process.  The SYSINIT below passes it
  * to kproc_start(); vnlru_proc is the entry point.
  * NOTE(review): &vnlruproc is presumably where the created process
  * pointer gets stored -- confirm against struct kproc_desc.
  */
 static struct kproc_desc vnlru_kp = {
 	"vnlru",
 	vnlru_proc,
 	&vnlruproc
 };
 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
     &vnlru_kp);
 
 /*
  * Routines having to do with the management of the vnode table.
  */
 
 /*
  * Try to recycle a freed vnode.  We abort if anyone picks up a reference
  * before we actually vgone().  This function must be called with the vnode
  * held to prevent the vnode from being returned to the free list midway
  * through vgone().
  *
  * The caller's hold is dropped on every path, including the failures.
  *
  * Returns 0 on success (also when the vnode was already doomed),
  * EWOULDBLOCK when the vnode lock could not be taken without waiting, and
  * EBUSY when a write could not be started or the vnode has gained a use
  * reference.
  */
 static int
 vtryrecycle(struct vnode *vp)
 {
 	struct mount *vnmp;
 
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	VNASSERT(vp->v_holdcnt, vp,
 	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
 	/*
 	 * This vnode may be found and locked via some other list, if so we
 	 * can't recycle it yet.
 	 */
 	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
 		CTR2(KTR_VFS,
 		    "%s: impossible to recycle, vp %p lock is already held",
 		    __func__, vp);
 		vdrop_recycle(vp);
 		return (EWOULDBLOCK);
 	}
 	/*
 	 * Don't recycle if its filesystem is being suspended.
 	 */
 	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
 		VOP_UNLOCK(vp);
 		CTR2(KTR_VFS,
 		    "%s: impossible to recycle, cannot start the write for %p",
 		    __func__, vp);
 		vdrop_recycle(vp);
 		return (EBUSY);
 	}
 	/*
 	 * If we got this far, we need to acquire the interlock and see if
 	 * anyone picked up this vnode from another list.  If not, we will
 	 * mark it with DOOMED via vgonel() so that anyone who does find it
 	 * will skip over it.
 	 */
 	VI_LOCK(vp);
 	if (vp->v_usecount) {
 		VOP_UNLOCK(vp);
 		vdropl_recycle(vp);
 		vn_finished_write(vnmp);
 		CTR2(KTR_VFS,
 		    "%s: impossible to recycle, %p is already referenced",
 		    __func__, vp);
 		return (EBUSY);
 	}
 	/* An already doomed vnode is neither counted nor vgone'd again. */
 	if (!VN_IS_DOOMED(vp)) {
 		counter_u64_add(recycles_free_count, 1);
 		vgonel(vp);
 	}
 	VOP_UNLOCK(vp);
 	vdropl_recycle(vp);
 	vn_finished_write(vnmp);
 	return (0);
 }
 
 /*
  * Allocate a new vnode.
  *
  * The operation never returns an error. Returning an error was disabled
  * in r145385 (dated 2005) with the following comment:
  *
  * XXX Not all VFS_VGET/ffs_vget callers check returns.
  *
  * Given the age of this commit (almost 15 years at the time of writing this
  * comment) restoring the ability to fail requires a significant audit of
  * all codepaths.
  *
  * The routine can try to free a vnode or stall for up to 1 second waiting for
  * vnlru to clear things up, but ultimately always performs a M_WAITOK allocation.
  */
 /*
  * Counts slow-path allocations made while the cache is at its limit.  It
  * is reset when there is room again or when it reaches the free count
  * (which also sets vstir).  While nonzero, vn_alloc() goes straight to
  * vn_alloc_hard().
  */
 static u_long vn_alloc_cyclecount;
 
 /*
  * Slow path of vn_alloc(), used when the vnode cache is at or near its
  * limit.  Takes vnode_list_mtx, tries to free one vnode from the free
  * list, and otherwise may sleep for up to a second waiting for vnlru.
  * It then accounts for the new vnode in numvnodes and allocates it with
  * M_WAITOK regardless of the outcome.
  *
  * mp may be NULL; the wait is skipped when mp has MNTK_SUSPEND set.
  */
 static struct vnode * __noinline
 vn_alloc_hard(struct mount *mp)
 {
 	u_long rnumvnodes, rfreevnodes;
 
 	mtx_lock(&vnode_list_mtx);
 	rnumvnodes = atomic_load_long(&numvnodes);
 	/* Still room below the target: plain allocation. */
 	if (rnumvnodes + 1 < desiredvnodes) {
 		vn_alloc_cyclecount = 0;
 		goto alloc;
 	}
 	rfreevnodes = vnlru_read_freevnodes();
 	/*
 	 * Once as many slow-path allocations have happened as there are
 	 * free vnodes, set vstir, which vnlru_proc() turns into a forced
 	 * pass.
 	 */
 	if (vn_alloc_cyclecount++ >= rfreevnodes) {
 		vn_alloc_cyclecount = 0;
 		vstir = 1;
 	}
 	/*
 	 * Grow the vnode cache if it will not be above its target max
 	 * after growing.  Otherwise, if the free list is nonempty, try
 	 * to reclaim 1 item from it before growing the cache (possibly
 	 * above its target max if the reclamation failed or is delayed).
 	 * Otherwise, wait for some space.  In all cases, schedule
 	 * vnlru_proc() if we are getting short of space.  The watermarks
 	 * should be chosen so that we never wait or even reclaim from
 	 * the free list to below its target minimum.
 	 */
 	if (vnlru_free_locked(1) > 0)
 		goto alloc;
 	if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
 		/*
 		 * Wait for space for a new vnode.
 		 */
 		vnlru_kick();
 		/* No PDROP: the list lock is held again on return. */
 		msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz);
 		if (atomic_load_long(&numvnodes) + 1 > desiredvnodes &&
 		    vnlru_read_freevnodes() > 1)
 			vnlru_free_locked(1);
 	}
 alloc:
 	/* Account for the new vnode, then kick vnlru if space is short. */
 	rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1;
 	if (vnlru_under(rnumvnodes, vlowat))
 		vnlru_kick();
 	mtx_unlock(&vnode_list_mtx);
 	return (uma_zalloc_smr(vnode_zone, M_WAITOK));
 }
 
 /*
  * Allocate a vnode from the vnode zone.
  *
  * The fast path bumps numvnodes and allocates directly.  The slow path,
  * vn_alloc_hard(), is taken when a slow-path cycle is already in progress
  * or when the lock-free check reports that space is running short.
  */
 static struct vnode *
 vn_alloc(struct mount *mp)
 {
 	u_long rnumvnodes;
 
 	if (__predict_false(vn_alloc_cyclecount != 0))
 		return (vn_alloc_hard(mp));
 
 	/* Optimistically account for the new vnode. */
 	rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1;
 	if (!__predict_false(vnlru_under_unlocked(rnumvnodes, vlowat)))
 		return (uma_zalloc_smr(vnode_zone, M_WAITOK));
 
 	/* Short of space: undo the bump, vn_alloc_hard() does its own. */
 	atomic_subtract_long(&numvnodes, 1);
 	return (vn_alloc_hard(mp));
 }
 
 /*
  * Drop one vnode from the numvnodes count and hand its memory back to
  * vnode_zone.
  */
 static void
 vn_free(struct vnode *vp)
 {
 	atomic_subtract_long(&numvnodes, 1);
 	uma_zfree_smr(vnode_zone, vp);
 }
 
 /*
  * Allocate and initialize a new vnode.
  *
  * tag  - filesystem name; it also becomes the name of the vnode lock.
  * mp   - mount point the vnode is created for; may be NULL.
  * vops - vnode operations vector; must be registered.
  * vpp  - where the new vnode pointer is stored.
  *
  * The vnode is taken from the calling thread's reservation
  * (td_vp_reserved) when one exists, otherwise from vn_alloc().
  * Always returns 0.
  */
 int
 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
     struct vnode **vpp)
 {
 	struct vnode *vp;
 	struct thread *td;
 	struct lock_object *lo;
 
 	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
 
 	KASSERT(vops->registered,
 	    ("%s: not registered vector op %p\n", __func__, vops));
 
 	/* Consume the per-thread reserved vnode first, if there is one. */
 	td = curthread;
 	if (td->td_vp_reserved != NULL) {
 		vp = td->td_vp_reserved;
 		td->td_vp_reserved = NULL;
 	} else {
 		vp = vn_alloc(mp);
 	}
 	counter_u64_add(vnodes_created, 1);
 
 	vn_set_state(vp, VSTATE_UNINITIALIZED);
 
 	/*
 	 * Locks are given the generic name "vnode" when created.
 	 * Follow the historic practice of using the filesystem
 	 * name when they are allocated, e.g., "zfs", "ufs", "nfs", etc.
 	 *
 	 * Locks live in a witness group keyed on their name. Thus,
 	 * when a lock is renamed, it must also move from the witness
 	 * group of its old name to the witness group of its new name.
 	 *
 	 * The change only needs to be made when the vnode moves
 	 * from one filesystem type to another. We ensure that each
 	 * filesystem uses a single static name pointer for its tag so
 	 * that we can compare pointers rather than doing a strcmp().
 	 *
 	 * Without WITNESS the name is simply overwritten every time.
 	 */
 	lo = &vp->v_vnlock->lock_object;
 #ifdef WITNESS
 	if (lo->lo_name != tag) {
 #endif
 		lo->lo_name = tag;
 #ifdef WITNESS
 		WITNESS_DESTROY(lo);
 		WITNESS_INIT(lo, tag);
 	}
 #endif
 	/*
 	 * By default, don't allow shared locks unless filesystems opt-in.
 	 */
 	vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE;
 	/*
 	 * Finalize various vnode identity bits.
 	 */
 	KASSERT(vp->v_object == NULL, ("stale v_object %p", vp));
 	KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp));
 	KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp));
 	vp->v_type = VNON;
 	vp->v_op = vops;
 	vp->v_irflag = 0;
 	v_init_counters(vp);
 	vn_seqc_init(vp);
 	vp->v_bufobj.bo_ops = &buf_ops_bio;
 #ifdef DIAGNOSTIC
 	/* Only dead vnodes are expected to be created without a mount. */
 	if (mp == NULL && vops != &dead_vnodeops)
 		printf("NULL mp in getnewvnode(9), tag %s\n", tag);
 #endif
 #ifdef MAC
 	mac_vnode_init(vp);
 	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
 		mac_vnode_associate_singlelabel(mp, vp);
 #endif
 	/* The bufobj block size is inherited from the mount's I/O size. */
 	if (mp != NULL) {
 		vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize;
 	}
 
 	/*
 	 * For the filesystems which do not use vfs_hash_insert(),
 	 * still initialize v_hash to have vfs_hash_index() useful.
 	 * E.g., nullfs uses vfs_hash_index() on the lower vnode for
 	 * its own hashing.
 	 */
 	vp->v_hash = (uintptr_t)vp >> vnsz2log;
 
 	*vpp = vp;
 	return (0);
 }
 
 /*
  * Pre-allocate one vnode and park it in the current thread's
  * td_vp_reserved slot, where the next getnewvnode() call will pick it up.
  * The slot must be empty on entry.
  */
 void
 getnewvnode_reserve(void)
 {
 	struct thread *self = curthread;
 
 	MPASS(self->td_vp_reserved == NULL);
 	self->td_vp_reserved = vn_alloc(NULL);
 }
 
 /*
  * Release the current thread's reserved vnode, if it still has one.
  * Does nothing when the reservation was already consumed.
  */
 void
 getnewvnode_drop_reserve(void)
 {
 	struct thread *self = curthread;
 
 	if (self->td_vp_reserved == NULL)
 		return;
 	vn_free(self->td_vp_reserved);
 	self->td_vp_reserved = NULL;
 }
 
 /*
  * Final teardown of a vnode before it goes back to the zone.
  *
  * Expects the vnode interlock to be held on entry; it is released here with
  * VI_UNLOCK().  After asserting that no state is left over, the remaining
  * per-vnode fields are cleared and the vnode is handed to vn_free().
  */
 static void __noinline
 freevnode(struct vnode *vp)
 {
 	struct bufobj *bo;
 
 	/*
 	 * The vnode has been marked for destruction, so free it.
 	 *
 	 * The vnode will be returned to the zone where it will
 	 * normally remain until it is needed for another vnode. We
 	 * need to cleanup (or verify that the cleanup has already
 	 * been done) any residual data left from its current use
 	 * so as not to contaminate the freshly allocated vnode.
 	 */
 	CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
 	/*
 	 * Paired with vgone.
 	 */
 	vn_seqc_write_end_free(vp);
 
 	/* Sanity checks: nothing may still reference or use this vnode. */
 	bo = &vp->v_bufobj;
 	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
 	VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp);
 	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
 	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
 	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
 	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
 	VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
 	    ("clean blk trie not empty"));
 	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
 	VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
 	    ("dirty blk trie not empty"));
 	VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp,
 	    ("Dangling rangelock waiters"));
 	VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp,
 	    ("Leaked inactivation"));
 	VI_UNLOCK(vp);
 	cache_assert_no_entries(vp);
 
 #ifdef MAC
 	mac_vnode_destroy(vp);
 #endif
 	if (vp->v_pollinfo != NULL) {
 		/*
 		 * Use LK_NOWAIT to shut up witness about the lock. We may get
 		 * here while having another vnode locked when trying to
 		 * satisfy a lookup and needing to recycle.
 		 */
 		VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT);
 		destroy_vpollinfo(vp->v_pollinfo);
 		VOP_UNLOCK(vp);
 		vp->v_pollinfo = NULL;
 	}
 	/* Scrub the fields that must not leak into the next user. */
 	vp->v_mountedhere = NULL;
 	vp->v_unpcb = NULL;
 	vp->v_rdev = NULL;
 	vp->v_fifoinfo = NULL;
 	vp->v_iflag = 0;
 	vp->v_vflag = 0;
 	bo->bo_flag = 0;
 	vn_free(vp);
 }
 
 /*
  * Remove the vnode from its mount point's vnode list and release the mount
  * reference with MNT_REL().
  *
  * The vnode must not be on the lazy list.  The vnode interlock is acquired
  * here and is still held on return; the mount interlock is taken and
  * released internally.
  */
 static void
 delmntque(struct vnode *vp)
 {
 	struct mount *mp;
 
 	VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp);
 
 	mp = vp->v_mount;
 	MNT_ILOCK(mp);
 	VI_LOCK(vp);
 	vp->v_mount = NULL;
 	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
 		("bad mount point vnode list size"));
 	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 	mp->mnt_nvnodelistsize--;
 	/* NOTE(review): presumably pairs with MNT_REF() in insmntque1_int(). */
 	MNT_REL(mp);
 	MNT_IUNLOCK(mp);
 	/*
 	 * The caller expects the interlock to be still held.
 	 */
 	ASSERT_VI_LOCKED(vp, __func__);
 }
 
 /*
  * Common implementation of insmntque() and insmntque1(): put vp on mp's
  * vnode list and take a reference on the mount.
  *
  * Returns 0 on success.  Returns EBUSY without inserting when the mount is
  * being unmounted (MNTK_UNMOUNT) and either the unmount is forced
  * (MNTK_UNMOUNTF) or the mount's vnode list is already empty, unless the
  * vnode has VV_FORCEINSMQ set.  On that failure, if dtr is true, the vnode
  * has its v_data cleared, is switched to dead_vnodeops, and is disposed of
  * with vgone() followed by vput().
  *
  * Unless the mount has MNTK_UNLOCKED_INSMNTQUE set, the vnode must be
  * exclusively locked; with that flag set, dtr must be false.
  */
 static int
 insmntque1_int(struct vnode *vp, struct mount *mp, bool dtr)
 {
 
 	KASSERT(vp->v_mount == NULL,
 		("insmntque: vnode already on per mount vnode list"));
 	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
 	if ((mp->mnt_kern_flag & MNTK_UNLOCKED_INSMNTQUE) == 0) {
 		ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
 	} else {
 		KASSERT(!dtr,
 		    ("%s: can't have MNTK_UNLOCKED_INSMNTQUE and cleanup",
 		    __func__));
 	}
 
 	/*
 	 * We acquire the vnode interlock early to ensure that the
 	 * vnode cannot be recycled by another process releasing a
 	 * holdcnt on it before we get it on both the vnode list
 	 * and the active vnode list. The mount mutex protects only
 	 * manipulation of the vnode list and the vnode freelist
 	 * mutex protects only manipulation of the active vnode list.
 	 * Hence the need to hold the vnode interlock throughout.
 	 */
 	MNT_ILOCK(mp);
 	VI_LOCK(vp);
 	if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 &&
 	    ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
 	    mp->mnt_nvnodelistsize == 0)) &&
 	    (vp->v_vflag & VV_FORCEINSMQ) == 0) {
 		VI_UNLOCK(vp);
 		MNT_IUNLOCK(mp);
 		/* Destructive failure mode: tear the vnode down for the caller. */
 		if (dtr) {
 			vp->v_data = NULL;
 			vp->v_op = &dead_vnodeops;
 			vgone(vp);
 			vput(vp);
 		}
 		return (EBUSY);
 	}
 	vp->v_mount = mp;
 	MNT_REF(mp);
 	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 	VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
 		("neg mount point vnode list size"));
 	mp->mnt_nvnodelistsize++;
 	VI_UNLOCK(vp);
 	MNT_IUNLOCK(mp);
 	return (0);
 }
 
 /*
  * Insert into list of vnodes for the new mount point, if available.
  * insmntque() reclaims the vnode on insertion failure, insmntque1()
  * leaves handling of the vnode to the caller.
  */
 int
 insmntque(struct vnode *vp, struct mount *mp)
 {
 	return (insmntque1_int(vp, mp, true));
 }
 
 int
 insmntque1(struct vnode *vp, struct mount *mp)
 {
 	return (insmntque1_int(vp, mp, false));
 }
 
 /*
  * Flush out and invalidate all buffers associated with a bufobj
  * Called with the underlying object locked.
  *
  * flags    - V_* flags selecting what is saved and flushed.
  * slpflag  - passed through to bufobj_wwait() and flushbuflist().
  * slptimeo - passed through to bufobj_wwait() and flushbuflist().
  *
  * Returns 0 on success.  With V_SAVE it can return an error from
  * bufobj_wwait() or BO_SYNC(), or EBUSY when dirty buffers or pending
  * output remain after the sync.  Any flushbuflist() error other than EAGAIN
  * is returned as is.  The bufobj lock is taken and released internally.
  */
 int
 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
 {
 	int error;
 
 	BO_LOCK(bo);
 	if (flags & V_SAVE) {
 		/* Let in-flight writes drain, then push out dirty buffers. */
 		error = bufobj_wwait(bo, slpflag, slptimeo);
 		if (error) {
 			BO_UNLOCK(bo);
 			return (error);
 		}
 		if (bo->bo_dirty.bv_cnt > 0) {
 			BO_UNLOCK(bo);
 			/* BO_SYNC() is retried for as long as it asks for it. */
 			do {
 				error = BO_SYNC(bo, MNT_WAIT);
 			} while (error == ERELOOKUP);
 			if (error != 0)
 				return (error);
 			BO_LOCK(bo);
 			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) {
 				BO_UNLOCK(bo);
 				return (EBUSY);
 			}
 		}
 	}
 	/*
 	 * If you alter this loop please notice that interlock is dropped and
 	 * reacquired in flushbuflist.  Special care is needed to ensure that
 	 * no race conditions occur from this.
 	 *
 	 * EAGAIN from flushbuflist() means "rescan"; the loop ends once a
 	 * full pass returns 0.
 	 */
 	do {
 		error = flushbuflist(&bo->bo_clean,
 		    flags, bo, slpflag, slptimeo);
 		if (error == 0 && !(flags & V_CLEANONLY))
 			error = flushbuflist(&bo->bo_dirty,
 			    flags, bo, slpflag, slptimeo);
 		if (error != 0 && error != EAGAIN) {
 			BO_UNLOCK(bo);
 			return (error);
 		}
 	} while (error != 0);
 
 	/*
 	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
 	 * have write I/O in-progress but if there is a VM object then the
 	 * VM object can also have read-I/O in-progress.
 	 */
 	do {
 		bufobj_wwait(bo, 0, 0);
 		if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) {
 			BO_UNLOCK(bo);
 			vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx");
 			BO_LOCK(bo);
 		}
 	} while (bo->bo_numoutput > 0);
 	BO_UNLOCK(bo);
 
 	/*
 	 * Destroy the copy in the VM cache, too.
 	 */
 	if (bo->bo_object != NULL &&
 	    (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) {
 		VM_OBJECT_WLOCK(bo->bo_object);
 		vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
 		    OBJPR_CLEANONLY : 0);
 		VM_OBJECT_WUNLOCK(bo->bo_object);
 	}
 
 #ifdef INVARIANTS
 	/* Verify that a full invalidation really left nothing behind. */
 	BO_LOCK(bo);
 	if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO |
 	    V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 ||
 	    bo->bo_clean.bv_cnt > 0))
 		panic("vinvalbuf: flush failed");
 	if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 &&
 	    bo->bo_dirty.bv_cnt > 0)
 		panic("vinvalbuf: flush dirty failed");
 	BO_UNLOCK(bo);
 #endif
 	return (0);
 }
 
 /*
  * Flush out and invalidate every buffer attached to a vnode.
  * The vnode must be locked by the caller.
  *
  * Nothing is done, and 0 is returned, when the vnode has a VM object whose
  * handle is some other vnode.  Otherwise the result of bufobj_invalbuf() on
  * the vnode's bufobj is returned.
  */
 int
 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
 {
 	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
 	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
 
 	if (vp->v_object == NULL || vp->v_object->handle == vp) {
 		return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag,
 		    slptimeo));
 	}
 	return (0);
 }
 
 /*
  * Flush out buffers on the specified list.
  *
  * Called with the bufobj write-locked.  The lock is dropped while each
  * buffer is locked and processed, and is held again on every return path.
  *
  * Returns 0 when no buffer was eligible (empty list or everything skipped),
  * EAGAIN when at least one buffer was handled and the caller should rescan,
  * or the BUF_TIMELOCK() error (ENOLCK is mapped to EAGAIN).
  */
 static int
 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
     int slptimeo)
 {
 	struct buf *bp, *nbp;
 	int retval, error;
 	daddr_t lblkno;
 	b_xflags_t xflags;
 
 	ASSERT_BO_WLOCKED(bo);
 
 	retval = 0;
 	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
 		/*
 		 * If we are flushing both V_NORMAL and V_ALT buffers then
 		 * do not skip any buffers. If we are flushing only V_NORMAL
 		 * buffers then skip buffers marked as BX_ALTDATA. If we are
 		 * flushing only V_ALT buffers then skip buffers not marked
 		 * as BX_ALTDATA.
 		 */
 		if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) &&
 		   (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) ||
 		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) {
 			continue;
 		}
 		/*
 		 * Remember the identity of the next buffer so it can be
 		 * revalidated after the bufobj lock has been dropped.
 		 */
 		if (nbp != NULL) {
 			lblkno = nbp->b_lblkno;
 			xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
 		}
 		retval = EAGAIN;
 		error = BUF_TIMELOCK(bp,
 		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
 		    "flushbuf", slpflag, slptimeo);
 		if (error) {
 			BO_LOCK(bo);
 			return (error != ENOLCK ? error : EAGAIN);
 		}
 		KASSERT(bp->b_bufobj == bo,
 		    ("bp %p wrong b_bufobj %p should be %p",
 		    bp, bp->b_bufobj, bo));
 		/*
 		 * XXX Since there are no node locks for NFS, I
 		 * believe there is a slight chance that a delayed
 		 * write will occur while sleeping just above, so
 		 * check for it.
 		 */
 		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
 		    (flags & V_SAVE)) {
 			bremfree(bp);
 			bp->b_flags |= B_ASYNC;
 			bwrite(bp);
 			BO_LOCK(bo);
 			return (EAGAIN);	/* XXX: why not loop ? */
 		}
 		/* Invalidate the buffer and release it. */
 		bremfree(bp);
 		bp->b_flags |= (B_INVAL | B_RELBUF);
 		bp->b_flags &= ~B_ASYNC;
 		brelse(bp);
 		BO_LOCK(bo);
 		if (nbp == NULL)
 			break;
 		/*
 		 * Look the successor up again; stop if it is gone or has
 		 * moved to the other list while the lock was dropped.
 		 */
 		nbp = gbincore(bo, lblkno);
 		if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
 		    != xflags)
 			break;			/* nbp invalid */
 	}
 	return (retval);
 }
 
 /*
  * Release every buffer on bufv whose logical block number lies in
  * [startn, endn), marking it B_RELBUF (and B_NOREUSE for VMIO buffers).
  *
  * The bufobj must be locked on entry.  The lock is dropped while each buffer
  * is locked and re-taken with BO_RLOCK() afterwards, including on error.
  *
  * Returns 0 when the range has been processed, or the BUF_TIMELOCK() error
  * (ENOLCK is not returned; the lookup is simply retried).
  */
 int
 bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn)
 {
 	struct buf *bp;
 	int error;
 	daddr_t lblkno;
 
 	ASSERT_BO_LOCKED(bo);
 
 	for (lblkno = startn;;) {
 again:
 		/* Find the first buffer at or after the current position. */
 		bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno);
 		if (bp == NULL || bp->b_lblkno >= endn ||
 		    bp->b_lblkno < startn)
 			break;
 		error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
 		    LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0);
 		if (error != 0) {
 			BO_RLOCK(bo);
 			if (error == ENOLCK)
 				goto again;
 			return (error);
 		}
 		KASSERT(bp->b_bufobj == bo,
 		    ("bp %p wrong b_bufobj %p should be %p",
 		    bp, bp->b_bufobj, bo));
 		/* Continue with the block following this one. */
 		lblkno = bp->b_lblkno + 1;
 		if ((bp->b_flags & B_MANAGED) == 0)
 			bremfree(bp);
 		bp->b_flags |= B_RELBUF;
 		/*
 		 * In the VMIO case, use the B_NOREUSE flag to hint that the
 		 * pages backing each buffer in the range are unlikely to be
 		 * reused.  Dirty buffers will have the hint applied once
 		 * they've been written.
 		 */
 		if ((bp->b_flags & B_VMIO) != 0)
 			bp->b_flags |= B_NOREUSE;
 		brelse(bp);
 		BO_RLOCK(bo);
 	}
 	return (0);
 }
 
 /*
  * Truncate a file's buffer and pages to a specified length.  This
  * is in lieu of the old vinvalbuf mechanism, which performed unneeded
  * sync activity.
  *
  * vp      - locked vnode being truncated.
  * length  - new file length in bytes.
  * blksize - block size used to convert length into a block number.
  *
  * Always returns 0.
  */
 int
 vtruncbuf(struct vnode *vp, off_t length, int blksize)
 {
 	struct buf *bp, *nbp;
 	struct bufobj *bo;
 	daddr_t startlbn;
 
 	CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__,
 	    vp, blksize, (uintmax_t)length);
 
 	/*
 	 * Round up to the *next* lbn.
 	 */
 	startlbn = howmany(length, blksize);
 
 	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
 
 	bo = &vp->v_bufobj;
 restart_unlocked:
 	BO_LOCK(bo);
 
 	/* Invalidate everything from startlbn onward, retrying on EAGAIN. */
 	while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN)
 		;
 
 	if (length > 0) {
 restartsync:
 		/*
 		 * Write out dirty buffers whose logical block number is not
 		 * positive.  NOTE(review): presumably these are metadata
 		 * blocks -- confirm.
 		 */
 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 			if (bp->b_lblkno > 0)
 				continue;
 			/*
 			 * Since we hold the vnode lock this should only
 			 * fail if we're racing with the buf daemon.
 			 */
 			if (BUF_LOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 			    BO_LOCKPTR(bo)) == ENOLCK)
 				goto restart_unlocked;
 
 			VNASSERT((bp->b_flags & B_DELWRI), vp,
 			    ("buf(%p) on dirty queue without DELWRI", bp));
 
 			bremfree(bp);
 			bawrite(bp);
 			/* The list may have changed; rescan from the start. */
 			BO_LOCK(bo);
 			goto restartsync;
 		}
 	}
 
 	bufobj_wwait(bo, 0, 0);
 	BO_UNLOCK(bo);
 	vnode_pager_setsize(vp, length);
 
 	return (0);
 }
 
 /*
  * Drop the cached buffers and pages of a file that cover the block range
  * [startlbn, endlbn).  The vnode must be locked and blksize must match the
  * block size of the vnode's bufobj.
  */
 void
 v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn,
     int blksize)
 {
 	struct bufobj *bo;
 	off_t first_off, limit_off;
 
 	ASSERT_VOP_LOCKED(vp, "v_inval_buf_range");
 
 	bo = &vp->v_bufobj;
 	first_off = blksize * startlbn;
 	limit_off = blksize * endlbn;
 
 	BO_LOCK(bo);
 	MPASS(blksize == bo->bo_bsize);
 	/* Repeat until a pass finishes without asking for a retry. */
 	for (;;) {
 		if (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) !=
 		    EAGAIN)
 			break;
 	}
 	BO_UNLOCK(bo);
 
 	vn_pages_remove(vp, OFF_TO_IDX(first_off),
 	    OFF_TO_IDX(limit_off + PAGE_SIZE - 1));
 }
 
 /*
  * Invalidate and release the clean and dirty buffers of bo whose logical
  * block numbers lie in [startlbn, endlbn).
  *
  * Called with the vnode locked and the bufobj locked; the bufobj lock is
  * dropped while a buffer is being released and is held again on return.
  *
  * Returns 0 once a complete pass frees nothing more.  Returns EAGAIN when a
  * buffer lock could not be obtained or the next list element changed while
  * the bufobj lock was dropped; the caller is expected to call again.
  */
 static int
 v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
     daddr_t startlbn, daddr_t endlbn)
 {
 	struct buf *bp, *nbp;
 	bool anyfreed;
 
 	ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked");
 	ASSERT_BO_LOCKED(bo);
 
 	do {
 		anyfreed = false;
 		/* First pass: the clean list. */
 		TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
 			if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn)
 				continue;
 			if (BUF_LOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 			    BO_LOCKPTR(bo)) == ENOLCK) {
 				BO_LOCK(bo);
 				return (EAGAIN);
 			}
 
 			bremfree(bp);
 			bp->b_flags |= B_INVAL | B_RELBUF;
 			bp->b_flags &= ~B_ASYNC;
 			brelse(bp);
 			anyfreed = true;
 
 			BO_LOCK(bo);
 			/*
 			 * The saved successor is only usable if it is still
 			 * a clean, non-delayed-write buffer of this vnode.
 			 */
 			if (nbp != NULL &&
 			    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
 			    nbp->b_vp != vp ||
 			    (nbp->b_flags & B_DELWRI) != 0))
 				return (EAGAIN);
 		}
 
 		/* Second pass: the dirty list. */
 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 			if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn)
 				continue;
 			if (BUF_LOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 			    BO_LOCKPTR(bo)) == ENOLCK) {
 				BO_LOCK(bo);
 				return (EAGAIN);
 			}
 			bremfree(bp);
 			bp->b_flags |= B_INVAL | B_RELBUF;
 			bp->b_flags &= ~B_ASYNC;
 			brelse(bp);
 			anyfreed = true;
 
 			BO_LOCK(bo);
 			/*
 			 * The saved successor is only usable if it is still
 			 * a dirty, delayed-write buffer of this vnode.
 			 */
 			if (nbp != NULL &&
 			    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
 			    (nbp->b_vp != vp) ||
 			    (nbp->b_flags & B_DELWRI) == 0))
 				return (EAGAIN);
 		}
 	} while (anyfreed);
 	return (0);
 }
 
 /*
  * Take a buffer off whichever of its bufobj's lists (clean or dirty) it is
  * currently on and clear its list membership flags.  The bufobj must be
  * write-locked and the buffer must be on exactly one of the two lists.
  */
 static void
 buf_vlist_remove(struct buf *bp)
 {
 	struct bufv *list;
 	b_xflags_t qstate;
 
 	qstate = bp->b_xflags;
 
 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 	ASSERT_BO_WLOCKED(bp->b_bufobj);
 	KASSERT((qstate & (BX_VNDIRTY | BX_VNCLEAN)) != 0 &&
 	    (qstate & (BX_VNDIRTY | BX_VNCLEAN)) != (BX_VNDIRTY | BX_VNCLEAN),
 	    ("%s: buffer %p has invalid queue state", __func__, bp));
 
 	/* Dirty membership takes precedence when choosing the list. */
 	list = &bp->b_bufobj->bo_clean;
 	if ((qstate & BX_VNDIRTY) != 0)
 		list = &bp->b_bufobj->bo_dirty;
 
 	BUF_PCTRIE_REMOVE(&list->bv_root, bp->b_lblkno);
 	TAILQ_REMOVE(&list->bv_hd, bp, b_bobufs);
 	list->bv_cnt--;
 	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
 }
 
 /*
  * Add the buffer to the sorted clean or dirty block list.
  *
  * bp     - buffer to insert; must not be on either list yet.
  * bo     - write-locked bufobj that owns the lists.
  * xflags - BX_VNDIRTY selects the dirty list, anything else the clean list;
  *          the value is OR-ed into bp->b_xflags.
  *
  * Panics if the trie insertion fails.
  *
  * NOTE: xflags is passed as a constant, optimizing this inline function!
  */
 static void
 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
 {
 	struct bufv *bv;
 	struct buf *n;
 	int error;
 
 	ASSERT_BO_WLOCKED(bo);
 	KASSERT((bo->bo_flag & BO_NOBUFS) == 0,
 	    ("buf_vlist_add: bo %p does not allow bufs", bo));
 	KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0,
 	    ("dead bo %p", bo));
 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
 	    ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
 	bp->b_xflags |= xflags;
 	if (xflags & BX_VNDIRTY)
 		bv = &bo->bo_dirty;
 	else
 		bv = &bo->bo_clean;
 
 	/*
 	 * Keep the list ordered.  Optimize empty list insertion.  Assume
 	 * we tend to grow at the tail so lookup_le should usually be cheaper
 	 * than _ge.
 	 */
 	if (bv->bv_cnt == 0 ||
 	    bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno)
 		TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
 	else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL)
 		TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
 	else
 		TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs);
 	/* The trie is updated only after the list position is decided. */
 	error = BUF_PCTRIE_INSERT(&bv->bv_root, bp);
 	if (error)
 		panic("buf_vlist_add:  Preallocated nodes insufficient.");
 	bv->bv_cnt++;
 }
 
 /*
  * Find the buffer for a logical block number in a locked bufobj.
  * The clean trie is consulted first and the dirty trie second; NULL is
  * returned when neither holds the block.
  */
 struct buf *
 gbincore(struct bufobj *bo, daddr_t lblkno)
 {
 	struct buf *found;
 
 	ASSERT_BO_LOCKED(bo);
 	found = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
 	if (found == NULL)
 		found = BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno);
 	return (found);
 }
 
 /*
  * Lockless counterpart of gbincore(): find a buf through the buffer tries
  * while the bufobj lock is not held.  Safety of the lookup comes from SMR,
  * and bufs live in a no-free zone, which keeps the result type-stable.  As
  * with any lockless lookup, the buf that is returned may already be invalid
  * by the time the caller looks at it.
  */
 struct buf *
 gbincore_unlocked(struct bufobj *bo, daddr_t lblkno)
 {
 	struct buf *found;
 
 	ASSERT_BO_UNLOCKED(bo);
 	found = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno);
 	if (found == NULL) {
 		found = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root,
 		    lblkno);
 	}
 	return (found);
 }
 
 /*
  * Associate a buffer with a vnode.
  *
  * The vnode's bufobj must be write-locked and the buffer must not already
  * belong to a vnode.  A hold reference is taken on the vnode and the buffer
  * is placed on the bufobj's clean list.
  */
 void
 bgetvp(struct vnode *vp, struct buf *bp)
 {
 	struct bufobj *bo;
 
 	bo = &vp->v_bufobj;
 	ASSERT_BO_WLOCKED(bo);
 	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
 
 	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
 	VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
 	    ("bgetvp: bp already attached! %p", bp));
 
 	/* The hold taken here is dropped again by brelvp(). */
 	vhold(vp);
 	bp->b_vp = vp;
 	bp->b_bufobj = bo;
 	/*
 	 * Insert onto list for new vnode.
 	 */
 	buf_vlist_add(bp, bo, BX_VNCLEAN);
 }
 
 /*
  * Disassociate a buffer from a vnode.
  *
  * Removes the buffer from its bufobj list, takes the bufobj off the syncer
  * worklist when it no longer has dirty buffers, clears the buffer's vnode
  * and bufobj pointers, and drops the hold reference on the vnode.  The
  * bufobj lock is taken and released internally.
  */
 void
 brelvp(struct buf *bp)
 {
 	struct bufobj *bo;
 	struct vnode *vp;
 
 	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
 
 	/*
 	 * Delete from old vnode list, if on one.
 	 */
 	vp = bp->b_vp;		/* XXX */
 	bo = bp->b_bufobj;
 	BO_LOCK(bo);
 	buf_vlist_remove(bp);
 	/* Last dirty buffer gone: the syncer has nothing left to do here. */
 	if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
 		bo->bo_flag &= ~BO_ONWORKLST;
 		mtx_lock(&sync_mtx);
 		LIST_REMOVE(bo, bo_synclist);
 		syncer_worklist_len--;
 		mtx_unlock(&sync_mtx);
 	}
 	bp->b_vp = NULL;
 	bp->b_bufobj = NULL;
 	BO_UNLOCK(bo);
 	/* Saved vp is used because bp->b_vp has just been cleared. */
 	vdrop(vp);
 }
 
 /*
  * Queue a bufobj on the syncer work list, "delay" slots ahead of the slot
  * the syncer is currently at.  A bufobj that is already queued is moved to
  * the new slot; otherwise it is marked BO_ONWORKLST and counted.  The delay
  * is capped at syncer_maxdelay - 2.  The bufobj must be write-locked.
  */
 static void
 vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
 {
 	int target;
 
 	ASSERT_BO_WLOCKED(bo);
 
 	mtx_lock(&sync_mtx);
 	if ((bo->bo_flag & BO_ONWORKLST) == 0) {
 		bo->bo_flag |= BO_ONWORKLST;
 		syncer_worklist_len++;
 	} else {
 		LIST_REMOVE(bo, bo_synclist);
 	}
 
 	if (delay > syncer_maxdelay - 2)
 		delay = syncer_maxdelay - 2;
 	target = (syncer_delayno + delay) & syncer_mask;
 	LIST_INSERT_HEAD(&syncer_workitem_pending[target], bo, bo_synclist);
 	mtx_unlock(&sync_mtx);
 }
 
 /*
  * Sysctl handler reporting the syncer worklist length with the syncer
  * vnodes themselves (sync_vnode_count) excluded.  The value is sampled
  * under sync_mtx and copied out after the mutex has been released.
  */
 static int
 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
 {
 	int len;
 
 	mtx_lock(&sync_mtx);
 	len = syncer_worklist_len - sync_vnode_count;
 	mtx_unlock(&sync_mtx);
 
 	return (SYSCTL_OUT(req, &len, sizeof(len)));
 }
 
 /* Read-only integer sysctl under _vfs, served by sysctl_vfs_worklist_len(). */
 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len,
     CTLTYPE_INT | CTLFLAG_MPSAFE| CTLFLAG_RD, NULL, 0,
     sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
 
 /*
  * Start-up glue for the syncer: SYSINIT arranges for kproc_start() to be
  * called with up_kp, which names the process "syncer", gives sched_sync()
  * as its function and points at updateproc.
  * NOTE(review): updateproc is presumably filled in with the created
  * process by kproc_start() -- confirm against struct kproc_desc.
  */
 static struct proc *updateproc;
 static void sched_sync(void);
 static struct kproc_desc up_kp = {
 	"syncer",
 	sched_sync,
 	&updateproc
 };
 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
 
 /*
  * Sync the vnode behind the first bufobj on a syncer work list slot.
  *
  * slp - work list slot to take the first entry from.
  * bo  - out: the bufobj that was picked (NULL if the slot is empty).
  * td  - thread passed on to VOP_FSYNC().
  *
  * Called with sync_mtx held.  The mutex is dropped while the vnode is being
  * synced and is held again on return.
  *
  * Returns 0 when the slot is empty or the vnode was processed.  Returns 1
  * when the vnode could not be locked right away, or when starting the write
  * failed and the bufobj is still first on the slot; sched_sync() then moves
  * the bufobj to the next slot.
  */
 static int
 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
 {
 	struct vnode *vp;
 	struct mount *mp;
 
 	*bo = LIST_FIRST(slp);
 	if (*bo == NULL)
 		return (0);
 	vp = bo2vnode(*bo);
 	/* Do not block on a busy vnode; let the caller defer it. */
 	if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
 		return (1);
 	/*
 	 * We use vhold in case the vnode does not
 	 * successfully sync.  vhold prevents the vnode from
 	 * going away when we unlock the sync_mtx so that
 	 * we can acquire the vnode interlock.
 	 */
 	vholdl(vp);
 	mtx_unlock(&sync_mtx);
 	VI_UNLOCK(vp);
 	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 		vdrop(vp);
 		mtx_lock(&sync_mtx);
 		return (*bo == LIST_FIRST(slp));
 	}
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	(void) VOP_FSYNC(vp, MNT_LAZY, td);
 	VOP_UNLOCK(vp);
 	vn_finished_write(mp);
 	BO_LOCK(*bo);
 	if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
 		/*
 		 * Put us back on the worklist.  The worklist
 		 * routine will remove us from our current
 		 * position and then add us back in at a later
 		 * position.
 		 */
 		vn_syncer_add_to_worklist(*bo, syncdelay);
 	}
 	BO_UNLOCK(*bo);
 	vdrop(vp);
 	mtx_lock(&sync_mtx);
 	return (0);
 }
 
 /*
  * Non-zero until sched_sync() has printed its "Syncing disks" banner;
  * syncer_resume() sets it back to 1.  Once it is 0, sched_sync() also pats
  * the watchdog after each processed vnode.
  */
 static int first_printf = 1;
 
 /*
  * System filesystem synchronizer daemon.
  *
  * Body of the "syncer" kernel process; it never returns.  Each iteration
  * advances syncer_delayno by one slot of syncer_workitem_pending[] and
  * syncs the vnodes queued on the slot just left, then paces itself with a
  * timed wait.  sync_mtx is held throughout except where it is explicitly
  * dropped.  The shutdown handler registered here drives the state machine
  * RUNNING -> SHUTTING_DOWN -> FINAL_DELAY.
  */
 static void
 sched_sync(void)
 {
 	struct synclist *next, *slp;
 	struct bufobj *bo;
 	long starttime;
 	struct thread *td = curthread;
 	int last_work_seen;
 	int net_worklist_len;
 	int syncer_final_iter;
 	int error;
 
 	last_work_seen = 0;
 	syncer_final_iter = 0;
 	syncer_state = SYNCER_RUNNING;
 	starttime = time_uptime;
 	td->td_pflags |= TDP_NORUNNINGBUF;
 
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
 	    SHUTDOWN_PRI_LAST);
 
 	mtx_lock(&sync_mtx);
 	for (;;) {
 		/*
 		 * The final delay has run its course: honour a pending
 		 * suspend request.
 		 */
 		if (syncer_state == SYNCER_FINAL_DELAY &&
 		    syncer_final_iter == 0) {
 			mtx_unlock(&sync_mtx);
 			kproc_suspend_check(td->td_proc);
 			mtx_lock(&sync_mtx);
 		}
 		/* Work remaining, not counting the syncer vnodes themselves. */
 		net_worklist_len = syncer_worklist_len - sync_vnode_count;
 		/* While shutting down, report progress when the second changes. */
 		if (syncer_state != SYNCER_RUNNING &&
 		    starttime != time_uptime) {
 			if (first_printf) {
 				printf("\nSyncing disks, vnodes remaining... ");
 				first_printf = 0;
 			}
 			printf("%d ", net_worklist_len);
 		}
 		starttime = time_uptime;
 
 		/*
 		 * Push files whose dirty time has expired.  Be careful
 		 * of interrupt race on slp queue.
 		 *
 		 * Skip over empty worklist slots when shutting down.
 		 */
 		do {
 			slp = &syncer_workitem_pending[syncer_delayno];
 			syncer_delayno += 1;
 			if (syncer_delayno == syncer_maxdelay)
 				syncer_delayno = 0;
 			next = &syncer_workitem_pending[syncer_delayno];
 			/*
 			 * If the worklist has wrapped since it was
 			 * emptied of all but syncer vnodes,
 			 * switch to the FINAL_DELAY state and run
 			 * for one more second.
 			 */
 			if (syncer_state == SYNCER_SHUTTING_DOWN &&
 			    net_worklist_len == 0 &&
 			    last_work_seen == syncer_delayno) {
 				syncer_state = SYNCER_FINAL_DELAY;
 				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
 			}
 		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
 		    syncer_worklist_len > 0);
 
 		/*
 		 * Keep track of the last time there was anything
 		 * on the worklist other than syncer vnodes.
 		 * Return to the SHUTTING_DOWN state if any
 		 * new work appears.
 		 */
 		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
 			last_work_seen = syncer_delayno;
 		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
 			syncer_state = SYNCER_SHUTTING_DOWN;
 		while (!LIST_EMPTY(slp)) {
 			error = sync_vnode(slp, &bo, td);
 			/* Vnode was busy: defer it to the next slot. */
 			if (error == 1) {
 				LIST_REMOVE(bo, bo_synclist);
 				LIST_INSERT_HEAD(next, bo, bo_synclist);
 				continue;
 			}
 
 			if (first_printf == 0) {
 				/*
 				 * Drop the sync mutex, because some watchdog
 				 * drivers need to sleep while patting
 				 */
 				mtx_unlock(&sync_mtx);
 				wdog_kern_pat(WD_LASTVAL);
 				mtx_lock(&sync_mtx);
 			}
 		}
 		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
 			syncer_final_iter--;
 		/*
 		 * The variable rushjob allows the kernel to speed up the
 		 * processing of the filesystem syncer process. A rushjob
 		 * value of N tells the filesystem syncer to process the next
 		 * N seconds worth of work on its queue ASAP. Currently rushjob
 		 * is used by the soft update code to speed up the filesystem
 		 * syncer process when the incore state is getting so far
 		 * ahead of the disk that the kernel memory pool is being
 		 * threatened with exhaustion.
 		 */
 		if (rushjob > 0) {
 			rushjob -= 1;
 			continue;
 		}
 		/*
 		 * Just sleep for a short period of time between
 		 * iterations when shutting down to allow some I/O
 		 * to happen.
 		 *
 		 * If it has taken us less than a second to process the
 		 * current work, then wait. Otherwise start right over
 		 * again. We can still lose time if any single round
 		 * takes more than two seconds, but it does not really
 		 * matter as we are just trying to generally pace the
 		 * filesystem activity.
 		 */
 		if (syncer_state != SYNCER_RUNNING ||
 		    time_uptime == starttime) {
 			thread_lock(td);
 			sched_prio(td, PPAUSE);
 			thread_unlock(td);
 		}
 		if (syncer_state != SYNCER_RUNNING)
 			cv_timedwait(&sync_wakeup, &sync_mtx,
 			    hz / SYNCER_SHUTDOWN_SPEEDUP);
 		else if (time_uptime == starttime)
 			cv_timedwait(&sync_wakeup, &sync_mtx, hz);
 	}
 }
 
 /*
  * Ask the syncer daemon to work faster.
  *
  * The request is honoured only while rushjob is below half of syncdelay, so
  * the syncer is never pushed beyond half of its normal turn time and cannot
  * monopolize the cpu.  The syncer is woken up in either case.
  *
  * Returns 1 when the request was recorded and 0 when the limit had already
  * been reached.
  */
 int
 speedup_syncer(void)
 {
 	int accepted;
 
 	accepted = 0;
 	mtx_lock(&sync_mtx);
 	if (rushjob < syncdelay / 2) {
 		rushjob++;
 		stat_rush_requests++;
 		accepted = 1;
 	}
 	mtx_unlock(&sync_mtx);
 
 	cv_broadcast(&sync_wakeup);
 	return (accepted);
 }
 
 /*
  * Tell the syncer to speed up its work and run through its work
  * list several times, then tell it to shut down.
  *
  * arg   - process handed on to kproc_shutdown().
  * howto - shutdown flags; nothing is done when RB_NOSYNC is set.
  *
  * Registered by sched_sync() as a shutdown_pre_sync event handler and also
  * called directly by syncer_suspend().
  */
 static void
 syncer_shutdown(void *arg, int howto)
 {
 
 	if (howto & RB_NOSYNC)
 		return;
 	mtx_lock(&sync_mtx);
 	syncer_state = SYNCER_SHUTTING_DOWN;
 	rushjob = 0;
 	mtx_unlock(&sync_mtx);
 	/* Wake the syncer so it notices the state change right away. */
 	cv_broadcast(&sync_wakeup);
 	kproc_shutdown(arg, howto);
 }
 
 /*
  * Put the syncer process through its shutdown sequence (with no special
  * "howto" flags).  syncer_resume() reverses this.
  */
 void
 syncer_suspend(void)
 {
 	syncer_shutdown(updateproc, 0);
 }
 
 /*
  * Return the syncer to normal operation: re-arm the shutdown banner, set
  * the state back to SYNCER_RUNNING under sync_mtx, wake the syncer and
  * resume its process.
  */
 void
 syncer_resume(void)
 {
 
 	mtx_lock(&sync_mtx);
 	first_printf = 1;
 	syncer_state = SYNCER_RUNNING;
 	mtx_unlock(&sync_mtx);
 	cv_broadcast(&sync_wakeup);
 	kproc_resume(updateproc);
 }
 
 /*
  * Move the buffer between the clean and dirty lists of its vnode.
  *
  * The destination is chosen from B_DELWRI: delayed-write buffers go on the
  * dirty list, all others on the clean list.  The bufobj is put on the syncer
  * worklist when it gets its first dirty buffer and taken off again when the
  * last one leaves.  Paging buffers (B_PAGING) are not allowed.  The bufobj
  * lock is taken and released internally.
  */
 void
 reassignbuf(struct buf *bp)
 {
 	struct vnode *vp;
 	struct bufobj *bo;
 	int delay;
 #ifdef INVARIANTS
 	struct bufv *bv;
 #endif
 
 	vp = bp->b_vp;
 	bo = bp->b_bufobj;
 
 	KASSERT((bp->b_flags & B_PAGING) == 0,
 	    ("%s: cannot reassign paging buffer %p", __func__, bp));
 
 	CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
 	    bp, bp->b_vp, bp->b_flags);
 
 	BO_LOCK(bo);
 	buf_vlist_remove(bp);
 
 	/*
 	 * If dirty, put on list of dirty buffers; otherwise insert onto list
 	 * of clean buffers.
 	 */
 	if (bp->b_flags & B_DELWRI) {
 		if ((bo->bo_flag & BO_ONWORKLST) == 0) {
 			/* The sync delay depends on the vnode type. */
 			switch (vp->v_type) {
 			case VDIR:
 				delay = dirdelay;
 				break;
 			case VCHR:
 				delay = metadelay;
 				break;
 			default:
 				delay = filedelay;
 			}
 			vn_syncer_add_to_worklist(bo, delay);
 		}
 		buf_vlist_add(bp, bo, BX_VNDIRTY);
 	} else {
 		buf_vlist_add(bp, bo, BX_VNCLEAN);
 
 		/* No dirty buffers left: drop off the syncer worklist. */
 		if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
 			mtx_lock(&sync_mtx);
 			LIST_REMOVE(bo, bo_synclist);
 			syncer_worklist_len--;
 			mtx_unlock(&sync_mtx);
 			bo->bo_flag &= ~BO_ONWORKLST;
 		}
 	}
 #ifdef INVARIANTS
 	/*
 	 * Check that the first and last buffers of both lists belong to this
 	 * bufobj.  Note that bp is reused as a scratch pointer from here on.
 	 */
 	bv = &bo->bo_clean;
 	bp = TAILQ_FIRST(&bv->bv_hd);
 	KASSERT(bp == NULL || bp->b_bufobj == bo,
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 	bp = TAILQ_LAST(&bv->bv_hd, buflists);
 	KASSERT(bp == NULL || bp->b_bufobj == bo,
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 	bv = &bo->bo_dirty;
 	bp = TAILQ_FIRST(&bv->bv_hd);
 	KASSERT(bp == NULL || bp->b_bufobj == bo,
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 	bp = TAILQ_LAST(&bv->bv_hd, buflists);
 	KASSERT(bp == NULL || bp->b_bufobj == bo,
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 #endif
 	BO_UNLOCK(bo);
 }
 
 /*
  * Give a freshly set up vnode its initial references: both the hold count
  * and the use count start at 1.  The vnode must still be uninitialized
  * (type VNON, no private data, no interlock flags) and its interlock must
  * not be held.
  */
 static void
 v_init_counters(struct vnode *vp)
 {
 
 	VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0,
 	    vp, ("%s called for an initialized vnode", __FUNCTION__));
 	ASSERT_VI_UNLOCKED(vp, __FUNCTION__);
 
 	refcount_init(&vp->v_holdcnt, 1);
 	refcount_init(&vp->v_usecount, 1);
 }
 
 /*
  * Grab a particular vnode from the free list, increment its
  * reference count and lock it.  VIRF_DOOMED is set if the vnode
  * is being destroyed.  Only callers who specify LK_RETRY will
  * see doomed vnodes.  If inactive processing was delayed in
  * vput try to do it here.
  *
  * usecount is manipulated using atomics without holding any locks.
  *
  * holdcnt can be manipulated using atomics without holding any locks,
  * except when transitioning 1<->0, in which case the interlock is held.
  *
  * Consumers which don't guarantee liveness of the vnode can use SMR to
  * try to get a reference. Note this operation can fail since the vnode
  * may be awaiting getting freed by the time they get to it.
  */
 /*
  * SMR flavour of vget_prep().  Must be called from within a VFS SMR section.
  *
  * Returns VGET_USECOUNT if the use count was non-zero and got bumped,
  * VGET_HOLDCNT if only a hold count could be obtained through vhold_smr(),
  * and VGET_NONE if no reference of any kind could be taken.
  */
 enum vgetstate
 vget_prep_smr(struct vnode *vp)
 {
 
 	VFS_SMR_ASSERT_ENTERED();
 
 	if (refcount_acquire_if_not_zero(&vp->v_usecount))
 		return (VGET_USECOUNT);
 	if (vhold_smr(vp))
 		return (VGET_HOLDCNT);
 	return (VGET_NONE);
 }
 
 /*
  * First half of vget(): take a reference on the vnode without locking it.
  *
  * If the use count is already non-zero it is bumped and VGET_USECOUNT is
  * returned.  Otherwise a hold count is taken with vhold() and VGET_HOLDCNT is
  * returned.  The result is to be passed to vget_finish(), vget_finish_ref()
  * or vget_abort().
  */
 enum vgetstate
 vget_prep(struct vnode *vp)
 {
 
 	if (refcount_acquire_if_not_zero(&vp->v_usecount))
 		return (VGET_USECOUNT);
 	vhold(vp);
 	return (VGET_HOLDCNT);
 }
 
 /*
  * Undo a reference obtained with vget_prep() or vget_prep_smr(): release the
  * use count for VGET_USECOUNT or the hold count for VGET_HOLDCNT.  Any other
  * state is a programming error.
  */
 void
 vget_abort(struct vnode *vp, enum vgetstate vs)
 {
 
 	if (vs == VGET_USECOUNT)
 		vrele(vp);
 	else if (vs == VGET_HOLDCNT)
 		vdrop(vp);
 	else
 		__assert_unreachable();
 }
 
 /*
  * Reference and lock a vnode: vget_prep() followed by vget_finish().
  * Returns 0 on success or the error reported by vget_finish().
  */
 int
 vget(struct vnode *vp, int flags)
 {
 
 	return (vget_finish(vp, flags, vget_prep(vp)));
 }
 
 int
 vget_finish(struct vnode *vp, int flags, enum vgetstate vs)
 {
 	int error;
 
 	if ((flags & LK_INTERLOCK) != 0)
 		ASSERT_VI_LOCKED(vp, __func__);
 	else
 		ASSERT_VI_UNLOCKED(vp, __func__);
 	VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp);
 	VNPASS(vp->v_holdcnt > 0, vp);
 	VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp);
 
 	error = vn_lock(vp, flags);
 	if (__predict_false(error != 0)) {
 		vget_abort(vp, vs);
 		CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
 		    vp);
 		return (error);
 	}
 
 	vget_finish_ref(vp, vs);
 	return (0);
 }
 
 /*
  * Convert the reference described by 'vs' into a use count.
  *
  * For VGET_USECOUNT the use count was already taken and there is nothing to
  * do.  For VGET_HOLDCNT the use count is bumped here; see the comment in the
  * body for what happens to the hold count.
  */
 void
 vget_finish_ref(struct vnode *vp, enum vgetstate vs)
 {
 	int old;
 
 	VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp);
 	VNPASS(vp->v_holdcnt > 0, vp);
 	VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp);
 
 	if (vs == VGET_USECOUNT)
 		return;
 
 	/*
 	 * We hold the vnode. If the usecount is 0 it will be utilized to keep
 	 * the vnode around. Otherwise someone else lent their hold count and
 	 * we have to drop ours.
 	 */
 	old = atomic_fetchadd_int(&vp->v_usecount, 1);
 	VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old));
 	if (old != 0) {
 #ifdef INVARIANTS
 		/*
 		 * Same decrement as the non-INVARIANTS branch, but done by
 		 * hand so that the previous value can be asserted on.
 		 */
 		old = atomic_fetchadd_int(&vp->v_holdcnt, -1);
 		VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old));
 #else
 		refcount_release(&vp->v_holdcnt);
 #endif
 	}
 }
 
 /*
  * Take a use count reference on a vnode without locking it: vget_prep()
  * immediately followed by vget_finish_ref().
  */
 void
 vref(struct vnode *vp)
 {
 
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	vget_finish_ref(vp, vget_prep(vp));
 }
 
 /*
  * Take an additional use count on a vnode which already has one.
  *
  * With INVARIANTS the increment is open-coded so that the previous value can
  * be checked to have been greater than zero.
  */
 void
 vrefact(struct vnode *vp)
 {
 
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 #ifdef INVARIANTS
 	int old = atomic_fetchadd_int(&vp->v_usecount, 1);
 	VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old));
 #else
 	refcount_acquire(&vp->v_usecount);
 #endif
 }
 
 /*
  * Put a held vnode on the lazy list of its mount point, unless it is already
  * there or the vnode is doomed.
  */
 void
 vlazy(struct vnode *vp)
 {
 	struct mount *mp;
 
 	VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__));
 
 	/*
 	 * Nothing to do if the vnode is already enqueued.  We may also get
 	 * here for inactive routines after the vnode got doomed, in which
 	 * case it is left alone.
 	 */
 	if ((vp->v_mflag & VMP_LAZYLIST) != 0 || VN_IS_DOOMED(vp))
 		return;
 
 	mp = vp->v_mount;
 	mtx_lock(&mp->mnt_listmtx);
 	/* Re-check the flag now that the list mutex is held. */
 	if ((vp->v_mflag & VMP_LAZYLIST) != 0) {
 		mtx_unlock(&mp->mnt_listmtx);
 		return;
 	}
 	vp->v_mflag |= VMP_LAZYLIST;
 	TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist);
 	mp->mnt_lazyvnodelistsize++;
 	mtx_unlock(&mp->mnt_listmtx);
 }
 
 /*
  * Take a vnode off the lazy list of its mount point, provided nobody holds
  * it anymore.
  *
  * Called with the vnode interlock held for a vnode which is not doomed and
  * which has VMP_LAZYLIST set (all three are asserted).
  */
 static void
 vunlazy(struct vnode *vp)
 {
 	struct mount *mp;
 
 	ASSERT_VI_LOCKED(vp, __func__);
 	VNPASS(!VN_IS_DOOMED(vp), vp);
 
 	mp = vp->v_mount;
 	mtx_lock(&mp->mnt_listmtx);
 	VNPASS(vp->v_mflag & VMP_LAZYLIST, vp);
 	/*
 	 * Don't remove the vnode from the lazy list if another thread
 	 * has increased the hold count. It may have re-enqueued the
 	 * vnode to the lazy list and is now responsible for its
 	 * removal.
 	 */
 	if (vp->v_holdcnt == 0) {
 		vp->v_mflag &= ~VMP_LAZYLIST;
 		TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist);
 		mp->mnt_lazyvnodelistsize--;
 	}
 	mtx_unlock(&mp->mnt_listmtx);
 }
 
 /*
  * This routine is only meant to be called from vgonel prior to dooming
  * the vnode.
  */
 /*
  * Unconditionally remove the vnode from the lazy list of its mount point, if
  * it is on it.  The vnode must be exclusively locked, interlocked and not yet
  * doomed.
  */
 static void
 vunlazy_gone(struct vnode *vp)
 {
 	struct mount *mp;
 
 	ASSERT_VOP_ELOCKED(vp, __func__);
 	ASSERT_VI_LOCKED(vp, __func__);
 	VNPASS(!VN_IS_DOOMED(vp), vp);
 
 	/* Not enqueued, nothing to undo. */
 	if ((vp->v_mflag & VMP_LAZYLIST) == 0)
 		return;
 
 	mp = vp->v_mount;
 	mtx_lock(&mp->mnt_listmtx);
 	VNPASS(vp->v_mflag & VMP_LAZYLIST, vp);
 	vp->v_mflag &= ~VMP_LAZYLIST;
 	TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist);
 	mp->mnt_lazyvnodelistsize--;
 	mtx_unlock(&mp->mnt_listmtx);
 }
 
 /*
  * Defer inactive processing of a vnode: put it on the lazy list and mark it
  * with VI_DEFINACT.
  *
  * Called with the interlock held and with a hold count owned by the caller.
  * The interlock is released on return.  On the success path the hold count
  * is kept (it is not dropped here); on every early-exit path it is released
  * with vdropl().
  */
 static void
 vdefer_inactive(struct vnode *vp)
 {
 
 	ASSERT_VI_LOCKED(vp, __func__);
 	VNASSERT(vp->v_holdcnt > 0, vp,
 	    ("%s: vnode without hold count", __func__));
 	/* Doomed vnodes are not deferred. */
 	if (VN_IS_DOOMED(vp)) {
 		vdropl(vp);
 		return;
 	}
 	/*
 	 * Already deferred: the earlier deferral owns a hold count of its
 	 * own (hence the > 1 assert), so ours can go.
 	 */
 	if (vp->v_iflag & VI_DEFINACT) {
 		VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count"));
 		vdropl(vp);
 		return;
 	}
 	/* The vnode is in use again, inactive processing is no longer owed. */
 	if (vp->v_usecount > 0) {
 		vp->v_iflag &= ~VI_OWEINACT;
 		vdropl(vp);
 		return;
 	}
 	vlazy(vp);
 	vp->v_iflag |= VI_DEFINACT;
 	VI_UNLOCK(vp);
 	counter_u64_add(deferred_inact, 1);
 }
 
 /*
  * Variant of vdefer_inactive() for callers which do not hold the interlock.
  * If inactive processing is no longer owed the hold count is simply dropped.
  */
 static void
 vdefer_inactive_unlocked(struct vnode *vp)
 {
 
 	VI_LOCK(vp);
 	if ((vp->v_iflag & VI_OWEINACT) != 0) {
 		vdefer_inactive(vp);
 		return;
 	}
 	vdropl(vp);
 }
 
 /*
  * Identifies which release routine (vrele, vput or vunref) is calling
  * vput_final(); the vnode lock handling there depends on it.
  */
 enum vput_op { VRELE, VPUT, VUNREF };
 
 /*
  * Handle ->v_usecount transitioning to 0.
  *
  * By releasing the last usecount we take ownership of the hold count which
  * provides liveness of the vnode, meaning we have to vdrop.
  *
  * For all vnodes we may need to perform inactive processing. It requires an
  * exclusive lock on the vnode, while it is legal to call here with only a
  * shared lock (or no locks). If locking the vnode in an expected manner fails,
  * inactive processing gets deferred to the syncer.
  *
  * XXX Some filesystems pass in an exclusively locked vnode and strongly depend
  * on the lock being held all the way until VOP_INACTIVE. This in particular
  * happens with UFS which adds half-constructed vnodes to the hash, where they
  * can be found by other code.
  */
 static void
 vput_final(struct vnode *vp, enum vput_op func)
 {
 	int error;
 	bool want_unlock;
 
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	VNPASS(vp->v_holdcnt > 0, vp);
 
 	VI_LOCK(vp);
 
 	/*
 	 * By the time we got here someone else might have transitioned
 	 * the count back to > 0.
 	 */
 	if (vp->v_usecount > 0)
 		goto out;
 
 	/*
 	 * If the vnode is doomed vgone already performed inactive processing
 	 * (if needed).
 	 */
 	if (VN_IS_DOOMED(vp))
 		goto out;
 
 	if (__predict_true(VOP_NEED_INACTIVE(vp) == 0))
 		goto out;
 
 	/* Inactive processing is already in progress, do not recurse. */
 	if (vp->v_iflag & VI_DOINGINACT)
 		goto out;
 
 	/*
 	 * Locking operations here will drop the interlock and possibly the
 	 * vnode lock, opening a window where the vnode can get doomed all the
 	 * while ->v_usecount is 0. Set VI_OWEINACT to let vgone know to
 	 * perform inactive.
 	 */
 	vp->v_iflag |= VI_OWEINACT;
 	/*
 	 * want_unlock records whether the vnode lock has to be released once
 	 * inactive processing is done; a non-zero error means the exclusive
 	 * lock could not be obtained and processing must be deferred.
 	 */
 	want_unlock = false;
 	error = 0;
 	switch (func) {
 	case VRELE:
 		switch (VOP_ISLOCKED(vp)) {
 		case LK_EXCLUSIVE:
 			break;
 		case LK_EXCLOTHER:
 		case 0:
 			want_unlock = true;
 			error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
 			VI_LOCK(vp);
 			break;
 		default:
 			/*
 			 * The lock has at least one sharer, but we have no way
 			 * to conclude whether this is us. Play it safe and
 			 * defer processing.
 			 */
 			error = EAGAIN;
 			break;
 		}
 		break;
 	case VPUT:
 		want_unlock = true;
 		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
 			error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
 			    LK_NOWAIT);
 			VI_LOCK(vp);
 		}
 		break;
 	case VUNREF:
 		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
 			error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK);
 			VI_LOCK(vp);
 		}
 		break;
 	}
 	if (error == 0) {
 		if (func == VUNREF) {
 			VNASSERT((vp->v_vflag & VV_UNREF) == 0, vp,
 			    ("recursive vunref"));
 			vp->v_vflag |= VV_UNREF;
 		}
 		/*
 		 * Repeat inactive processing for as long as it asks for it
 		 * with ERELOOKUP, but only when we are the ones who get to
 		 * drop and retake the vnode lock between the attempts.
 		 */
 		for (;;) {
 			error = vinactive(vp);
 			if (want_unlock)
 				VOP_UNLOCK(vp);
 			if (error != ERELOOKUP || !want_unlock)
 				break;
 			VOP_LOCK(vp, LK_EXCLUSIVE);
 		}
 		if (func == VUNREF)
 			vp->v_vflag &= ~VV_UNREF;
 		vdropl(vp);
 	} else {
 		/* Could not get the exclusive lock, defer the processing. */
 		vdefer_inactive(vp);
 	}
 	return;
 out:
 	/*
 	 * No inactive processing needed.  vput() callers expect the vnode to
 	 * be unlocked on return; everyone has the hold count dropped.
 	 */
 	if (func == VPUT)
 		VOP_UNLOCK(vp);
 	vdropl(vp);
 }
 
 /*
  * Decrement ->v_usecount for a vnode.
  *
  * Releasing the last use count requires additional processing, see vput_final
  * above for details.
  *
  * Comment above each variant denotes lock state on entry and exit.
  */
 
 /*
  * Release a use count on a vnode.
  *
  * in: any
  * out: same as passed in
  */
 void
 vrele(struct vnode *vp)
 {
 
 	ASSERT_VI_UNLOCKED(vp, __func__);
 	/* Only the release of the last use count needs further work. */
 	if (refcount_release(&vp->v_usecount))
 		vput_final(vp, VRELE);
 }
 
 /*
  * Release a use count on a locked vnode and unlock it.
  *
  * in: locked
  * out: unlocked
  */
 void
 vput(struct vnode *vp)
 {
 
 	ASSERT_VOP_LOCKED(vp, __func__);
 	ASSERT_VI_UNLOCKED(vp, __func__);
 	if (refcount_release(&vp->v_usecount)) {
 		/* Last use count is gone, vput_final takes care of the lock. */
 		vput_final(vp, VPUT);
 		return;
 	}
 	VOP_UNLOCK(vp);
 }
 
 /*
  * Release a use count on a locked vnode, leaving it locked.
  *
  * in: locked
  * out: locked
  */
 void
 vunref(struct vnode *vp)
 {
 
 	ASSERT_VOP_LOCKED(vp, __func__);
 	ASSERT_VI_UNLOCKED(vp, __func__);
 	/* Only the release of the last use count needs further work. */
 	if (refcount_release(&vp->v_usecount))
 		vput_final(vp, VUNREF);
 }
 
 /*
  * Acquire a hold count on a vnode.
  *
  * The previous count must be non-negative and carry no VHOLD_* flags
  * (asserted).  A 0->1 transition is reported via vfs_freevnodes_dec().
  */
 void
 vhold(struct vnode *vp)
 {
 	int old;
 
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
 	VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp,
 	    ("%s: wrong hold count %d", __func__, old));
 	if (old == 0)
 		vfs_freevnodes_dec();
 }
 
 /*
  * Acquire a hold count on a vnode which is already held.
  *
  * Since the count is expected to be non-zero no 0->1 accounting is done.
  * With INVARIANTS the previous value is checked to be positive and free of
  * VHOLD_* flags.
  */
 void
 vholdnz(struct vnode *vp)
 {
 
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 #ifdef INVARIANTS
 	int old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
 	VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp,
 	    ("%s: wrong hold count %d", __func__, old));
 #else
 	atomic_add_int(&vp->v_holdcnt, 1);
 #endif
 }
 
 /*
  * Grab a hold count unless the vnode is freed.
  *
  * Only use this routine if vfs smr is the only protection you have against
  * freeing the vnode.
  *
  * The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag
  * is not set.  After the flag is set the vnode becomes immutable to anyone but
  * the thread which managed to set the flag.
  *
  * It may be tempting to replace the loop with:
  * count = atomic_fetchadd_int(&vp->v_holdcnt, 1);
  * if (count & VHOLD_NO_SMR) {
  *     backpedal and error out;
  * }
  *
  * However, while this is more performant, it hinders debugging by eliminating
  * the previously mentioned invariant.
  */
 bool
 vhold_smr(struct vnode *vp)
 {
 	int count;
 
 	VFS_SMR_ASSERT_ENTERED();
 
 	count = atomic_load_int(&vp->v_holdcnt);
 	for (;;) {
 		/* The vnode is being freed, it can no longer be held. */
 		if (count & VHOLD_NO_SMR) {
 			VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp,
 			    ("non-zero hold count with flags %d\n", count));
 			return (false);
 		}
 		VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count));
 		/*
 		 * Try to install count + 1.  The loop does not reload 'count'
 		 * by hand, so it relies on atomic_fcmpset_int() refreshing it
 		 * on failure (see atomic(9)).
 		 */
 		if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) {
 			/* 0->1 transition, same accounting as in vhold(). */
 			if (count == 0)
 				vfs_freevnodes_dec();
 			return (true);
 		}
 	}
 }
 
 /*
  * Hold a free vnode for recycling.
  *
  * Note: vnode_init references this comment.
  *
  * Attempts to recycle only need the global vnode list lock and have no use for
  * SMR.
  *
  * However, vnodes get inserted into the global list before they get fully
  * initialized and stay there until UMA decides to free the memory. This in
  * particular means the target can be found before it becomes usable and after
  * it becomes recycled. Picking up such vnodes is guarded with v_holdcnt set to
  * VHOLD_NO_SMR.
  *
  * Note: the vnode may gain more references after we transition the count 0->1.
  */
 static bool
 vhold_recycle_free(struct vnode *vp)
 {
 	int count;
 
 	mtx_assert(&vnode_list_mtx, MA_OWNED);
 
 	count = atomic_load_int(&vp->v_holdcnt);
 	for (;;) {
 		/* Not usable yet or already being freed, leave it alone. */
 		if (count & VHOLD_NO_SMR) {
 			VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp,
 			    ("non-zero hold count with flags %d\n", count));
 			return (false);
 		}
 		VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count));
 		/* Somebody holds the vnode, so it is not free. */
 		if (count > 0) {
 			return (false);
 		}
 		/*
 		 * count is 0 here, so this is the 0->1 transition.  On failure
 		 * the loop relies on atomic_fcmpset_int() refreshing 'count'
 		 * (see atomic(9)).
 		 */
 		if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) {
 			vfs_freevnodes_dec();
 			return (true);
 		}
 	}
 }
 
 /*
  * Flush a full batch of deferred vnodes.
  *
  * The batched freevnodes delta is folded into the global counter and every
  * vnode in the batch is moved to the tail of the global vnode list, after
  * which the batch is reset to empty.
  *
  * Called with the batch lock held, the thread pinned and the batch full
  * (all asserted).
  */
 static void __noinline
 vdbatch_process(struct vdbatch *vd)
 {
 	struct vnode *vp;
 	int i;
 
 	mtx_assert(&vd->lock, MA_OWNED);
 	MPASS(curthread->td_pinned > 0);
 	MPASS(vd->index == VDBATCH_SIZE);
 
 	mtx_lock(&vnode_list_mtx);
 	critical_enter();
 	freevnodes += vd->freevnodes;
 	for (i = 0; i < VDBATCH_SIZE; i++) {
 		vp = vd->tab[i];
 		/* Requeue at the tail and mark the vnode as no longer batched. */
 		TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
 		TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist);
 		MPASS(vp->v_dbatchcpu != NOCPU);
 		vp->v_dbatchcpu = NOCPU;
 	}
 	mtx_unlock(&vnode_list_mtx);
 	vd->freevnodes = 0;
 	bzero(vd->tab, sizeof(vd->tab));
 	vd->index = 0;
 	critical_exit();
 }
 
 /*
  * Add a vnode to the current CPU's batch of vnodes awaiting requeue on the
  * global vnode list, processing the batch once it fills up.
  *
  * Called with the vnode interlock held; the interlock is released before
  * returning.  A vnode which already sits in some batch is left where it is.
  */
 static void
 vdbatch_enqueue(struct vnode *vp)
 {
 	struct vdbatch *vd;
 
 	ASSERT_VI_LOCKED(vp, __func__);
 	VNASSERT(!VN_IS_DOOMED(vp), vp,
 	    ("%s: deferring requeue of a doomed vnode", __func__));
 
 	/* Already batched, nothing more to do. */
 	if (vp->v_dbatchcpu != NOCPU) {
 		VI_UNLOCK(vp);
 		return;
 	}
 
 	sched_pin();
 	vd = DPCPU_PTR(vd);
 	mtx_lock(&vd->lock);
 	MPASS(vd->index < VDBATCH_SIZE);
 	MPASS(vd->tab[vd->index] == NULL);
 	/*
 	 * A hack: we depend on being pinned so that we know what to put in
 	 * ->v_dbatchcpu.
 	 */
 	vp->v_dbatchcpu = curcpu;
 	vd->tab[vd->index] = vp;
 	vd->index++;
 	VI_UNLOCK(vp);
 	if (vd->index == VDBATCH_SIZE)
 		vdbatch_process(vd);
 	mtx_unlock(&vd->lock);
 	sched_unpin();
 }
 
 /*
  * This routine must only be called for vnodes which are about to be
  * deallocated. Supporting dequeue for arbitrary vnodes would require
  * validating that the locked batch matches.
  *
  * If the vnode is recorded as sitting in some CPU's batch, that batch is
  * locked and searched; a matching slot is filled with the last entry so the
  * table stays dense.
  */
 static void
 vdbatch_dequeue(struct vnode *vp)
 {
 	struct vdbatch *vd;
 	int i;
 	short cpu;
 
 	VNASSERT(vp->v_type == VBAD || vp->v_type == VNON, vp,
 	    ("%s: called for a used vnode\n", __func__));
 
 	cpu = vp->v_dbatchcpu;
 	if (cpu == NOCPU)
 		return;
 
 	vd = DPCPU_ID_PTR(cpu, vd);
 	mtx_lock(&vd->lock);
 	for (i = 0; i < vd->index; i++) {
 		if (vd->tab[i] != vp)
 			continue;
 		vp->v_dbatchcpu = NOCPU;
 		/* Move the last entry into the vacated slot. */
 		vd->index--;
 		vd->tab[i] = vd->tab[vd->index];
 		vd->tab[vd->index] = NULL;
 		break;
 	}
 	mtx_unlock(&vd->lock);
 	/*
 	 * Either we dequeued the vnode above or the target CPU beat us to it.
 	 */
 	MPASS(vp->v_dbatchcpu == NOCPU);
 }
 
 /*
  * Drop the hold count of the vnode.  If this is the last reference to
  * the vnode we place it on the free list unless it has been vgone'd
  * (marked VIRF_DOOMED) in which case we will free it.
  *
  * Because the vnode vm object keeps a hold reference on the vnode if
  * there is at least one resident non-cached page, the vnode cannot
  * leave the active list without the page cleanup done.
  */
 /*
  * Last step of dropping the final hold count of a doomed vnode: mark the
  * hold count with VHOLD_NO_SMR and hand the vnode to freevnode().
  *
  * Called with the interlock held, with the hold count already released by
  * the caller.
  */
 static void __noinline
 vdropl_final(struct vnode *vp)
 {
 
 	ASSERT_VI_LOCKED(vp, __func__);
 	VNPASS(VN_IS_DOOMED(vp), vp);
 	/*
 	 * Set the VHOLD_NO_SMR flag.
 	 *
 	 * We may be racing against vhold_smr. If they win we can just pretend
 	 * we never got this far, they will vdrop later.
 	 */
 	if (__predict_false(!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR))) {
 		vfs_freevnodes_inc();
 		VI_UNLOCK(vp);
 		/*
 		 * We lost the aforementioned race. Any subsequent access is
 		 * invalid as they might have managed to vdropl on their own.
 		 */
 		return;
 	}
 	/*
 	 * Don't bump freevnodes as this one is going away.
 	 */
 	freevnode(vp);
 }
 
 /*
  * Release a hold count on a vnode; the interlock must not be held.
  *
  * Any release but the last one is done locklessly.  The last one is handed
  * over to vdropl() with the interlock taken.
  */
 void
 vdrop(struct vnode *vp)
 {
 
 	ASSERT_VI_UNLOCKED(vp, __func__);
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	if (!refcount_release_if_not_last(&vp->v_holdcnt)) {
 		VI_LOCK(vp);
 		vdropl(vp);
 	}
 }
 
 /*
  * Common implementation of vdropl() and vdropl_recycle().
  *
  * Releases a hold count with the interlock held; the interlock is given up
  * on every path.  When the last hold count goes away a doomed vnode is passed
  * to vdropl_final(), while any other vnode is counted as free, taken off the
  * lazy list if it is on it, and, if 'enqueue' is true, queued for requeue on
  * the global vnode list.
  */
 static void __always_inline
 vdropl_impl(struct vnode *vp, bool enqueue)
 {
 
 	ASSERT_VI_LOCKED(vp, __func__);
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	/* Not the last hold count, nothing else to do. */
 	if (!refcount_release(&vp->v_holdcnt)) {
 		VI_UNLOCK(vp);
 		return;
 	}
 	VNPASS((vp->v_iflag & VI_OWEINACT) == 0, vp);
 	VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp);
 	if (VN_IS_DOOMED(vp)) {
 		vdropl_final(vp);
 		return;
 	}
 
 	vfs_freevnodes_inc();
 	if (vp->v_mflag & VMP_LAZYLIST) {
 		vunlazy(vp);
 	}
 
 	if (!enqueue) {
 		VI_UNLOCK(vp);
 		return;
 	}
 
 	/*
 	 * Also unlocks the interlock. We can't assert on it as we
 	 * released our hold and by now the vnode might have been
 	 * freed.
 	 */
 	vdbatch_enqueue(vp);
 }
 
 /*
  * Release a hold count on a vnode with the interlock held.  This is
  * vdropl_impl() with requeueing on the global vnode list enabled.
  */
 void
 vdropl(struct vnode *vp)
 {
 
 	vdropl_impl(vp, true);
 }
 
 /*
  * vdrop a vnode when recycling
  *
  * This is a special case routine only to be used when recycling, differs from
  * regular vdrop by not requeueing the vnode on LRU.
  *
  * Consider a case where vtryrecycle continuously fails with all vnodes (due to
  * e.g., frozen writes on the filesystem), filling the batch and causing it to
  * be requeued. Then vnlru will end up revisiting the same vnodes. This is a
  * loop which can last for as long as writes are frozen.
  *
  * Like vdropl(), this variant expects the interlock to be held.
  */
 static void
 vdropl_recycle(struct vnode *vp)
 {
 
 	vdropl_impl(vp, false);
 }
 
 /*
  * Same as vdropl_recycle(), for callers which do not hold the interlock.
  */
 static void
 vdrop_recycle(struct vnode *vp)
 {
 
 	VI_LOCK(vp);
 	vdropl_recycle(vp);
 }
 
 /*
  * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
  * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
  */
 /*
  * Unconditional worker for vinactive().
  *
  * Called with the vnode exclusively locked and the interlock held.  The
  * interlock is dropped around the page cleaning and the VOP_INACTIVE() call
  * and is held again on return.  Returns the result of VOP_INACTIVE().
  */
 static int
 vinactivef(struct vnode *vp)
 {
 	struct vm_object *obj;
 	int error;
 
 	ASSERT_VOP_ELOCKED(vp, "vinactive");
 	ASSERT_VI_LOCKED(vp, "vinactive");
 	VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
 	    ("vinactive: recursed on VI_DOINGINACT"));
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	vp->v_iflag |= VI_DOINGINACT;
 	vp->v_iflag &= ~VI_OWEINACT;
 	VI_UNLOCK(vp);
 	/*
 	 * Before moving off the active list, we must be sure that any
 	 * modified pages are converted into the vnode's dirty
 	 * buffers, since these will no longer be checked once the
 	 * vnode is on the inactive list.
 	 *
 	 * The write-out of the dirty pages is asynchronous.  At the
 	 * point that VOP_INACTIVE() is called, there could still be
 	 * pending I/O and dirty pages in the object.
 	 */
 	if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 &&
 	    vm_object_mightbedirty(obj)) {
 		VM_OBJECT_WLOCK(obj);
 		vm_object_page_clean(obj, 0, 0, 0);
 		VM_OBJECT_WUNLOCK(obj);
 	}
 	error = VOP_INACTIVE(vp);
 	/* Retake the interlock to clear the in-progress marker. */
 	VI_LOCK(vp);
 	VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
 	    ("vinactive: lost VI_DOINGINACT"));
 	vp->v_iflag &= ~VI_DOINGINACT;
 	return (error);
 }
 
 /*
  * Perform inactive processing on a vnode if it is owed and not already in
  * progress.
  *
  * The vnode must be exclusively locked and interlocked.  Returns 0 if
  * nothing had to be done, otherwise the result of vinactivef().
  */
 int
 vinactive(struct vnode *vp)
 {
 
 	ASSERT_VOP_ELOCKED(vp, "vinactive");
 	ASSERT_VI_LOCKED(vp, "vinactive");
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 
 	/* Nothing owed, or somebody is already at it. */
 	if ((vp->v_iflag & VI_OWEINACT) == 0 ||
 	    (vp->v_iflag & VI_DOINGINACT) != 0)
 		return (0);
 
 	/* The vnode got used again, so the debt is void. */
 	if (vp->v_usecount > 0) {
 		vp->v_iflag &= ~VI_OWEINACT;
 		return (0);
 	}
 
 	return (vinactivef(vp));
 }
 
 /*
  * Remove any vnodes in the vnode table belonging to mount point mp.
  *
  * If FORCECLOSE is not specified, there should not be any active ones,
  * return error if any are found (nb: this is a user error, not a
  * system error). If FORCECLOSE is specified, detach any active vnodes
  * that are found.
  *
  * If WRITECLOSE is set, only flush out regular file vnodes open for
  * writing.
  *
  * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
  *
  * `rootrefs' specifies the base reference count for the root vnode
  * of this filesystem. The root vnode is considered busy if its
  * v_usecount exceeds this value. On a successful return, vflush()
  * will call vrele() on the root vnode exactly rootrefs times.
  * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
  * be zero.
  */
 #ifdef DIAGNOSTIC
 /*
  * Debugging knob exported as the debug.busyprt sysctl: when non-zero,
  * vflush() prints every vnode it finds busy.
  */
 static int busyprt = 0;		/* print out busy vnodes */
 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
 #endif
 
 int
 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
 {
 	struct vnode *vp, *mvp, *rootvp = NULL;
 	struct vattr vattr;
 	int busy = 0, error;
 
 	CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
 	    rootrefs, flags);
 	if (rootrefs > 0) {
 		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
 		    ("vflush: bad args"));
 		/*
 		 * Get the filesystem root vnode. We can vput() it
 		 * immediately, since with rootrefs > 0, it won't go away.
 		 */
 		if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
 			CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
 			    __func__, error);
 			return (error);
 		}
 		vput(rootvp);
 	}
 loop:
 	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 		/*
 		 * Hold the vnode across the lock acquisition; if the lock
 		 * cannot be obtained the whole scan is restarted.
 		 */
 		vholdl(vp);
 		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
 		if (error) {
 			vdrop(vp);
 			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 			goto loop;
 		}
 		/*
 		 * Skip over vnodes marked VV_SYSTEM.
 		 */
 		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
 			VOP_UNLOCK(vp);
 			vdrop(vp);
 			continue;
 		}
 		/*
 		 * If WRITECLOSE is set, flush out unlinked but still open
 		 * files (even if open only for reading) and regular file
 		 * vnodes open for writing.
 		 */
 		if (flags & WRITECLOSE) {
 			if (vp->v_object != NULL) {
 				VM_OBJECT_WLOCK(vp->v_object);
 				vm_object_page_clean(vp->v_object, 0, 0, 0);
 				VM_OBJECT_WUNLOCK(vp->v_object);
 			}
 			do {
 				error = VOP_FSYNC(vp, MNT_WAIT, td);
 			} while (error == ERELOOKUP);
 			if (error != 0) {
 				VOP_UNLOCK(vp);
 				vdrop(vp);
 				MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 				return (error);
 			}
 			error = VOP_GETATTR(vp, &vattr, td->td_ucred);
 			VI_LOCK(vp);
 
 			/*
 			 * Leave the vnode alone if it is VNON or still linked
 			 * and, in addition, is not a regular file open for
 			 * writing.
 			 */
 			if ((vp->v_type == VNON ||
 			    (error == 0 && vattr.va_nlink > 0)) &&
 			    (vp->v_writecount <= 0 || vp->v_type != VREG)) {
 				VOP_UNLOCK(vp);
 				vdropl(vp);
 				continue;
 			}
 		} else
 			VI_LOCK(vp);
 		/*
 		 * With v_usecount == 0, all we need to do is clear out the
 		 * vnode data structures and we are done.
 		 *
 		 * If FORCECLOSE is set, forcibly close the vnode.
 		 */
 		if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
 			vgonel(vp);
 		} else {
 			busy++;
 #ifdef DIAGNOSTIC
 			if (busyprt)
 				vn_printf(vp, "vflush: busy vnode ");
 #endif
 		}
 		VOP_UNLOCK(vp);
 		vdropl(vp);
 	}
 	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
 		/*
 		 * If just the root vnode is busy, and if its refcount
 		 * is equal to `rootrefs', then go ahead and kill it.
 		 */
 		VI_LOCK(rootvp);
 		KASSERT(busy > 0, ("vflush: not busy"));
 		VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
 		    ("vflush: usecount %d < rootrefs %d",
 		     rootvp->v_usecount, rootrefs));
 		if (busy == 1 && rootvp->v_usecount == rootrefs) {
 			VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
 			vgone(rootvp);
 			VOP_UNLOCK(rootvp);
 			busy = 0;
 		} else
 			VI_UNLOCK(rootvp);
 	}
 	if (busy) {
 		CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
 		    busy);
 		return (EBUSY);
 	}
 	/* Success: release the root vnode references on behalf of the caller. */
 	for (; rootrefs > 0; rootrefs--)
 		vrele(rootvp);
 	return (0);
 }
 
 /*
  * Recycle an unused vnode to the front of the free list.
  */
 /*
  * Interlock-taking wrapper around vrecyclel().  Returns non-zero if the
  * vnode was recycled and 0 if it was left alone.
  */
 int
 vrecycle(struct vnode *vp)
 {
 	int recycled;
 
 	VI_LOCK(vp);
 	recycled = vrecyclel(vp);
 	VI_UNLOCK(vp);
 	return (recycled);
 }
 
 /*
  * vrecycle, with the vp interlock held.
  */
 /*
  * The vnode must be exclusively locked and interlocked.  A vnode with no use
  * count is passed to vgonel() and 1 is returned; a vnode which is in use is
  * left untouched and 0 is returned.
  */
 int
 vrecyclel(struct vnode *vp)
 {
 
 	ASSERT_VOP_ELOCKED(vp, __func__);
 	ASSERT_VI_LOCKED(vp, __func__);
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 
 	if (vp->v_usecount != 0)
 		return (0);
 
 	vgonel(vp);
 	return (1);
 }
 
 /*
  * Eliminate all activity associated with a vnode
  * in preparation for reuse.
  *
  * This is vgonel() for callers which do not hold the vnode interlock: the
  * interlock is taken around the call and released afterwards.
  */
 void
 vgone(struct vnode *vp)
 {
 	VI_LOCK(vp);
 	vgonel(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Notify upper mounts about reclaimed or unlinked vnode.
  *
  * Every entry on the mnt_notify list of the vnode's mount point gets the
  * callback selected by 'event'.  Nothing is done if the vnode has no mount
  * point or the list is empty.
  */
 void
 vfs_notify_upper(struct vnode *vp, enum vfs_notify_upper_type event)
 {
 	struct mount *mp;
 	struct mount_upper_node *ump;
 
 	mp = atomic_load_ptr(&vp->v_mount);
 	if (mp == NULL)
 		return;
 	/* Unlocked check, the common case of nothing to notify. */
 	if (TAILQ_EMPTY(&mp->mnt_notify))
 		return;
 
 	MNT_ILOCK(mp);
 	/*
 	 * The mount interlock is dropped around each callback below.
 	 * NOTE(review): mnt_upper_pending presumably keeps list entries from
 	 * being removed meanwhile -- confirm against the code which sets
 	 * MNTK_UPPER_WAITER.
 	 */
 	mp->mnt_upper_pending++;
 	KASSERT(mp->mnt_upper_pending > 0,
 	    ("%s: mnt_upper_pending %d", __func__, mp->mnt_upper_pending));
 	TAILQ_FOREACH(ump, &mp->mnt_notify, mnt_upper_link) {
 		MNT_IUNLOCK(mp);
 		switch (event) {
 		case VFS_NOTIFY_UPPER_RECLAIM:
 			VFS_RECLAIM_LOWERVP(ump->mp, vp);
 			break;
 		case VFS_NOTIFY_UPPER_UNLINK:
 			VFS_UNLINK_LOWERVP(ump->mp, vp);
 			break;
 		}
 		MNT_ILOCK(mp);
 	}
 	mp->mnt_upper_pending--;
 	/* Last notifier out wakes up whoever waits on mnt_uppers. */
 	if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 &&
 	    mp->mnt_upper_pending == 0) {
 		mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER;
 		wakeup(&mp->mnt_uppers);
 	}
 	MNT_IUNLOCK(mp);
 }
 
 /*
  * vgone, with the vp interlock held.
  *
  * The vnode must be exclusively locked, interlocked and held.  The interlock
  * is dropped while the vnode is being torn down and is held again on return.
  * Calling this on an already doomed vnode is a no-op.
  */
 static void
 vgonel(struct vnode *vp)
 {
 	struct thread *td;
 	struct mount *mp;
 	vm_object_t object;
 	bool active, doinginact, oweinact;
 
 	ASSERT_VOP_ELOCKED(vp, "vgonel");
 	ASSERT_VI_LOCKED(vp, "vgonel");
 	VNASSERT(vp->v_holdcnt, vp,
 	    ("vgonel: vp %p has no reference.", vp));
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	td = curthread;
 
 	/*
 	 * Don't vgonel if we're already doomed.
 	 */
 	if (VN_IS_DOOMED(vp)) {
 		VNPASS(vn_get_state(vp) == VSTATE_DESTROYING || \
 		    vn_get_state(vp) == VSTATE_DEAD, vp);
 		return;
 	}
 	/*
 	 * Paired with freevnode.
 	 */
 	vn_seqc_write_begin_locked(vp);
 	vunlazy_gone(vp);
 	vn_irflag_set_locked(vp, VIRF_DOOMED);
 	vn_set_state(vp, VSTATE_DESTROYING);
 
 	/*
 	 * Check to see if the vnode is in use.  If so, we have to
 	 * call VOP_CLOSE() and VOP_INACTIVE().
 	 *
 	 * It could be that VOP_INACTIVE() requested reclamation, in
 	 * which case we should avoid recursion, so check
 	 * VI_DOINGINACT.  This is not precise but good enough.
 	 */
 	active = vp->v_usecount > 0;
 	oweinact = (vp->v_iflag & VI_OWEINACT) != 0;
 	doinginact = (vp->v_iflag & VI_DOINGINACT) != 0;
 
 	/*
 	 * If we need to do inactive VI_OWEINACT will be set.
 	 */
 	if (vp->v_iflag & VI_DEFINACT) {
 		/*
 		 * Cancel the deferral and release the hold count which went
 		 * along with it; vdropl() also gives up the interlock.
 		 */
 		VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count"));
 		vp->v_iflag &= ~VI_DEFINACT;
 		vdropl(vp);
 	} else {
 		VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count"));
 		VI_UNLOCK(vp);
 	}
 	cache_purge_vgone(vp);
 	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
 
 	/*
 	 * If purging an active vnode, it must be closed and
 	 * deactivated before being reclaimed.
 	 */
 	if (active)
 		VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
 	if (!doinginact) {
 		/*
 		 * Repeat for as long as VI_OWEINACT is found set again after
 		 * the inactive processing.
 		 */
 		do {
 			if (oweinact || active) {
 				VI_LOCK(vp);
 				vinactivef(vp);
 				oweinact = (vp->v_iflag & VI_OWEINACT) != 0;
 				VI_UNLOCK(vp);
 			}
 		} while (oweinact);
 	}
 	if (vp->v_type == VSOCK)
 		vfs_unp_reclaim(vp);
 
 	/*
 	 * Clean out any buffers associated with the vnode.
 	 * If the flush fails, just toss the buffers.
 	 */
 	mp = NULL;
 	if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
 		(void) vn_start_secondary_write(vp, &mp, V_WAIT);
 	if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) {
 		while (vinvalbuf(vp, 0, 0, 0) != 0)
 			;
 	}
 
 	BO_LOCK(&vp->v_bufobj);
 	KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) &&
 	    vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
 	    TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) &&
 	    vp->v_bufobj.bo_clean.bv_cnt == 0,
 	    ("vp %p bufobj not invalidated", vp));
 
 	/*
 	 * For VMIO bufobj, BO_DEAD is set later, or in
 	 * vm_object_terminate() after the object's page queue is
 	 * flushed.
 	 */
 	object = vp->v_bufobj.bo_object;
 	if (object == NULL)
 		vp->v_bufobj.bo_flag |= BO_DEAD;
 	BO_UNLOCK(&vp->v_bufobj);
 
 	/*
 	 * Handle the VM part.  Tmpfs handles v_object on its own (the
 	 * OBJT_VNODE check).  Nullfs or other bypassing filesystems
 	 * should not touch the object borrowed from the lower vnode
 	 * (the handle check).
 	 */
 	if (object != NULL && object->type == OBJT_VNODE &&
 	    object->handle == vp)
 		vnode_destroy_vobject(vp);
 
 	/*
 	 * Reclaim the vnode.
 	 */
 	if (VOP_RECLAIM(vp))
 		panic("vgone: cannot reclaim");
 	/* mp is only set if a secondary write was started above. */
 	if (mp != NULL)
 		vn_finished_secondary_write(mp);
 	VNASSERT(vp->v_object == NULL, vp,
 	    ("vop_reclaim left v_object vp=%p", vp));
 	/*
 	 * Clear the advisory locks and wake up waiting threads.
 	 */
 	if (vp->v_lockf != NULL) {
 		(void)VOP_ADVLOCKPURGE(vp);
 		vp->v_lockf = NULL;
 	}
 	/*
 	 * Delete from old mount point vnode list.
 	 */
 	if (vp->v_mount == NULL) {
 		VI_LOCK(vp);
 	} else {
 		/* delmntque() is expected to return with the interlock held. */
 		delmntque(vp);
 		ASSERT_VI_LOCKED(vp, "vgonel 2");
 	}
 	/*
 	 * Done with purge, reset to the standard lock and invalidate
 	 * the vnode.
 	 */
 	vp->v_vnlock = &vp->v_lock;
 	vp->v_op = &dead_vnodeops;
 	vp->v_type = VBAD;
 	vn_set_state(vp, VSTATE_DEAD);
 }
 
 /*
  * Print out a description of a vnode.
  *
  * The tables below give vn_printf() the printable names of the vnode types
  * and vnode states.  The static asserts fail the build if a type, a state
  * or a hold count flag is added without the tables (or vn_printf()) having
  * been adjusted.
  */
 static const char *const vtypename[] = {
 	[VNON] = "VNON",
 	[VREG] = "VREG",
 	[VDIR] = "VDIR",
 	[VBLK] = "VBLK",
 	[VCHR] = "VCHR",
 	[VLNK] = "VLNK",
 	[VSOCK] = "VSOCK",
 	[VFIFO] = "VFIFO",
 	[VBAD] = "VBAD",
 	[VMARKER] = "VMARKER",
 };
 _Static_assert(nitems(vtypename) == VLASTTYPE + 1,
     "vnode type name not added to vtypename");
 
 static const char *const vstatename[] = {
 	[VSTATE_UNINITIALIZED] = "VSTATE_UNINITIALIZED",
 	[VSTATE_CONSTRUCTED] = "VSTATE_CONSTRUCTED",
 	[VSTATE_DESTROYING] = "VSTATE_DESTROYING",
 	[VSTATE_DEAD] = "VSTATE_DEAD",
 };
 _Static_assert(nitems(vstatename) == VLASTSTATE + 1,
     "vnode state name not added to vstatename");
 
 /* VHOLD_NO_SMR is the only hold count flag vn_printf() knows how to print. */
 _Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0,
     "new hold count flag not added to vn_printf");
 
 void
 vn_printf(struct vnode *vp, const char *fmt, ...)
 {
 	va_list ap;
 	char buf[256], buf2[16];
 	u_long flags;
 	u_int holdcnt;
 	short irflag;
 
 	va_start(ap, fmt);
 	vprintf(fmt, ap);
 	va_end(ap);
 	printf("%p: ", (void *)vp);
 	printf("type %s state %s\n", vtypename[vp->v_type], vstatename[vp->v_state]);
 	holdcnt = atomic_load_int(&vp->v_holdcnt);
 	printf("    usecount %d, writecount %d, refcount %d seqc users %d",
 	    vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS,
 	    vp->v_seqc_users);
 	switch (vp->v_type) {
 	case VDIR:
 		printf(" mountedhere %p\n", vp->v_mountedhere);
 		break;
 	case VCHR:
 		printf(" rdev %p\n", vp->v_rdev);
 		break;
 	case VSOCK:
 		printf(" socket %p\n", vp->v_unpcb);
 		break;
 	case VFIFO:
 		printf(" fifoinfo %p\n", vp->v_fifoinfo);
 		break;
 	default:
 		printf("\n");
 		break;
 	}
 	buf[0] = '\0';
 	buf[1] = '\0';
 	if (holdcnt & VHOLD_NO_SMR)
 		strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf));
 	printf("    hold count flags (%s)\n", buf + 1);
 
 	buf[0] = '\0';
 	buf[1] = '\0';
 	irflag = vn_irflag_read(vp);
 	if (irflag & VIRF_DOOMED)
 		strlcat(buf, "|VIRF_DOOMED", sizeof(buf));
 	if (irflag & VIRF_PGREAD)
 		strlcat(buf, "|VIRF_PGREAD", sizeof(buf));
 	if (irflag & VIRF_MOUNTPOINT)
 		strlcat(buf, "|VIRF_MOUNTPOINT", sizeof(buf));
 	if (irflag & VIRF_TEXT_REF)
 		strlcat(buf, "|VIRF_TEXT_REF", sizeof(buf));
 	flags = irflag & ~(VIRF_DOOMED | VIRF_PGREAD | VIRF_MOUNTPOINT | VIRF_TEXT_REF);
 	if (flags != 0) {
 		snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags);
 		strlcat(buf, buf2, sizeof(buf));
 	}
 	if (vp->v_vflag & VV_ROOT)
 		strlcat(buf, "|VV_ROOT", sizeof(buf));
 	if (vp->v_vflag & VV_ISTTY)
 		strlcat(buf, "|VV_ISTTY", sizeof(buf));
 	if (vp->v_vflag & VV_NOSYNC)
 		strlcat(buf, "|VV_NOSYNC", sizeof(buf));
 	if (vp->v_vflag & VV_ETERNALDEV)
 		strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
 	if (vp->v_vflag & VV_CACHEDLABEL)
 		strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
 	if (vp->v_vflag & VV_VMSIZEVNLOCK)
 		strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf));
 	if (vp->v_vflag & VV_COPYONWRITE)
 		strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
 	if (vp->v_vflag & VV_SYSTEM)
 		strlcat(buf, "|VV_SYSTEM", sizeof(buf));
 	if (vp->v_vflag & VV_PROCDEP)
 		strlcat(buf, "|VV_PROCDEP", sizeof(buf));
 	if (vp->v_vflag & VV_DELETED)
 		strlcat(buf, "|VV_DELETED", sizeof(buf));
 	if (vp->v_vflag & VV_MD)
 		strlcat(buf, "|VV_MD", sizeof(buf));
 	if (vp->v_vflag & VV_FORCEINSMQ)
 		strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
 	if (vp->v_vflag & VV_READLINK)
 		strlcat(buf, "|VV_READLINK", sizeof(buf));
 	flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
 	    VV_CACHEDLABEL | VV_VMSIZEVNLOCK | VV_COPYONWRITE | VV_SYSTEM |
 	    VV_PROCDEP | VV_DELETED | VV_MD | VV_FORCEINSMQ | VV_READLINK);
 	if (flags != 0) {
 		snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
 		strlcat(buf, buf2, sizeof(buf));
 	}
 	if (vp->v_iflag & VI_MOUNT)
 		strlcat(buf, "|VI_MOUNT", sizeof(buf));
 	if (vp->v_iflag & VI_DOINGINACT)
 		strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
 	if (vp->v_iflag & VI_OWEINACT)
 		strlcat(buf, "|VI_OWEINACT", sizeof(buf));
 	if (vp->v_iflag & VI_DEFINACT)
 		strlcat(buf, "|VI_DEFINACT", sizeof(buf));
 	if (vp->v_iflag & VI_FOPENING)
 		strlcat(buf, "|VI_FOPENING", sizeof(buf));
 	flags = vp->v_iflag & ~(VI_MOUNT | VI_DOINGINACT |
 	    VI_OWEINACT | VI_DEFINACT | VI_FOPENING);
 	if (flags != 0) {
 		snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
 		strlcat(buf, buf2, sizeof(buf));
 	}
 	if (vp->v_mflag & VMP_LAZYLIST)
 		strlcat(buf, "|VMP_LAZYLIST", sizeof(buf));
 	flags = vp->v_mflag & ~(VMP_LAZYLIST);
 	if (flags != 0) {
 		snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags);
 		strlcat(buf, buf2, sizeof(buf));
 	}
 	printf("    flags (%s)", buf + 1);
 	if (mtx_owned(VI_MTX(vp)))
 		printf(" VI_LOCKed");
 	printf("\n");
 	if (vp->v_object != NULL)
 		printf("    v_object %p ref %d pages %d "
 		    "cleanbuf %d dirtybuf %d\n",
 		    vp->v_object, vp->v_object->ref_count,
 		    vp->v_object->resident_page_count,
 		    vp->v_bufobj.bo_clean.bv_cnt,
 		    vp->v_bufobj.bo_dirty.bv_cnt);
 	printf("    ");
 	lockmgr_printinfo(vp->v_vnlock);
 	if (vp->v_data != NULL)
 		VOP_PRINT(vp);
 }
 
 #ifdef DDB
 /*
  * Debugger command: print every vnode, on every mount, whose vnode lock
  * VOP_ISLOCKED() reports as held.  VMARKER entries are skipped.
  */
 DB_SHOW_COMMAND_FLAGS(lockedvnods, lockedvnodes, DB_CMD_MEMSAFE)
 {
 	struct mount *mnt;
 	struct vnode *vn;
 
 	/*
 	 * Running inside DDB means the usual locks protecting these lists
 	 * cannot be taken.  A list may therefore be observed mid-update and
 	 * a bad pointer may be followed; that risk is accepted.
 	 */
 	db_printf("Locked vnodes\n");
 	TAILQ_FOREACH(mnt, &mountlist, mnt_list) {
 		TAILQ_FOREACH(vn, &mnt->mnt_nvnodelist, v_nmntvnodes) {
 			if (vn->v_type == VMARKER)
 				continue;
 			if (!VOP_ISLOCKED(vn))
 				continue;
 			vn_printf(vn, "vnode ");
 		}
 	}
 }
 
 /*
  * Debugger command: dump the vnode located at the address supplied on
  * the command line.  Without an address nothing is printed.
  */
 DB_SHOW_COMMAND(vnode, db_show_vnode)
 {
 
 	if (have_addr)
 		vn_printf((struct vnode *)addr, "vnode ");
 }
 
 /*
  * Show details about the given mount point.
  *
  * Without an address, a one-line summary (pointer, source, mount point,
  * filesystem type) is printed for every entry on mountlist.  With an
  * address, that mount is dumped in detail: decoded mnt_flag and
  * mnt_kern_flag, mount options, statfs contents, credentials, assorted
  * counters, and finally the vnodes on its vnode list split by hold count.
  */
 DB_SHOW_COMMAND(mount, db_show_mount)
 {
 	struct mount *mp;
 	struct vfsopt *opt;
 	struct statfs *sp;
 	struct vnode *vp;
 	char buf[512];
 	uint64_t mflags;
 	u_int flags;
 
 	if (!have_addr) {
 		/* No address given, print short info about all mount points. */
 		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 			db_printf("%p %s on %s (%s)\n", mp,
 			    mp->mnt_stat.f_mntfromname,
 			    mp->mnt_stat.f_mntonname,
 			    mp->mnt_stat.f_fstypename);
 			/* Stop early once the user has quit the pager. */
 			if (db_pager_quit)
 				break;
 		}
 		db_printf("\nMore info: show mount <addr>\n");
 		return;
 	}
 
 	mp = (struct mount *)addr;
 	db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
 	    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
 
 	/*
 	 * Decode mnt_flag into a comma-separated list of names.  For each
 	 * known flag that is set, MNT_FLAG appends the stringified macro
 	 * name minus its first 4 characters (the "MNT_" prefix) and clears
 	 * the bit in mflags, so that whatever remains afterwards is a set
 	 * of unrecognized bits.
 	 */
 	buf[0] = '\0';
 	mflags = mp->mnt_flag;
 #define	MNT_FLAG(flag)	do {						\
 	if (mflags & (flag)) {						\
 		if (buf[0] != '\0')					\
 			strlcat(buf, ", ", sizeof(buf));		\
 		strlcat(buf, (#flag) + 4, sizeof(buf));			\
 		mflags &= ~(flag);					\
 	}								\
 } while (0)
 	MNT_FLAG(MNT_RDONLY);
 	MNT_FLAG(MNT_SYNCHRONOUS);
 	MNT_FLAG(MNT_NOEXEC);
 	MNT_FLAG(MNT_NOSUID);
 	MNT_FLAG(MNT_NFS4ACLS);
 	MNT_FLAG(MNT_UNION);
 	MNT_FLAG(MNT_ASYNC);
 	MNT_FLAG(MNT_SUIDDIR);
 	MNT_FLAG(MNT_SOFTDEP);
 	MNT_FLAG(MNT_NOSYMFOLLOW);
 	MNT_FLAG(MNT_GJOURNAL);
 	MNT_FLAG(MNT_MULTILABEL);
 	MNT_FLAG(MNT_ACLS);
 	MNT_FLAG(MNT_NOATIME);
 	MNT_FLAG(MNT_NOCLUSTERR);
 	MNT_FLAG(MNT_NOCLUSTERW);
 	MNT_FLAG(MNT_SUJ);
 	MNT_FLAG(MNT_EXRDONLY);
 	MNT_FLAG(MNT_EXPORTED);
 	MNT_FLAG(MNT_DEFEXPORTED);
 	MNT_FLAG(MNT_EXPORTANON);
 	MNT_FLAG(MNT_EXKERB);
 	MNT_FLAG(MNT_EXPUBLIC);
 	MNT_FLAG(MNT_LOCAL);
 	MNT_FLAG(MNT_QUOTA);
 	MNT_FLAG(MNT_ROOTFS);
 	MNT_FLAG(MNT_USER);
 	MNT_FLAG(MNT_IGNORE);
 	MNT_FLAG(MNT_UPDATE);
 	MNT_FLAG(MNT_DELEXPORT);
 	MNT_FLAG(MNT_RELOAD);
 	MNT_FLAG(MNT_FORCE);
 	MNT_FLAG(MNT_SNAPSHOT);
 	MNT_FLAG(MNT_BYFSID);
 #undef MNT_FLAG
 	/* Any bits not named above are appended as a raw hex value. */
 	if (mflags != 0) {
 		if (buf[0] != '\0')
 			strlcat(buf, ", ", sizeof(buf));
 		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
 		    "0x%016jx", mflags);
 	}
 	db_printf("    mnt_flag = %s\n", buf);
 
 	/*
 	 * Same decoding for mnt_kern_flag.  MNT_KERN_FLAG skips the first
 	 * 5 characters of the macro name (the "MNTK_" prefix).
 	 */
 	buf[0] = '\0';
 	flags = mp->mnt_kern_flag;
 #define	MNT_KERN_FLAG(flag)	do {					\
 	if (flags & (flag)) {						\
 		if (buf[0] != '\0')					\
 			strlcat(buf, ", ", sizeof(buf));		\
 		strlcat(buf, (#flag) + 5, sizeof(buf));			\
 		flags &= ~(flag);					\
 	}								\
 } while (0)
 	MNT_KERN_FLAG(MNTK_UNMOUNTF);
 	MNT_KERN_FLAG(MNTK_ASYNC);
 	MNT_KERN_FLAG(MNTK_SOFTDEP);
 	MNT_KERN_FLAG(MNTK_NOMSYNC);
 	MNT_KERN_FLAG(MNTK_DRAINING);
 	MNT_KERN_FLAG(MNTK_REFEXPIRE);
 	MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
 	MNT_KERN_FLAG(MNTK_SHARED_WRITES);
 	MNT_KERN_FLAG(MNTK_NO_IOPF);
 	MNT_KERN_FLAG(MNTK_RECURSE);
 	MNT_KERN_FLAG(MNTK_UPPER_WAITER);
 	MNT_KERN_FLAG(MNTK_UNLOCKED_INSMNTQUE);
 	MNT_KERN_FLAG(MNTK_USES_BCACHE);
 	MNT_KERN_FLAG(MNTK_VMSETSIZE_BUG);
 	MNT_KERN_FLAG(MNTK_FPLOOKUP);
 	MNT_KERN_FLAG(MNTK_TASKQUEUE_WAITER);
 	MNT_KERN_FLAG(MNTK_NOASYNC);
 	MNT_KERN_FLAG(MNTK_UNMOUNT);
 	MNT_KERN_FLAG(MNTK_MWAIT);
 	MNT_KERN_FLAG(MNTK_SUSPEND);
 	MNT_KERN_FLAG(MNTK_SUSPEND2);
 	MNT_KERN_FLAG(MNTK_SUSPENDED);
 	MNT_KERN_FLAG(MNTK_NULL_NOCACHE);
 	MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
 #undef MNT_KERN_FLAG
 	/* Leftover unnamed kernel flag bits, in hex. */
 	if (flags != 0) {
 		if (buf[0] != '\0')
 			strlcat(buf, ", ", sizeof(buf));
 		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
 		    "0x%08x", flags);
 	}
 	db_printf("    mnt_kern_flag = %s\n", buf);
 
 	/*
 	 * Mount option names, comma separated.
 	 * NOTE(review): mp->mnt_opt is dereferenced without a NULL check;
 	 * confirm it is always set for mounts inspected here.
 	 */
 	db_printf("    mnt_opt = ");
 	opt = TAILQ_FIRST(mp->mnt_opt);
 	if (opt != NULL) {
 		db_printf("%s", opt->name);
 		opt = TAILQ_NEXT(opt, link);
 		while (opt != NULL) {
 			db_printf(", %s", opt->name);
 			opt = TAILQ_NEXT(opt, link);
 		}
 	}
 	db_printf("\n");
 
 	/*
 	 * Dump the embedded struct statfs.  Each field is cast to a fixed
 	 * type so it matches its conversion specifier.
 	 */
 	sp = &mp->mnt_stat;
 	db_printf("    mnt_stat = { version=%u type=%u flags=0x%016jx "
 	    "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
 	    "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
 	    "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
 	    (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
 	    (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
 	    (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
 	    (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
 	    (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
 	    (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
 	    (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
 	    (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
 
 	/* Credentials of the mount; the jail id is shown only if jailed. */
 	db_printf("    mnt_cred = { uid=%u ruid=%u",
 	    (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
 	if (jailed(mp->mnt_cred))
 		db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
 	db_printf(" }\n");
 	/*
 	 * For the ref, writeopcount and lockref counters both the value
 	 * returned by vfs_mount_fetch_counter() and the raw struct member
 	 * are printed.
 	 */
 	db_printf("    mnt_ref = %d (with %d in the struct)\n",
 	    vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref);
 	db_printf("    mnt_gen = %d\n", mp->mnt_gen);
 	db_printf("    mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
 	db_printf("    mnt_lazyvnodelistsize = %d\n",
 	    mp->mnt_lazyvnodelistsize);
 	db_printf("    mnt_writeopcount = %d (with %d in the struct)\n",
 	    vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount);
 	db_printf("    mnt_iosize_max = %d\n", mp->mnt_iosize_max);
 	db_printf("    mnt_hashseed = %u\n", mp->mnt_hashseed);
 	db_printf("    mnt_lockref = %d (with %d in the struct)\n",
 	    vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref);
 	db_printf("    mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
 	db_printf("    mnt_secondary_accwrites = %d\n",
 	    mp->mnt_secondary_accwrites);
 	db_printf("    mnt_gjprovider = %s\n",
 	    mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
 	db_printf("    mnt_vfs_ops = %d\n", mp->mnt_vfs_ops);
 
 	/* First pass: non-marker vnodes with a positive hold count. */
 	db_printf("\n\nList of active vnodes\n");
 	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 		if (vp->v_type != VMARKER && vp->v_holdcnt > 0) {
 			vn_printf(vp, "vnode ");
 			if (db_pager_quit)
 				break;
 		}
 	}
 	/* Second pass over the same list: vnodes with a zero hold count. */
 	db_printf("\n\nList of inactive vnodes\n");
 	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 		if (vp->v_type != VMARKER && vp->v_holdcnt == 0) {
 			vn_printf(vp, "vnode ");
 			if (db_pager_quit)
 				break;
 		}
 	}
 }
 #endif	/* DDB */
 
 /*
  * Fill in a struct xvfsconf based on a struct vfsconf.
  */
 static int
 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
 {
 	struct xvfsconf xvfsp;
 
 	bzero(&xvfsp, sizeof(xvfsp));
 	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
 	xvfsp.vfc_typenum = vfsp->vfc_typenum;
 	xvfsp.vfc_refcount = vfsp->vfc_refcount;
 	xvfsp.vfc_flags = vfsp->vfc_flags;
 	/*
 	 * These are unused in userland, we keep them
 	 * to not break binary compatibility.
 	 */
 	xvfsp.vfc_vfsops = NULL;
 	xvfsp.vfc_next = NULL;
 	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
 }
 
 #ifdef COMPAT_FREEBSD32
 /*
  * Record written by vfsconf2x32() for requests flagged SCTL_MASK32.
  * The members carry the same names as those filled in by vfsconf2x(),
  * but vfc_vfsops and vfc_next (pointers there) are fixed 32-bit
  * integers here.  vfsconf2x32() never assigns them, so they are
  * exported as zero.
  */
 struct xvfsconf32 {
 	uint32_t	vfc_vfsops;	/* left zero by vfsconf2x32() */
 	char		vfc_name[MFSNAMELEN];
 	int32_t		vfc_typenum;
 	int32_t		vfc_refcount;
 	int32_t		vfc_flags;
 	uint32_t	vfc_next;	/* left zero by vfsconf2x32() */
 };
 
 static int
 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
 {
 	struct xvfsconf32 xvfsp;
 
 	bzero(&xvfsp, sizeof(xvfsp));
 	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
 	xvfsp.vfc_typenum = vfsp->vfc_typenum;
 	xvfsp.vfc_refcount = vfsp->vfc_refcount;
 	xvfsp.vfc_flags = vfsp->vfc_flags;
 	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
 }
 #endif
 
 /*
  * Sysctl handler that exports one record per entry of the vfsconf list.
  * The list is walked under the shared vfsconf lock; the walk stops at the
  * first copy-out error, which is then returned.
  */
 static int
 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
 {
 	struct vfsconf *conf;
 	int rv = 0;
 
 	vfsconf_slock();
 	TAILQ_FOREACH(conf, &vfsconf, vfc_list) {
 #ifdef COMPAT_FREEBSD32
 		/* Requests flagged SCTL_MASK32 get the 32-bit record. */
 		if ((req->flags & SCTL_MASK32) != 0)
 			rv = vfsconf2x32(req, conf);
 		else
 #endif
 			rv = vfsconf2x(req, conf);
 		if (rv != 0)
 			break;
 	}
 	vfsconf_sunlock();
 
 	return (rv);
 }
 
 /* Read-only opaque node "conflist" under _vfs, served by sysctl_vfs_conflist(). */
 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD |
     CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist,
     "S,xvfsconf", "List of all configured filesystems");
 
 #ifndef BURN_BRIDGES
 static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
 
 static int
 vfs_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int *)arg1 - 1;	/* XXX */
 	u_int namelen = arg2 + 1;	/* XXX */
 	struct vfsconf *vfsp;
 
 	log(LOG_WARNING, "userland calling deprecated sysctl, "
 	    "please rebuild world\n");
 
 #if 1 || defined(COMPAT_PRELITE2)
 	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
 	if (namelen == 1)
 		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
 #endif
 
 	switch (name[1]) {
 	case VFS_MAXTYPENUM:
 		if (namelen != 2)
 			return (ENOTDIR);
 		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
 	case VFS_CONF:
 		if (namelen != 3)
 			return (ENOTDIR);	/* overloaded */
 		vfsconf_slock();
 		TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
 			if (vfsp->vfc_typenum == name[2])
 				break;
 		}
 		vfsconf_sunlock();
 		if (vfsp == NULL)
 			return (EOPNOTSUPP);
 #ifdef COMPAT_FREEBSD32
 		if (req->flags & SCTL_MASK32)
 			return (vfsconf2x32(req, vfsp));
 		else
 #endif
 			return (vfsconf2x(req, vfsp));
 	}
 	return (EOPNOTSUPP);
 }
 
 /* Node "generic" under _vfs at the fixed id VFS_GENERIC, handled by vfs_sysctl(). */
 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP |
     CTLFLAG_MPSAFE, vfs_sysctl,
     "Generic filesystem");
 
 #if 1 || defined(COMPAT_PRELITE2)
 
 static int
 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	struct vfsconf *vfsp;
 	struct ovfsconf ovfs;
 
 	vfsconf_slock();
 	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
 		bzero(&ovfs, sizeof(ovfs));
 		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
 		strcpy(ovfs.vfc_name, vfsp->vfc_name);
 		ovfs.vfc_index = vfsp->vfc_typenum;
 		ovfs.vfc_refcount = vfsp->vfc_refcount;
 		ovfs.vfc_flags = vfsp->vfc_flags;
 		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
 		if (error != 0) {
 			vfsconf_sunlock();
 			return (error);
 		}
 	}
 	vfsconf_sunlock();
 	return (0);
 }
 
 #endif /* 1 || COMPAT_PRELITE2 */
 #endif /* !BURN_BRIDGES */
 
-#define KINFO_VNODESLOP		10
-#ifdef notyet
-/*
- * Dump vnode list (via sysctl).
- */
-/* ARGSUSED */
-static int
-sysctl_vnode(SYSCTL_HANDLER_ARGS)
-{
-	struct xvnode *xvn;
-	struct mount *mp;
-	struct vnode *vp;
-	int error, len, n;
-
-	/*
-	 * Stale numvnodes access is not fatal here.
-	 */
-	req->lock = 0;
-	len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
-	if (!req->oldptr)
-		/* Make an estimate */
-		return (SYSCTL_OUT(req, 0, len));
-
-	error = sysctl_wire_old_buffer(req, 0);
-	if (error != 0)
-		return (error);
-	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
-	n = 0;
-	mtx_lock(&mountlist_mtx);
-	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
-		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
-			continue;
-		MNT_ILOCK(mp);
-		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
-			if (n == len)
-				break;
-			vref(vp);
-			xvn[n].xv_size = sizeof *xvn;
-			xvn[n].xv_vnode = vp;
-			xvn[n].xv_id = 0;	/* XXX compat */
-#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
-			XV_COPY(usecount);
-			XV_COPY(writecount);
-			XV_COPY(holdcnt);
-			XV_COPY(mount);
-			XV_COPY(numoutput);
-			XV_COPY(type);
-#undef XV_COPY
-			xvn[n].xv_flag = vp->v_vflag;
-
-			switch (vp->v_type) {
-			case VREG:
-			case VDIR:
-			case VLNK:
-				break;
-			case VBLK:
-			case VCHR:
-				if (vp->v_rdev == NULL) {
-					vrele(vp);
-					continue;
-				}
-				xvn[n].xv_dev = dev2udev(vp->v_rdev);
-				break;
-			case VSOCK:
-				xvn[n].xv_socket = vp->v_socket;
-				break;
-			case VFIFO:
-				xvn[n].xv_fifo = vp->v_fifoinfo;
-				break;
-			case VNON:
-			case VBAD:
-			default:
-				/* shouldn't happen? */
-				vrele(vp);
-				continue;
-			}
-			vrele(vp);
-			++n;
-		}
-		MNT_IUNLOCK(mp);
-		mtx_lock(&mountlist_mtx);
-		vfs_unbusy(mp);
-		if (n == len)
-			break;
-	}
-	mtx_unlock(&mountlist_mtx);
-
-	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
-	free(xvn, M_TEMP);
-	return (error);
-}
-
-SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD |
-    CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode",
-    "");
-#endif
-
 /*
  * Force-unmount mp on behalf of curthread.  A failure is reported on the
  * console, with EBUSY spelled out and any other error shown as a number.
  */
 static void
 unmount_or_warn(struct mount *mp)
 {
 	int rc;
 
 	rc = dounmount(mp, MNT_FORCE, curthread);
 	if (rc == 0)
 		return;
 
 	printf("unmount of %s failed (", mp->mnt_stat.f_mntonname);
 	if (rc != EBUSY)
 		printf("%d)\n", rc);
 	else
 		printf("BUSY)\n");
 }
 
 /*
  * Unmount every filesystem.  The mount list is walked from its tail,
  * i.e. in reverse order of mounting, to avoid dependencies; the mount
  * recorded in rootdevmp is handled last.
  */
 void
 vfs_unmountall(void)
 {
 	struct mount *cur, *prev;
 
 	CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
 
 	/*
 	 * No interlocking is done: this only runs while rebooting.
 	 */
 	TAILQ_FOREACH_REVERSE_SAFE(cur, &mountlist, mntlist, mnt_list, prev) {
 		vfs_ref(cur);
 
 		/*
 		 * rootdevmp is postponed until after the loop.  Forcibly
 		 * unmounting "/dev" before "/" would prevent a clean
 		 * unmount of "/".
 		 */
 		if (cur != rootdevmp)
 			unmount_or_warn(cur);
 	}
 
 	if (rootdevmp != NULL)
 		unmount_or_warn(rootdevmp);
 }
 
 /*
  * Process a vnode whose VI_DEFINACT flag has already been cleared by the
  * caller.
  *
  * Must be entered with the vnode interlock held (asserted).  If
  * VI_OWEINACT is not set, the vnode is simply released with vdropl().
  * Otherwise the vnode lock is requested using lkflags: on success
  * vinactive() is run, the vnode is unlocked and released with vdropl();
  * on failure the vnode is passed to vdefer_inactive_unlocked().
  */
 static void
 vfs_deferred_inactive(struct vnode *vp, int lkflags)
 {
 
 	ASSERT_VI_LOCKED(vp, __func__);
 	VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp, ("VI_DEFINACT still set"));
 	if ((vp->v_iflag & VI_OWEINACT) == 0) {
 		vdropl(vp);
 		return;
 	}
 	/*
 	 * NOTE(review): both callers in this file pass LK_INTERLOCK in
 	 * lkflags, which presumably hands the interlock to vn_lock();
 	 * that would be why it is taken again right after a successful
 	 * lock.  Confirm against vn_lock().
 	 */
 	if (vn_lock(vp, lkflags) == 0) {
 		VI_LOCK(vp);
 		vinactive(vp);
 		VOP_UNLOCK(vp);
 		vdropl(vp);
 		return;
 	}
 	vdefer_inactive_unlocked(vp);
 }
 
 /*
  * Filter passed to MNT_VNODE_FOREACH_LAZY by vfs_periodic_inactive().
  * Returns non-zero for vnodes that have VI_DEFINACT set; arg is unused.
  */
 static int
 vfs_periodic_inactive_filter(struct vnode *vp, void *arg)
 {
 	int deferred;
 
 	deferred = vp->v_iflag & VI_DEFINACT;
 	return (deferred);
 }
 
 /*
  * Walk the vnodes selected by vfs_periodic_inactive_filter() on mp and
  * run deferred inactive processing for each one that still has
  * VI_DEFINACT set.
  *
  * The vnode lock is requested exclusively with LK_INTERLOCK; unless
  * flags equals MNT_WAIT the request is made with LK_NOWAIT.
  */
 static void __noinline
 vfs_periodic_inactive(struct mount *mp, int flags)
 {
 	struct vnode *vp, *mvp;
 	int lkflags;
 
 	lkflags = LK_EXCLUSIVE | LK_INTERLOCK;
 	if (flags != MNT_WAIT)
 		lkflags |= LK_NOWAIT;
 
 	MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) {
 		/*
 		 * Re-check the flag for each vnode the iterator yields and
 		 * release the interlock when there is nothing to do.
 		 */
 		if ((vp->v_iflag & VI_DEFINACT) == 0) {
 			VI_UNLOCK(vp);
 			continue;
 		}
 		/* vfs_deferred_inactive() asserts the flag is already clear. */
 		vp->v_iflag &= ~VI_DEFINACT;
 		vfs_deferred_inactive(vp, lkflags);
 	}
 }
 
 /*
  * Report whether vp has a VM object that might hold dirty pages and is
  * not marked VV_NOSYNC.
  */
 static inline bool
 vfs_want_msync(struct vnode *vp)
 {
 	struct vm_object *object;
 
 	/*
 	 * No locks need to be held for this test; it depends on the type
 	 * stability of vm_object.
 	 */
 	if ((vp->v_vflag & VV_NOSYNC) != 0)
 		return (false);
 	object = vp->v_object;
 	if (object == NULL)
 		return (false);
 	return (vm_object_mightbedirty(object));
 }
 
 /*
  * Filter passed to MNT_VNODE_FOREACH_LAZY by
  * vfs_periodic_msync_inactive().  VV_NOSYNC vnodes are always rejected;
  * otherwise a vnode is accepted if it has VI_DEFINACT set or if
  * vfs_want_msync() says so.
  */
 static int
 vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused)
 {
 
 	if ((vp->v_vflag & VV_NOSYNC) != 0)
 		return (false);
 	return ((vp->v_iflag & VI_DEFINACT) != 0 || vfs_want_msync(vp));
 }
 
 /*
  * Walk the vnodes selected by vfs_periodic_msync_inactive_filter() on mp.
  * For each one, clear a pending VI_DEFINACT and, if vfs_want_msync()
  * reports work, clean the pages of its VM object.
  *
  * flags == MNT_WAIT selects a blocking vnode lock and OBJPC_SYNC; any
  * other value selects LK_NOWAIT and OBJPC_NOSYNC.
  */
 static void __noinline
 vfs_periodic_msync_inactive(struct mount *mp, int flags)
 {
 	struct vnode *vp, *mvp;
 	struct vm_object *obj;
 	int lkflags, objflags;
 	bool seen_defer;
 
 	lkflags = LK_EXCLUSIVE | LK_INTERLOCK;
 	if (flags != MNT_WAIT) {
 		lkflags |= LK_NOWAIT;
 		objflags = OBJPC_NOSYNC;
 	} else {
 		objflags = OBJPC_SYNC;
 	}
 
 	MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) {
 		/* Remember whether this vnode had deferred inactive pending. */
 		seen_defer = false;
 		if (vp->v_iflag & VI_DEFINACT) {
 			vp->v_iflag &= ~VI_DEFINACT;
 			seen_defer = true;
 		}
 		/*
 		 * Nothing to msync: either finish the deferred inactive
 		 * processing or just release the interlock.
 		 */
 		if (!vfs_want_msync(vp)) {
 			if (seen_defer)
 				vfs_deferred_inactive(vp, lkflags);
 			else
 				VI_UNLOCK(vp);
 			continue;
 		}
 		if (vget(vp, lkflags) == 0) {
 			/* v_object and VV_NOSYNC are checked again now that vget() succeeded. */
 			obj = vp->v_object;
 			if (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0) {
 				VM_OBJECT_WLOCK(obj);
 				vm_object_page_clean(obj, 0, 0, objflags);
 				VM_OBJECT_WUNLOCK(obj);
 			}
 			vput(vp);
 			/*
 			 * NOTE(review): the extra vdrop() presumably releases
 			 * the hold that accompanied VI_DEFINACT -- confirm.
 			 */
 			if (seen_defer)
 				vdrop(vp);
 		} else {
 			/* Could not get the vnode; hand a deferred one to vdefer_inactive_unlocked(). */
 			if (seen_defer)
 				vdefer_inactive_unlocked(vp);
 		}
 	}
 }
 
 /*
  * Periodic per-mount pass.  Mounts flagged MNTK_NOMSYNC only get
  * deferred inactive processing; all others get the combined
  * msync + inactive pass.
  */
 void
 vfs_periodic(struct mount *mp, int flags)
 {
 
 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
 
 	if ((mp->mnt_kern_flag & MNTK_NOMSYNC) == 0)
 		vfs_periodic_msync_inactive(mp, flags);
 	else
 		vfs_periodic_inactive(mp, flags);
 }
 
 /*
  * Tear down and free a struct vpollinfo: destroy its knlist and its
  * mutex, then free the memory (M_VNODEPOLL).  Unlike
  * destroy_vpollinfo(), no knlist_clear()/seldrain() is performed first.
  */
 static void
 destroy_vpollinfo_free(struct vpollinfo *vi)
 {
 
 	knlist_destroy(&vi->vpi_selinfo.si_note);
 	mtx_destroy(&vi->vpi_lock);
 	free(vi, M_VNODEPOLL);
 }
 
 /*
  * Release a struct vpollinfo: clear its knlist and drain its selinfo
  * before handing it to destroy_vpollinfo_free().
  */
 static void
 destroy_vpollinfo(struct vpollinfo *vi)
 {
 
 	knlist_clear(&vi->vpi_selinfo.si_note, 1);
 	seldrain(&vi->vpi_selinfo);
 	destroy_vpollinfo_free(vi);
 }
 
 /*
  * Make sure vp has a poll-state helper structure attached.
  *
  * The structure is allocated and initialized before the vnode interlock
  * is taken.  If v_pollinfo turns out to be set by the time the interlock
  * is held, the fresh copy is destroyed again.
  */
 void
 v_addpollinfo(struct vnode *vp)
 {
 	struct vpollinfo *fresh;
 
 	/* Fast path: already set up. */
 	if (vp->v_pollinfo != NULL)
 		return;
 
 	fresh = malloc(sizeof(*fresh), M_VNODEPOLL, M_WAITOK | M_ZERO);
 	mtx_init(&fresh->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
 	knlist_init(&fresh->vpi_selinfo.si_note, vp, vfs_knllock,
 	    vfs_knlunlock, vfs_knl_assert_lock);
 
 	VI_LOCK(vp);
 	if (vp->v_pollinfo == NULL) {
 		vp->v_pollinfo = fresh;
 		VI_UNLOCK(vp);
 		return;
 	}
 	VI_UNLOCK(vp);
 	/* v_pollinfo was already installed; discard the copy made here. */
 	destroy_vpollinfo_free(fresh);
 }
 
 /*
  * Record a process's interest in events which might happen to
  * a vnode.  Because poll uses the historic select-style interface
  * internally, this routine serves as both the ``check for any
  * pending events'' and the ``record my interest in future events''
  * functions.  (These are done together, while the lock is held,
  * to avoid race conditions.)
  */
 int
 vn_pollrecord(struct vnode *vp, struct thread *td, int events)
 {
 
 	v_addpollinfo(vp);
 	mtx_lock(&vp->v_pollinfo->vpi_lock);
 	if (vp->v_pollinfo->vpi_revents & events) {
 		/*
 		 * This leaves events we are not interested
 		 * in available for the other process which
 		 * which presumably had requested them
 		 * (otherwise they would never have been
 		 * recorded).
 		 */
 		events &= vp->v_pollinfo->vpi_revents;
 		vp->v_pollinfo->vpi_revents &= ~events;
 
 		mtx_unlock(&vp->v_pollinfo->vpi_lock);
 		return (events);
 	}
 	vp->v_pollinfo->vpi_events |= events;
 	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
 	return (0);
 }
 
 /*
  * Routines to create and manage a filesystem syncer vnode.
  *
  * sync_close is nullop cast to the vop_close signature.  Operations not
  * listed in the vector go through vop_bypass, which is VOP_EOPNOTSUPP.
  */
 #define sync_close ((int (*)(struct  vop_close_args *))nullop)
 static int	sync_fsync(struct  vop_fsync_args *);
 static int	sync_inactive(struct  vop_inactive_args *);
 static int	sync_reclaim(struct  vop_reclaim_args *);
 
 /* Operations vector passed to getnewvnode() by vfs_allocate_syncvnode(). */
 static struct vop_vector sync_vnodeops = {
 	.vop_bypass =	VOP_EOPNOTSUPP,
 	.vop_close =	sync_close,		/* close */
 	.vop_fsync =	sync_fsync,		/* fsync */
 	.vop_inactive =	sync_inactive,	/* inactive */
 	.vop_need_inactive = vop_stdneed_inactive, /* need_inactive */
 	.vop_reclaim =	sync_reclaim,	/* reclaim */
 	.vop_lock1 =	vop_stdlock,	/* lock */
 	.vop_unlock =	vop_stdunlock,	/* unlock */
 	.vop_islocked =	vop_stdislocked,	/* islocked */
 };
 VFS_VOP_VECTOR_REGISTER(sync_vnodeops);
 
 /*
  * Create a new filesystem syncer vnode for the specified mount point.
  *
  * A VNON vnode using sync_vnodeops is allocated, inserted on the mount's
  * vnode list and placed on the syncer worklist.  It is then installed as
  * mp->mnt_syncer unless the mount already has one, in which case the new
  * vnode is disposed of again.  Failure of getnewvnode() or insmntque1()
  * panics.
  */
 void
 vfs_allocate_syncvnode(struct mount *mp)
 {
 	struct vnode *vp;
 	struct bufobj *bo;
 	/* Function-static: the scatter state persists across calls. */
 	static long start, incr, next;
 	int error;
 
 	/* Allocate a new vnode */
 	error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
 	if (error != 0)
 		panic("vfs_allocate_syncvnode: getnewvnode() failed");
 	vp->v_type = VNON;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	/* VV_FORCEINSMQ is set only for the duration of the insmntque1() call. */
 	vp->v_vflag |= VV_FORCEINSMQ;
 	error = insmntque1(vp, mp);
 	if (error != 0)
 		panic("vfs_allocate_syncvnode: insmntque() failed");
 	vp->v_vflag &= ~VV_FORCEINSMQ;
 	vn_set_state(vp, VSTATE_CONSTRUCTED);
 	VOP_UNLOCK(vp);
 	/*
 	 * Place the vnode onto the syncer worklist. We attempt to
 	 * scatter them about on the list so that they will go off
 	 * at evenly distributed times even if all the filesystems
 	 * are mounted at once.
 	 */
 	next += incr;
 	if (next == 0 || next > syncer_maxdelay) {
 		start /= 2;
 		incr /= 2;
 		if (start == 0) {
 			start = syncer_maxdelay / 2;
 			incr = syncer_maxdelay;
 		}
 		next = start;
 	}
 	bo = &vp->v_bufobj;
 	BO_LOCK(bo);
 	/* The syncdelay > 0 test guards the modulo against division by zero. */
 	vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
 	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
 	mtx_lock(&sync_mtx);
 	sync_vnode_count++;
 	/* Install the vnode only if the mount has no syncer yet. */
 	if (mp->mnt_syncer == NULL) {
 		mp->mnt_syncer = vp;
 		vp = NULL;
 	}
 	mtx_unlock(&sync_mtx);
 	BO_UNLOCK(bo);
 	/* vp is still non-NULL only if it was not installed above. */
 	if (vp != NULL) {
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		vgone(vp);
 		vput(vp);
 	}
 }
 
 /*
  * Detach the syncer vnode from mp, if there is one, and release it.
  * The mnt_syncer pointer is read and cleared under sync_mtx; vrele() is
  * called after the mutex has been dropped.
  */
 void
 vfs_deallocate_syncvnode(struct mount *mp)
 {
 	struct vnode *syncvp;
 
 	mtx_lock(&sync_mtx);
 	syncvp = mp->mnt_syncer;
 	if (syncvp == NULL) {
 		mtx_unlock(&sync_mtx);
 		return;
 	}
 	mp->mnt_syncer = NULL;
 	mtx_unlock(&sync_mtx);
 
 	vrele(syncvp);
 }
 
 /*
  * Do a lazy sync of the filesystem.
  *
  * fsync operation of the syncer vnode.  Requests other than MNT_LAZY
  * return 0 immediately.  Otherwise the syncer vnode is put back on the
  * worklist with delay syncdelay and, if the mount can be busied without
  * waiting, vfs_periodic() and VFS_SYNC(MNT_LAZY) are run on it.
  * Returns 0 if nothing was done, else the result of VFS_SYNC().
  */
 static int
 sync_fsync(struct vop_fsync_args *ap)
 {
 	struct vnode *syncvp = ap->a_vp;
 	struct mount *mp = syncvp->v_mount;
 	int error, save;
 	struct bufobj *bo;
 
 	/*
 	 * We only need to do something if this is a lazy evaluation.
 	 */
 	if (ap->a_waitfor != MNT_LAZY)
 		return (0);
 
 	/*
 	 * Move ourselves to the back of the sync list.
 	 */
 	bo = &syncvp->v_bufobj;
 	BO_LOCK(bo);
 	vn_syncer_add_to_worklist(bo, syncdelay);
 	BO_UNLOCK(bo);
 
 	/*
 	 * Walk the list of vnodes pushing all that are dirty and
 	 * not already on the sync list.
 	 */
 	if (vfs_busy(mp, MBF_NOWAIT) != 0)
 		return (0);
 	/* The syncer vnode is unlocked for the sync and relocked below. */
 	VOP_UNLOCK(syncvp);
 	/* TDP_SYNCIO is set on curthread for the sync and restored after. */
 	save = curthread_pflags_set(TDP_SYNCIO);
 	/*
 	 * The filesystem at hand may be idle with free vnodes stored in the
 	 * batch.  Return them instead of letting them stay there indefinitely.
 	 */
 	vfs_periodic(mp, MNT_NOWAIT);
 	error = VFS_SYNC(mp, MNT_LAZY);
 	curthread_pflags_restore(save);
 	vn_lock(syncvp, LK_EXCLUSIVE | LK_RETRY);
 	vfs_unbusy(mp);
 	return (error);
 }
 
 /*
  * The syncer vnode is no longer referenced: get rid of it with vgone().
  * Always returns 0.
  */
 static int
 sync_inactive(struct vop_inactive_args *ap)
 {
 	struct vnode *syncvp = ap->a_vp;
 
 	vgone(syncvp);
 	return (0);
 }
 
 /*
  * The syncer vnode is no longer needed and is being decommissioned.
  *
  * Modifications to the worklist must be protected by sync_mtx.
  *
  * If the mount still points at this vnode as its syncer, the pointer is
  * cleared.  If the bufobj is on the syncer worklist it is unlinked and
  * the worklist and sync vnode counters are decremented.  Always returns 0.
  */
 static int
 sync_reclaim(struct vop_reclaim_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct bufobj *bo;
 
 	bo = &vp->v_bufobj;
 	/* The bufobj lock is taken first, then sync_mtx. */
 	BO_LOCK(bo);
 	mtx_lock(&sync_mtx);
 	if (vp->v_mount->mnt_syncer == vp)
 		vp->v_mount->mnt_syncer = NULL;
 	if (bo->bo_flag & BO_ONWORKLST) {
 		LIST_REMOVE(bo, bo_synclist);
 		syncer_worklist_len--;
 		sync_vnode_count--;
 		bo->bo_flag &= ~BO_ONWORKLST;
 	}
 	mtx_unlock(&sync_mtx);
 	BO_UNLOCK(bo);
 
 	return (0);
 }
 
 /*
  * Return 1 if vp has a VM object, is not marked VV_NOSYNC, and the
  * object might be dirty; 0 otherwise.
  */
 int
 vn_need_pageq_flush(struct vnode *vp)
 {
 	struct vm_object *object;
 
 	object = vp->v_object;
 	if (object == NULL)
 		return (0);
 	if ((vp->v_vflag & VV_NOSYNC) != 0)
 		return (0);
 	return (vm_object_mightbedirty(object) != 0);
 }
 
 /*
  * Check whether a vnode represents a disk device.
  *
  * Returns true when it does.  *errp is always written: 0 on success,
  * ENOTBLK when the vnode is not VCHR or its devsw lacks D_DISK, ENXIO
  * when the vnode has no device or the device has no devsw.
  */
 bool
 vn_isdisk_error(struct vnode *vp, int *errp)
 {
 	int rc;
 
 	if (vp->v_type == VCHR) {
 		rc = 0;
 		/* The device pointers are examined under the dev lock. */
 		dev_lock();
 		if (vp->v_rdev == NULL || vp->v_rdev->si_devsw == NULL)
 			rc = ENXIO;
 		else if ((vp->v_rdev->si_devsw->d_flags & D_DISK) == 0)
 			rc = ENOTBLK;
 		dev_unlock();
 	} else {
 		rc = ENOTBLK;
 	}
 
 	*errp = rc;
 	return (rc == 0);
 }
 
 /*
  * Convenience wrapper around vn_isdisk_error() for callers that do not
  * need the error code.
  */
 bool
 vn_isdisk(struct vnode *vp)
 {
 	int ignored;
 
 	return (vn_isdisk_error(vp, &ignored));
 }
 
 /*
  * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
  * the comment above cache_fplookup for details.
  *
  * Execute-permission check intended to run inside an SMR section
  * (asserted).  Exactly one class is consulted: owner if the uid matches,
  * else group if the credential is a member, else other.  Returns 0 when
  * access is granted, EAGAIN when the regular lookup has to decide,
  * EACCES on denial, or any other error from the privilege check.
  */
 int
 vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred)
 {
 	mode_t xbit;
 	int error;
 
 	VFS_SMR_ASSERT_ENTERED();
 
 	/* Pick the execute bit of the first class that matches. */
 	if (cred->cr_uid == file_uid)
 		xbit = S_IXUSR;
 	else if (groupmember(file_gid, cred))
 		xbit = S_IXGRP;
 	else
 		xbit = S_IXOTH;
 
 	if ((file_mode & xbit) != 0)
 		return (0);
 
 	/*
 	 * The permission bits deny access, but a privilege may still
 	 * override that (e.g., root traversing a 700 directory owned by
 	 * someone else).
 	 *
 	 * vaccess() calls priv_check_cred which in turn can descend into
 	 * MAC modules overriding this result.  It is unclear what semantics
 	 * those modules may rely on, so for safety they are not called from
 	 * within the SMR section.  As a consequence, if any such modules
 	 * are present (EAGAIN), the regular lookup has to decide.
 	 */
 	error = priv_check_cred_vfs_lookup_nomac(cred);
 
 	/* EPERM is reported to the caller as EACCES; all else passes through. */
 	return (error == EPERM ? EACCES : error);
 }
 
 /*
  * Common filesystem object access control check routine.  Accepts a
  * vnode's type, "mode", uid and gid, requested access mode, and credentials.
  * Returns 0 on success, or an errno on failure.
  *
  * Exactly one permission class is evaluated: owner if cred->cr_uid
  * matches file_uid, else group if groupmember() succeeds, else other.
  * If those bits do not cover the request, each missing right is looked
  * up as a privilege.  On failure the result is EPERM when VADMIN was
  * requested and EACCES otherwise.
  */
 int
 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
     accmode_t accmode, struct ucred *cred)
 {
 	accmode_t dac_granted;
 	accmode_t priv_granted;
 
 	KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
 	    ("invalid bit in accmode"));
 	KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
 	    ("VAPPEND without VWRITE"));
 
 	/*
 	 * Look for a normal, non-privileged way to access the file/directory
 	 * as requested.  If it exists, go with that.
 	 */
 
 	dac_granted = 0;
 
 	/* Check the owner. */
 	if (cred->cr_uid == file_uid) {
 		/* The owner is granted VADMIN regardless of the mode bits. */
 		dac_granted |= VADMIN;
 		if (file_mode & S_IXUSR)
 			dac_granted |= VEXEC;
 		if (file_mode & S_IRUSR)
 			dac_granted |= VREAD;
 		if (file_mode & S_IWUSR)
 			dac_granted |= (VWRITE | VAPPEND);
 
 		if ((accmode & dac_granted) == accmode)
 			return (0);
 
 		/* Owner bits were insufficient; group/other are not consulted. */
 		goto privcheck;
 	}
 
 	/* Otherwise, check the groups (first match) */
 	if (groupmember(file_gid, cred)) {
 		if (file_mode & S_IXGRP)
 			dac_granted |= VEXEC;
 		if (file_mode & S_IRGRP)
 			dac_granted |= VREAD;
 		if (file_mode & S_IWGRP)
 			dac_granted |= (VWRITE | VAPPEND);
 
 		if ((accmode & dac_granted) == accmode)
 			return (0);
 
 		goto privcheck;
 	}
 
 	/* Otherwise, check everyone else. */
 	if (file_mode & S_IXOTH)
 		dac_granted |= VEXEC;
 	if (file_mode & S_IROTH)
 		dac_granted |= VREAD;
 	if (file_mode & S_IWOTH)
 		dac_granted |= (VWRITE | VAPPEND);
 	if ((accmode & dac_granted) == accmode)
 		return (0);
 
 privcheck:
 	/*
 	 * Build a privilege mask to determine if the set of privileges
 	 * satisfies the requirements when combined with the granted mask
 	 * from above.  For each privilege, if the privilege is required,
 	 * bitwise or the request type onto the priv_granted mask.
 	 *
 	 * priv_check_cred() is consulted only for rights that were both
 	 * requested and not already granted; its zero return is treated as
 	 * "privilege held".
 	 */
 	priv_granted = 0;
 
 	if (type == VDIR) {
 		/*
 		 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
 		 * requests, instead of PRIV_VFS_EXEC.
 		 */
 		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
 		    !priv_check_cred(cred, PRIV_VFS_LOOKUP))
 			priv_granted |= VEXEC;
 	} else {
 		/*
 		 * Ensure that at least one execute bit is on. Otherwise,
 		 * a privileged user will always succeed, and we don't want
 		 * this to happen unless the file really is executable.
 		 */
 		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
 		    (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
 		    !priv_check_cred(cred, PRIV_VFS_EXEC))
 			priv_granted |= VEXEC;
 	}
 
 	if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
 	    !priv_check_cred(cred, PRIV_VFS_READ))
 		priv_granted |= VREAD;
 
 	if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
 	    !priv_check_cred(cred, PRIV_VFS_WRITE))
 		priv_granted |= (VWRITE | VAPPEND);
 
 	if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
 	    !priv_check_cred(cred, PRIV_VFS_ADMIN))
 		priv_granted |= VADMIN;
 
 	/* Succeed if mode bits and privileges together cover the request. */
 	if ((accmode & (priv_granted | dac_granted)) == accmode) {
 		return (0);
 	}
 
 	return ((accmode & VADMIN) ? EPERM : EACCES);
 }
 
 /*
  * Credential check for extended attribute access, based on the
  * requesting credential and the attribute namespace.
  *
  * Returns 0 for kernel-internal requests (cred == NOCRED), the
  * privilege check result for the system namespace, the VOP_ACCESS()
  * result for the user namespace, and EPERM for any other namespace.
  */
 int
 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
     struct thread *td, accmode_t accmode)
 {
 
 	/* Requests made by the kernel itself always succeed. */
 	if (cred == NOCRED)
 		return (0);
 
 	/*
 	 * System attributes are gated by a privilege so that privileged
 	 * processes in a jail cannot manipulate them directly.
 	 * (Potentially this should simply be: return (EPERM);)
 	 */
 	if (attrnamespace == EXTATTR_NAMESPACE_SYSTEM)
 		return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM));
 
 	/* User attributes follow the ordinary access check on the vnode. */
 	if (attrnamespace == EXTATTR_NAMESPACE_USER)
 		return (VOP_ACCESS(vp, accmode, cred, td));
 
 	return (EPERM);
 }
 
 #ifdef DEBUG_VFS_LOCKS
 /*
  * Run-time knobs for the lock assertion helpers below.  Each defaults to
  * 1 and is exported read/write under the _debug sysctl node.
  * vfs_badlock() consults the ddb, print, vnode and backtrace knobs; the
  * interlock assertions consult vfs_badlock_mutex.
  */
 int vfs_badlock_ddb = 1;	/* Drop into debugger on violation. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
     "Drop into debugger on lock violation");
 
 int vfs_badlock_mutex = 1;	/* Check for interlock across VOPs. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
     0, "Check for interlock across VOPs");
 
 int vfs_badlock_print = 1;	/* Print lock violations. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
     0, "Print lock violations");
 
 int vfs_badlock_vnode = 1;	/* Print vnode details on lock violations. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode,
     0, "Print vnode details on lock violations");
 
 /* The backtrace knob exists only in kernels built with KDB. */
 #ifdef KDB
 int vfs_badlock_backtrace = 1;	/* Print backtrace at lock violations. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
     &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
 #endif
 
 /*
  * Report a vnode locking violation.
  *
  * msg describes the violation, str identifies the call site, and vp is
  * the offending vnode.  Depending on the vfs_badlock_* knobs this prints
  * a backtrace (KDB kernels only), dumps the vnode, prints a one-line
  * message, and enters the debugger, in that order.
  */
 static void
 vfs_badlock(const char *msg, const char *str, struct vnode *vp)
 {
 
 #ifdef KDB
 	if (vfs_badlock_backtrace)
 		kdb_backtrace();
 #endif
 	if (vfs_badlock_vnode)
 		vn_printf(vp, "vnode ");
 	if (vfs_badlock_print)
 		printf("%s: %p %s\n", str, (void *)vp, msg);
 	if (vfs_badlock_ddb)
 		kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
 }
 
 /*
  * Complain if the current thread does not own the vnode interlock.
  * Nothing is checked while the vfs_badlock_mutex knob is zero.
  */
 void
 assert_vi_locked(struct vnode *vp, const char *str)
 {
 
 	if (!vfs_badlock_mutex)
 		return;
 	if (!mtx_owned(VI_MTX(vp)))
 		vfs_badlock("interlock is not locked but should be", str, vp);
 }
 
 /*
  * Complain if the current thread owns the vnode interlock.
  * Nothing is checked while the vfs_badlock_mutex knob is zero.
  */
 void
 assert_vi_unlocked(struct vnode *vp, const char *str)
 {
 
 	if (!vfs_badlock_mutex)
 		return;
 	if (mtx_owned(VI_MTX(vp)))
 		vfs_badlock("interlock is locked but should not be", str, vp);
 }
 
 /*
  * Complain unless the vnode lock is held in a state other than
  * "unlocked" (0) or "exclusively held by someone else" (LK_EXCLOTHER).
  * Skipped for NULL vnodes and once the kernel has panicked.
  */
 void
 assert_vop_locked(struct vnode *vp, const char *str)
 {
 	int state;
 
 	if (KERNEL_PANICKED() || vp == NULL)
 		return;
 
 	state = VOP_ISLOCKED(vp);
 	if (state != 0 && state != LK_EXCLOTHER)
 		return;
 	vfs_badlock("is not locked but should be", str, vp);
 }
 
 /*
  * Complain if VOP_ISLOCKED() reports LK_EXCLUSIVE for the vnode.
  * Skipped for NULL vnodes and once the kernel has panicked.
  */
 void
 assert_vop_unlocked(struct vnode *vp, const char *str)
 {
 
 	if (!KERNEL_PANICKED() && vp != NULL &&
 	    VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
 		vfs_badlock("is locked but should not be", str, vp);
 }
 
 /*
  * Complain unless VOP_ISLOCKED() reports LK_EXCLUSIVE for the vnode.
  * Skipped for NULL vnodes and once the kernel has panicked.
  */
 void
 assert_vop_elocked(struct vnode *vp, const char *str)
 {
 
 	if (!KERNEL_PANICKED() && vp != NULL &&
 	    VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
 		vfs_badlock("is not exclusive locked but should be", str, vp);
 }
 #endif /* DEBUG_VFS_LOCKS */
 
 /*
  * Release the four vnodes carried by a rename request that is being
  * abandoned: tvp (if any) and tdvp via vput(), fdvp and fvp via vrele().
  */
 void
 vop_rename_fail(struct vop_rename_args *ap)
 {
 	struct vnode *tdvp, *tvp;
 
 	tvp = ap->a_tvp;
 	tdvp = ap->a_tdvp;
 
 	if (tvp != NULL)
 		vput(tvp);
 	/*
 	 * When tdvp is the same vnode as tvp it was already vput() above,
 	 * so only vrele() it here.
 	 */
 	if (tdvp != tvp)
 		vput(tdvp);
 	else
 		vrele(tdvp);
 	vrele(ap->a_fdvp);
 	vrele(ap->a_fvp);
 }
 
 /*
  * Pre-operation hook for VOP_RENAME.  With DEBUG_VFS_LOCKS it checks the
  * interlock and vnode lock state of the four vnodes in the request; it
  * then takes a hold on each distinct vnode.  vop_rename_post() drops the
  * holds using the same conditions.
  */
 void
 vop_rename_pre(void *ap)
 {
 	struct vop_rename_args *a = ap;
 
 #ifdef DEBUG_VFS_LOCKS
 	/* No vnode interlock may be held on entry (tvp is optional). */
 	if (a->a_tvp)
 		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
 	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
 	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
 	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
 
 	/*
 	 * Check the source (from).  The unlocked assertion is skipped when
 	 * a source vnode shares its v_vnlock with a target vnode.
 	 */
 	if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
 	    (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
 		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
 	if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
 		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
 
 	/* Check the target. */
 	if (a->a_tvp)
 		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
 	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
 #endif
 	/*
 	 * It may be tempting to add vn_seqc_write_begin/end calls here and
 	 * in vop_rename_post but that's not going to work out since some
 	 * filesystems relookup vnodes mid-rename. This is probably a bug.
 	 *
 	 * For now filesystems are expected to do the relevant calls after they
 	 * decide what vnodes to operate on.
 	 */
 	/* Source vnodes are held only when they differ from the target ones. */
 	if (a->a_tdvp != a->a_fdvp)
 		vhold(a->a_fdvp);
 	if (a->a_tvp != a->a_fvp)
 		vhold(a->a_fvp);
 	vhold(a->a_tdvp);
 	if (a->a_tvp)
 		vhold(a->a_tvp);
 }
 
 #ifdef DEBUG_VFS_LOCKS
 /* Debug pre-hook for VOP_FPLOOKUP_VEXEC: only runs the VFS SMR assertion. */
 void
 vop_fplookup_vexec_debugpre(void *ap __unused)
 {
 
 	VFS_SMR_ASSERT_ENTERED();
 }
 
 /* Debug post-hook for VOP_FPLOOKUP_VEXEC: only runs the VFS SMR assertion. */
 void
 vop_fplookup_vexec_debugpost(void *ap __unused, int rc __unused)
 {
 
 	VFS_SMR_ASSERT_ENTERED();
 }
 
 /* Debug pre-hook for VOP_FPLOOKUP_SYMLINK: only runs the VFS SMR assertion. */
 void
 vop_fplookup_symlink_debugpre(void *ap __unused)
 {
 
 	VFS_SMR_ASSERT_ENTERED();
 }
 
 /* Debug post-hook for VOP_FPLOOKUP_SYMLINK: only runs the VFS SMR assertion. */
 void
 vop_fplookup_symlink_debugpost(void *ap __unused, int rc __unused)
 {
 
 	VFS_SMR_ASSERT_ENTERED();
 }
 
 /*
  * Shared lock check for the fsync-style debug hooks.  'name' is the label
  * passed through to the lock assertion.
  */
 static void
 vop_fsync_debugprepost(struct vnode *vp, const char *name)
 {
 
 	/* Character device vnodes are not checked at all. */
 	if (vp->v_type == VCHR)
 		return;
 
 	/*
 	 * Any lock suffices when the mount passes MNT_EXTENDED_SHARED();
 	 * otherwise the lock must be exclusive.
 	 */
 	if (MNT_EXTENDED_SHARED(vp->v_mount))
 		ASSERT_VOP_LOCKED(vp, name);
 	else
 		ASSERT_VOP_ELOCKED(vp, name);
 }
 
 /* Debug pre-hook for VOP_FSYNC: verify the vnode lock state. */
 void
 vop_fsync_debugpre(void *a)
 {
 	struct vop_fsync_args *ap = a;
 
 	vop_fsync_debugprepost(ap->a_vp, "fsync");
 }
 
 /* Debug post-hook for VOP_FSYNC: verify the vnode lock state again. */
 void
 vop_fsync_debugpost(void *a, int rc __unused)
 {
 	struct vop_fsync_args *ap = a;
 
 	vop_fsync_debugprepost(ap->a_vp, "fsync");
 }
 
 /*
  * Debug pre-hook for VOP_FDATASYNC: verify the vnode lock state.  The
  * assertion is labelled "fdatasync" so that a violation report names the
  * operation that actually tripped it.
  */
 void
 vop_fdatasync_debugpre(void *a)
 {
 	struct vop_fdatasync_args *ap;
 
 	ap = a;
 	vop_fsync_debugprepost(ap->a_vp, "fdatasync");
 }
 
 /*
  * Debug post-hook for VOP_FDATASYNC: verify the vnode lock state.  The
  * assertion is labelled "fdatasync" so that a violation report names the
  * operation that actually tripped it.
  */
 void
 vop_fdatasync_debugpost(void *a, int rc __unused)
 {
 	struct vop_fdatasync_args *ap;
 
 	ap = a;
 	vop_fsync_debugprepost(ap->a_vp, "fdatasync");
 }
 
 /*
  * Debug pre-hook for VOP_STRATEGY: report a buffer handed to the
  * strategy routine without being locked.
  */
 void
 vop_strategy_debugpre(void *ap)
 {
 	struct vop_strategy_args *a = ap;
 	struct buf *bp = a->a_bp;
 
 	/*
 	 * Cluster ops lock their component buffers but not the IO container.
 	 */
 	if ((bp->b_flags & B_CLUSTER) != 0)
 		return;
 
 	/* Nothing to report after a panic or when the buffer is locked. */
 	if (KERNEL_PANICKED() || BUF_ISLOCKED(bp))
 		return;
 
 	if (vfs_badlock_print)
 		printf("VOP_STRATEGY: bp is not locked but should be\n");
 	if (vfs_badlock_ddb)
 		kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
 }
 
 /*
  * Debug pre-hook for VOP_LOCK: the vnode interlock must be held exactly
  * when the request carries LK_INTERLOCK.
  */
 void
 vop_lock_debugpre(void *ap)
 {
 	struct vop_lock1_args *a = ap;
 
 	if ((a->a_flags & LK_INTERLOCK) != 0)
 		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
 	else
 		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
 }
 
 /*
  * Debug post-hook for VOP_LOCK: the interlock must have been dropped,
  * and on success the vnode must be locked unless LK_EXCLOTHER was set in
  * the request flags.
  */
 void
 vop_lock_debugpost(void *ap, int rc)
 {
 	struct vop_lock1_args *a = ap;
 
 	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
 	if (rc != 0 || (a->a_flags & LK_EXCLOTHER) != 0)
 		return;
 	ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
 }
 
 /*
  * Debug pre-hook for VOP_UNLOCK: the vnode must not be in the
  * uninitialized state and must currently be locked.
  */
 void
 vop_unlock_debugpre(void *ap)
 {
 	struct vnode *vp;
 
 	vp = ((struct vop_unlock_args *)ap)->a_vp;
 	VNPASS(vn_get_state(vp) != VSTATE_UNINITIALIZED, vp);
 	ASSERT_VOP_LOCKED(vp, "VOP_UNLOCK");
 }
 
 /* Debug pre-hook for VOP_NEED_INACTIVE: the vnode interlock must be held. */
 void
 vop_need_inactive_debugpre(void *ap)
 {
 	struct vop_need_inactive_args *a = ap;
 
 	ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE");
 }
 
 /*
  * Debug post-hook for VOP_NEED_INACTIVE: the vnode interlock must still
  * be held.  The operation's return code is not examined.
  */
 void
 vop_need_inactive_debugpost(void *ap, int rc)
 {
 	struct vop_need_inactive_args *a = ap;
 
 	ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE");
 }
 #endif
 
 /* Pre-hook for VOP_CREATE: begin a seqc write section on the directory. */
 void
 vop_create_pre(void *ap)
 {
 	struct vop_create_args *a = ap;
 
 	vn_seqc_write_begin(a->a_dvp);
 }
 
 /*
  * Post-hook for VOP_CREATE: end the seqc write section on the directory
  * and, on success, post NOTE_WRITE to it.
  */
 void
 vop_create_post(void *ap, int rc)
 {
 	struct vop_create_args *a = ap;
 	struct vnode *dvp = a->a_dvp;
 
 	vn_seqc_write_end(dvp);
 	if (rc == 0)
 		VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
 }
 
 /* Pre-hook for VOP_WHITEOUT: begin a seqc write section on the directory. */
 void
 vop_whiteout_pre(void *ap)
 {
 	struct vop_whiteout_args *a = ap;
 
 	vn_seqc_write_begin(a->a_dvp);
 }
 
 /*
  * Post-hook for VOP_WHITEOUT: end the seqc write section on the
  * directory.  The return code is not examined and no event is posted.
  */
 void
 vop_whiteout_post(void *ap, int rc)
 {
 	struct vop_whiteout_args *a = ap;
 
 	vn_seqc_write_end(a->a_dvp);
 }
 
 /* Pre-hook for VOP_DELETEEXTATTR: begin a seqc write section on the vnode. */
 void
 vop_deleteextattr_pre(void *ap)
 {
 	struct vop_deleteextattr_args *a = ap;
 
 	vn_seqc_write_begin(a->a_vp);
 }
 
 /*
  * Post-hook for VOP_DELETEEXTATTR: end the seqc write section on the
  * vnode and, on success, post NOTE_ATTRIB to it.
  */
 void
 vop_deleteextattr_post(void *ap, int rc)
 {
 	struct vop_deleteextattr_args *a = ap;
 	struct vnode *vp = a->a_vp;
 
 	vn_seqc_write_end(vp);
 	if (rc == 0)
 		VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB);
 }
 
 /*
  * Pre-hook for VOP_LINK: begin seqc write sections on the vnode being
  * linked and then on the target directory.
  */
 void
 vop_link_pre(void *ap)
 {
 	struct vop_link_args *a = ap;
 
 	vn_seqc_write_begin(a->a_vp);
 	vn_seqc_write_begin(a->a_tdvp);
 }
 
 /*
  * Post-hook for VOP_LINK: end both seqc write sections and, on success,
  * post NOTE_LINK to the vnode and NOTE_WRITE to the target directory.
  */
 void
 vop_link_post(void *ap, int rc)
 {
 	struct vop_link_args *a = ap;
 	struct vnode *vp = a->a_vp;
 	struct vnode *tdvp = a->a_tdvp;
 
 	vn_seqc_write_end(vp);
 	vn_seqc_write_end(tdvp);
 	if (rc != 0)
 		return;
 	VFS_KNOTE_LOCKED(vp, NOTE_LINK);
 	VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE);
 }
 
 /* Pre-hook for VOP_MKDIR: begin a seqc write section on the directory. */
 void
 vop_mkdir_pre(void *ap)
 {
 	struct vop_mkdir_args *a = ap;
 
 	vn_seqc_write_begin(a->a_dvp);
 }
 
 /*
  * Post-hook for VOP_MKDIR: end the seqc write section on the directory
  * and, on success, post NOTE_WRITE | NOTE_LINK to it.
  */
 void
 vop_mkdir_post(void *ap, int rc)
 {
 	struct vop_mkdir_args *a = ap;
 	struct vnode *dvp = a->a_dvp;
 
 	vn_seqc_write_end(dvp);
 	if (rc == 0)
 		VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK);
 }
 
 #ifdef DEBUG_VFS_LOCKS
 /*
  * Debug post-hook for VOP_MKDIR: after a successful mkdir, hand the
  * directory, the new vnode and the component name to cache_validate().
  */
 void
 vop_mkdir_debugpost(void *ap, int rc)
 {
 	struct vop_mkdir_args *a = ap;
 
 	if (rc != 0)
 		return;
 	cache_validate(a->a_dvp, *a->a_vpp, a->a_cnp);
 }
 #endif
 
 /* Pre-hook for VOP_MKNOD: begin a seqc write section on the directory. */
 void
 vop_mknod_pre(void *ap)
 {
 	struct vop_mknod_args *a = ap;
 
 	vn_seqc_write_begin(a->a_dvp);
 }
 
 /*
  * Post-hook for VOP_MKNOD: end the seqc write section on the directory
  * and, on success, post NOTE_WRITE to it.
  */
 void
 vop_mknod_post(void *ap, int rc)
 {
 	struct vop_mknod_args *a = ap;
 	struct vnode *dvp = a->a_dvp;
 
 	vn_seqc_write_end(dvp);
 	if (rc == 0)
 		VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
 }
 
 /*
  * Post-hook for VOP_RECLAIM: assert the vnode is inside a seqc write
  * section and, on success, post NOTE_REVOKE to it.
  */
 void
 vop_reclaim_post(void *ap, int rc)
 {
 	struct vop_reclaim_args *a = ap;
 	struct vnode *vp = a->a_vp;
 
 	ASSERT_VOP_IN_SEQC(vp);
 	if (rc == 0)
 		VFS_KNOTE_LOCKED(vp, NOTE_REVOKE);
 }
 
 /*
  * Pre-hook for VOP_REMOVE: begin seqc write sections on the directory
  * and then on the vnode being removed.
  */
 void
 vop_remove_pre(void *ap)
 {
 	struct vop_remove_args *a = ap;
 
 	vn_seqc_write_begin(a->a_dvp);
 	vn_seqc_write_begin(a->a_vp);
 }
 
 /*
  * Post-hook for VOP_REMOVE: end both seqc write sections and, on
  * success, post NOTE_WRITE to the directory and NOTE_DELETE to the vnode.
  */
 void
 vop_remove_post(void *ap, int rc)
 {
 	struct vop_remove_args *a = ap;
 	struct vnode *dvp = a->a_dvp;
 	struct vnode *vp = a->a_vp;
 
 	vn_seqc_write_end(dvp);
 	vn_seqc_write_end(vp);
 	if (rc != 0)
 		return;
 	VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
 	VFS_KNOTE_LOCKED(vp, NOTE_DELETE);
 }
 
 /*
  * Post-hook for VOP_RENAME.  On success it posts kqueue events to the
  * directories and vnodes involved; in all cases it drops the holds taken
  * by vop_rename_pre(), using the same conditions.
  */
 void
 vop_rename_post(void *ap, int rc)
 {
 	struct vop_rename_args *a = ap;
 	long hint;
 
 	if (!rc) {
 		hint = NOTE_WRITE;
 		if (a->a_fdvp == a->a_tdvp) {
 			/*
 			 * Rename within one directory: NOTE_LINK is added
 			 * only when an existing target directory was
 			 * replaced.
 			 */
 			if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR)
 				hint |= NOTE_LINK;
 			VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
 			VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
 		} else {
 			/*
 			 * Rename across directories: the source directory
 			 * gets NOTE_LINK when a directory was moved out.
 			 */
 			hint |= NOTE_EXTEND;
 			if (a->a_fvp->v_type == VDIR)
 				hint |= NOTE_LINK;
 			VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
 
 			/*
 			 * NOTE_LINK is withheld from the target directory
 			 * when the moved directory replaced another one.
 			 */
 			if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL &&
 			    a->a_tvp->v_type == VDIR)
 				hint &= ~NOTE_LINK;
 			VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
 		}
 
 		VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
 		if (a->a_tvp)
 			VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
 	}
 	/* Mirror of the vhold() sequence in vop_rename_pre(). */
 	if (a->a_tdvp != a->a_fdvp)
 		vdrop(a->a_fdvp);
 	if (a->a_tvp != a->a_fvp)
 		vdrop(a->a_fvp);
 	vdrop(a->a_tdvp);
 	if (a->a_tvp)
 		vdrop(a->a_tvp);
 }
 
 /*
  * Pre-hook for VOP_RMDIR: begin seqc write sections on the parent
  * directory and then on the directory being removed.
  */
 void
 vop_rmdir_pre(void *ap)
 {
 	struct vop_rmdir_args *a = ap;
 
 	vn_seqc_write_begin(a->a_dvp);
 	vn_seqc_write_begin(a->a_vp);
 }
 
 /*
  * Post-hook for VOP_RMDIR: end both seqc write sections.  On success the
  * removed vnode is flagged VV_UNLINKED, the parent receives
  * NOTE_WRITE | NOTE_LINK and the removed vnode receives NOTE_DELETE.
  */
 void
 vop_rmdir_post(void *ap, int rc)
 {
 	struct vop_rmdir_args *a = ap;
 	struct vnode *dvp = a->a_dvp;
 	struct vnode *vp = a->a_vp;
 
 	vn_seqc_write_end(dvp);
 	vn_seqc_write_end(vp);
 	if (rc != 0)
 		return;
 	vp->v_vflag |= VV_UNLINKED;
 	VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK);
 	VFS_KNOTE_LOCKED(vp, NOTE_DELETE);
 }
 
 /* Pre-hook for VOP_SETATTR: begin a seqc write section on the vnode. */
 void
 vop_setattr_pre(void *ap)
 {
 	struct vop_setattr_args *a = ap;
 
 	vn_seqc_write_begin(a->a_vp);
 }
 
 /*
  * Post-hook for VOP_SETATTR: end the seqc write section on the vnode
  * and, on success, post NOTE_ATTRIB to it.
  */
 void
 vop_setattr_post(void *ap, int rc)
 {
 	struct vop_setattr_args *a = ap;
 	struct vnode *vp = a->a_vp;
 
 	vn_seqc_write_end(vp);
 	if (rc == 0)
 		VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB);
 }
 
 /* Pre-hook for VOP_SETACL: begin a seqc write section on the vnode. */
 void
 vop_setacl_pre(void *ap)
 {
 	struct vop_setacl_args *a = ap;
 
 	vn_seqc_write_begin(a->a_vp);
 }
 
 /*
  * Post-hook for VOP_SETACL: end the seqc write section on the vnode.
  * No event is posted.
  */
 void
 vop_setacl_post(void *ap, int rc __unused)
 {
 	struct vop_setacl_args *a = ap;
 
 	vn_seqc_write_end(a->a_vp);
 }
 
 /* Pre-hook for VOP_SETEXTATTR: begin a seqc write section on the vnode. */
 void
 vop_setextattr_pre(void *ap)
 {
 	struct vop_setextattr_args *a = ap;
 
 	vn_seqc_write_begin(a->a_vp);
 }
 
 /*
  * Post-hook for VOP_SETEXTATTR: end the seqc write section on the vnode
  * and, on success, post NOTE_ATTRIB to it.
  */
 void
 vop_setextattr_post(void *ap, int rc)
 {
 	struct vop_setextattr_args *a = ap;
 	struct vnode *vp = a->a_vp;
 
 	vn_seqc_write_end(vp);
 	if (rc == 0)
 		VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB);
 }
 
 /* Pre-hook for VOP_SYMLINK: begin a seqc write section on the directory. */
 void
 vop_symlink_pre(void *ap)
 {
 	struct vop_symlink_args *a = ap;
 
 	vn_seqc_write_begin(a->a_dvp);
 }
 
 /*
  * Post-hook for VOP_SYMLINK: end the seqc write section on the directory
  * and, on success, post NOTE_WRITE to it.
  */
 void
 vop_symlink_post(void *ap, int rc)
 {
 	struct vop_symlink_args *a = ap;
 	struct vnode *dvp = a->a_dvp;
 
 	vn_seqc_write_end(dvp);
 	if (rc == 0)
 		VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
 }
 
 /* Post-hook for VOP_OPEN: on success, post NOTE_OPEN to the vnode. */
 void
 vop_open_post(void *ap, int rc)
 {
 	struct vop_open_args *a = ap;
 
 	if (rc != 0)
 		return;
 	VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN);
 }
 
 /*
  * Post-hook for VOP_CLOSE: on success, post NOTE_CLOSE_WRITE when the
  * file was open for writing (FWRITE in a_fflag) and NOTE_CLOSE otherwise.
  */
 void
 vop_close_post(void *ap, int rc)
 {
 	struct vop_close_args *a = ap;
 	long hint;
 
 	if (rc != 0)
 		return;
 	/* filter out revokes */
 	if (a->a_cred == NOCRED && VN_IS_DOOMED(a->a_vp))
 		return;
 
 	if ((a->a_fflag & FWRITE) != 0)
 		hint = NOTE_CLOSE_WRITE;
 	else
 		hint = NOTE_CLOSE;
 	VFS_KNOTE_LOCKED(a->a_vp, hint);
 }
 
 /* Post-hook for VOP_READ: on success, post NOTE_READ to the vnode. */
 void
 vop_read_post(void *ap, int rc)
 {
 	struct vop_read_args *a = ap;
 
 	if (rc != 0)
 		return;
 	VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
 }
 
 /*
  * Post-hook for VOP_READ_PGCACHE: on success, post NOTE_READ to the
  * vnode using the UNLOCKED knote variant.
  */
 void
 vop_read_pgcache_post(void *ap, int rc)
 {
 	struct vop_read_pgcache_args *a = ap;
 
 	if (rc != 0)
 		return;
 	VFS_KNOTE_UNLOCKED(a->a_vp, NOTE_READ);
 }
 
 /* Post-hook for VOP_READDIR: on success, post NOTE_READ to the vnode. */
 void
 vop_readdir_post(void *ap, int rc)
 {
 	struct vop_readdir_args *a = ap;
 
 	if (rc != 0)
 		return;
 	VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
 }
 
 /* Knote list used to deliver filesystem-wide events (see vfs_event_signal). */
 static struct knlist fs_knlist;
 
 /* SYSINIT callback: initialize fs_knlist.  'arg' is unused. */
 static void
 vfs_event_init(void *arg)
 {
 	knlist_init_mtx(&fs_knlist, NULL);
 }
 /* XXX - correct order? */
 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
 
 /*
  * Post 'event' to every knote on fs_knlist.  Neither 'fsid' nor 'data'
  * is used here; the event value is passed as the knote hint.
  */
 void
 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
 {
 
 	KNOTE_UNLOCKED(&fs_knlist, event);
 }
 
 static int	filt_fsattach(struct knote *kn);
 static void	filt_fsdetach(struct knote *kn);
 static int	filt_fsevent(struct knote *kn, long hint);
 
 /*
  * Filter operations for filesystem-wide events delivered through
  * fs_knlist.  The filter is not tied to a file descriptor (f_isfd = 0).
  */
 struct filterops fs_filtops = {
 	.f_isfd = 0,
 	.f_attach = filt_fsattach,
 	.f_detach = filt_fsdetach,
 	.f_event = filt_fsevent
 };
 
 /*
  * Attach a knote to fs_knlist.  EV_CLEAR is forced on the knote.
  * Always returns 0.
  */
 static int
 filt_fsattach(struct knote *kn)
 {
 
 	kn->kn_flags |= EV_CLEAR;
 	knlist_add(&fs_knlist, kn, 0);
 	return (0);
 }
 
 /* Detach a knote from fs_knlist. */
 static void
 filt_fsdetach(struct knote *kn)
 {
 
 	knlist_remove(&fs_knlist, kn, 0);
 }
 
 /*
  * Event callback for fs_filtops: accumulate the hint bits that the knote
  * subscribed to (kn_sfflags) into kn_fflags, and report the knote as
  * active whenever any accumulated bit is set.
  */
 static int
 filt_fsevent(struct knote *kn, long hint)
 {
 
 	kn->kn_fflags |= kn->kn_sfflags & hint;
 
 	return (kn->kn_fflags != 0);
 }
 
 /*
  * Handler for the vfs.ctl sysctl: read a struct vfsidctl from the
  * request, locate the mount named by its fsid and forward the operation
  * to that filesystem through VFS_SYSCTL().
  *
  * Returns EINVAL for an unsupported structure version or a filesystem
  * type mismatch, ENOENT when no mount has the given fsid, and otherwise
  * the result of copying the request in or of VFS_SYSCTL().
  */
 static int
 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
 {
 	struct vfsidctl vc;
 	struct mount *mp;
 	int error;
 
 	error = SYSCTL_IN(req, &vc, sizeof(vc));
 	if (error != 0)
 		return (error);
 	if (vc.vc_vers != VFS_CTL_VERS1)
 		return (EINVAL);
 
 	mp = vfs_getvfs(&vc.vc_fsid);
 	if (mp == NULL)
 		return (ENOENT);
 
 	/* ensure that a specific sysctl goes to the right filesystem. */
 	if (strcmp(vc.vc_fstypename, "*") == 0 ||
 	    strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) == 0) {
 		VCTLTOREQ(&vc, req);
 		error = VFS_SYSCTL(mp, vc.vc_op, req);
 	} else {
 		error = EINVAL;
 	}
 
 	/* Release the mount obtained from vfs_getvfs() on both outcomes. */
 	vfs_rel(mp);
 	return (error);
 }
 
 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR,
     NULL, 0, sysctl_vfs_ctl, "",
     "Sysctl by fsid");
 
 /*
  * Function to initialize a va_filerev field sensibly.
  * XXX: Wouldn't a random number make a lot more sense ??
  */
 u_quad_t
 init_va_filerev(void)
 {
 	struct bintime bt;
 
 	getbinuptime(&bt);
 	return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
 }
 
 static int	filt_vfsread(struct knote *kn, long hint);
 static int	filt_vfswrite(struct knote *kn, long hint);
 static int	filt_vfsvnode(struct knote *kn, long hint);
 static void	filt_vfsdetach(struct knote *kn);
 /*
  * Filter operation tables installed by vfs_kqfilter() for EVFILT_READ,
  * EVFILT_WRITE and EVFILT_VNODE respectively.  All three are
  * descriptor-based (f_isfd = 1), share filt_vfsdetach() and define no
  * f_attach callback.
  */
 static struct filterops vfsread_filtops = {
 	.f_isfd = 1,
 	.f_detach = filt_vfsdetach,
 	.f_event = filt_vfsread
 };
 static struct filterops vfswrite_filtops = {
 	.f_isfd = 1,
 	.f_detach = filt_vfsdetach,
 	.f_event = filt_vfswrite
 };
 static struct filterops vfsvnode_filtops = {
 	.f_isfd = 1,
 	.f_detach = filt_vfsdetach,
 	.f_event = filt_vfsvnode
 };
 
 /*
  * Lock callback taking a vnode as its opaque argument: acquire the vnode
  * lock exclusively, retrying (LK_RETRY).
  * NOTE(review): presumably registered as the knlist lock routine for
  * vnodes; the registration is not visible in this part of the file.
  */
 static void
 vfs_knllock(void *arg)
 {
 	struct vnode *vp = arg;
 
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 }
 
 /* Unlock counterpart of vfs_knllock(): release the vnode lock. */
 static void
 vfs_knlunlock(void *arg)
 {
 	struct vnode *vp = arg;
 
 	VOP_UNLOCK(vp);
 }
 
 /*
  * Lock-assertion callback taking a vnode as its opaque argument.  With
  * DEBUG_VFS_LOCKS, 'what' == LA_LOCKED asserts that the vnode is locked
  * and any other value asserts that it is unlocked; without it this is a
  * no-op.
  */
 static void
 vfs_knl_assert_lock(void *arg, int what)
 {
 #ifdef DEBUG_VFS_LOCKS
 	struct vnode *vp = arg;
 
 	if (what != LA_LOCKED)
 		ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
 	else
 		ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
 #endif
 }
 
 /*
  * Generic kqfilter implementation for vnodes: select the filter
  * operations matching the requested filter, record the vnode in the
  * knote hook and add the knote to the vnode's knote list.
  *
  * Returns EINVAL for an unsupported filter, ENOMEM when the vnode has no
  * pollinfo after v_addpollinfo(), and 0 on success.  A successful attach
  * holds the vnode; filt_vfsdetach() drops that hold.
  */
 int
 vfs_kqfilter(struct vop_kqfilter_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct knote *kn = ap->a_kn;
 	struct filterops *fop;
 	struct knlist *knl;
 
 	KASSERT(vp->v_type != VFIFO || (kn->kn_filter != EVFILT_READ &&
 	    kn->kn_filter != EVFILT_WRITE),
 	    ("READ/WRITE filter on a FIFO leaked through"));
 
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
 		fop = &vfsread_filtops;
 		break;
 	case EVFILT_WRITE:
 		fop = &vfswrite_filtops;
 		break;
 	case EVFILT_VNODE:
 		fop = &vfsvnode_filtops;
 		break;
 	default:
 		return (EINVAL);
 	}
 	kn->kn_fop = fop;
 	kn->kn_hook = (caddr_t)vp;
 
 	v_addpollinfo(vp);
 	if (vp->v_pollinfo == NULL)
 		return (ENOMEM);
 
 	knl = &vp->v_pollinfo->vpi_selinfo.si_note;
 	vhold(vp);
 	knlist_add(knl, kn, 0);
 
 	return (0);
 }
 
 /*
  * Detach knote from vnode: remove it from the vnode's knote list and
  * drop the hold taken in vfs_kqfilter().
  */
 static void
 filt_vfsdetach(struct knote *kn)
 {
 	struct vnode *vp;
 	struct knlist *knl;
 
 	vp = (struct vnode *)kn->kn_hook;
 	KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
 	knl = &vp->v_pollinfo->vpi_selinfo.si_note;
 	knlist_remove(knl, kn, 0);
 	vdrop(vp);
 }
 
 /*
  * EVFILT_READ event callback for vnodes.  Reports the number of bytes
  * between the file offset and the file size in kn_data.  Returns 1 when
  * the knote should fire and 0 otherwise (including when the file size
  * cannot be obtained).
  */
 /*ARGSUSED*/
 static int
 filt_vfsread(struct knote *kn, long hint)
 {
 	struct vnode *vp = (struct vnode *)kn->kn_hook;
 	off_t size;
 	int res;
 
 	/*
 	 * filesystem is gone, so set the EOF flag and schedule
 	 * the knote for deletion.
 	 */
 	if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
 		VI_LOCK(vp);
 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
 		VI_UNLOCK(vp);
 		return (1);
 	}
 
 	/* vn_getsize_locked() asserts that the vnode lock is held. */
 	if (vn_getsize_locked(vp, &size, curthread->td_ucred) != 0)
 		return (0);
 
 	VI_LOCK(vp);
 	kn->kn_data = size - kn->kn_fp->f_offset;
 	/* With NOTE_FILE_POLL the knote fires even when no data remains. */
 	res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0;
 	VI_UNLOCK(vp);
 	return (res);
 }
 
 /*
  * EVFILT_WRITE event callback for vnodes.  Always reports the knote as
  * active with kn_data set to 0.
  */
 /*ARGSUSED*/
 static int
 filt_vfswrite(struct knote *kn, long hint)
 {
 	struct vnode *vp = (struct vnode *)kn->kn_hook;
 
 	VI_LOCK(vp);
 
 	/*
 	 * filesystem is gone, so set the EOF flag and schedule
 	 * the knote for deletion.
 	 */
 	if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD))
 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
 
 	kn->kn_data = 0;
 	VI_UNLOCK(vp);
 	return (1);
 }
 
 /*
  * EVFILT_VNODE event callback.  A hint that matches one of the
  * subscribed bits (kn_sfflags) is recorded in kn_fflags.  A revoke, or a
  * poll (hint 0) of a VBAD vnode, sets EV_EOF and always fires; otherwise
  * the knote fires when any event has been recorded.
  */
 static int
 filt_vfsvnode(struct knote *kn, long hint)
 {
 	struct vnode *vp = (struct vnode *)kn->kn_hook;
 	int res;
 
 	VI_LOCK(vp);
 	if (kn->kn_sfflags & hint)
 		kn->kn_fflags |= hint;
 	if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
 		kn->kn_flags |= EV_EOF;
 		res = 1;
 	} else {
 		res = (kn->kn_fflags != 0);
 	}
 	VI_UNLOCK(vp);
 	return (res);
 }
 
 /*
  * Returns whether the directory is empty or not.
  * If it is empty, the return value is 0; otherwise
  * the return value is an error value (which may
  * be ENOTEMPTY).
  *
  * Whiteout entries and entries with a zero-length name are ignored.  The
  * only names accepted as part of an empty directory are "." and "..".
  * The vnode must be a locked directory (asserted below).
  */
 int
 vfs_emptydir(struct vnode *vp)
 {
 	struct uio uio;
 	struct iovec iov;
 	struct dirent *dirent, *dp, *endp;
 	int error, eof;
 
 	error = 0;
 	eof = 0;
 
 	ASSERT_VOP_LOCKED(vp, "vfs_emptydir");
 	VNASSERT(vp->v_type == VDIR, vp, ("vp is not a directory"));
 
 	dirent = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK);
 
 	/* The offset carries over between reads; the rest is set per read. */
 	uio.uio_offset = 0;
 	uio.uio_segflg = UIO_SYSSPACE;
 	uio.uio_rw = UIO_READ;
 	uio.uio_td = curthread;
 
 	while (eof == 0 && error == 0) {
 		/*
 		 * Re-arm the iovec and the residual count before every
 		 * read.  A previous VOP_READDIR() may have advanced them,
 		 * and the parsing below assumes the data starts at the
 		 * beginning of the buffer.
 		 */
 		iov.iov_base = dirent;
 		iov.iov_len = sizeof(struct dirent);
 		uio.uio_iov = &iov;
 		uio.uio_iovcnt = 1;
 		uio.uio_resid = sizeof(struct dirent);
 
 		error = VOP_READDIR(vp, &uio, curthread->td_ucred, &eof,
 		    NULL, NULL);
 		if (error != 0)
 			break;
 		endp = (void *)((uint8_t *)dirent +
 		    sizeof(struct dirent) - uio.uio_resid);
 		/*
 		 * A read that returned nothing means there are no more
 		 * entries, even if the filesystem did not set the EOF flag.
 		 * Stop here instead of looping forever.
 		 */
 		if (endp == dirent)
 			break;
 		for (dp = dirent; dp < endp;
 		     dp = (void *)((uint8_t *)dp + GENERIC_DIRSIZ(dp))) {
 			if (dp->d_type == DT_WHT)
 				continue;
 			if (dp->d_namlen == 0)
 				continue;
 			/* "." and ".." can only be directories. */
 			if (dp->d_type != DT_DIR &&
 			    dp->d_type != DT_UNKNOWN) {
 				error = ENOTEMPTY;
 				break;
 			}
 			/*
 			 * Anything other than exactly "." or ".." makes the
 			 * directory non-empty.  Both characters of a
 			 * two-character name are checked so that names such
 			 * as "a." are not mistaken for "..".
 			 */
 			if (dp->d_namlen > 2 || dp->d_name[0] != '.' ||
 			    (dp->d_namlen == 2 && dp->d_name[1] != '.')) {
 				error = ENOTEMPTY;
 				break;
 			}
 		}
 	}
 	free(dirent, M_TEMP);
 	return (error);
 }
 
 /*
  * Copy one directory entry to the caller's uio and, when the caller
  * asked for cookies (a_ncookies != NULL), append 'off' to the cookie
  * array, growing it by one element.
  *
  * Returns ENAMETOOLONG when the entry does not fit in the remaining
  * space, the uiomove() error if the copy fails, and 0 otherwise.  On a
  * copy failure the cookie array is freed, the caller's array pointer is
  * cleared and the cookie count is reset to 0.
  */
 int
 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
 {
 	int error;
 
 	if (dp->d_reclen > ap->a_uio->uio_resid)
 		return (ENAMETOOLONG);
 	error = uiomove(dp, dp->d_reclen, ap->a_uio);
 	if (error) {
 		if (ap->a_ncookies != NULL) {
 			/*
 			 * a_cookies points at the caller's array pointer
 			 * (see the realloc() below), so it is the array
 			 * itself that must be freed and cleared, not the
 			 * pointer slot.
 			 */
 			if (ap->a_cookies != NULL) {
 				free(*ap->a_cookies, M_TEMP);
 				*ap->a_cookies = NULL;
 			}
 			*ap->a_ncookies = 0;
 		}
 		return (error);
 	}
 	if (ap->a_ncookies == NULL)
 		return (0);
 
 	KASSERT(ap->a_cookies,
 	    ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
 
 	*ap->a_cookies = realloc(*ap->a_cookies,
 	    (*ap->a_ncookies + 1) * sizeof(uint64_t), M_TEMP, M_WAITOK | M_ZERO);
 	(*ap->a_cookies)[*ap->a_ncookies] = off;
 	*ap->a_ncookies += 1;
 	return (0);
 }
 
 /*
  * The purpose of this routine is to remove granularity from accmode_t,
  * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
  * VADMIN and VAPPEND.
  *
  * If it returns 0, the caller is supposed to continue with the usual
  * access checks using 'accmode' as modified by this routine.  If it
  * returns nonzero value, the caller is supposed to return that value
  * as errno.
  *
  * Note that after this routine runs, accmode may be zero.
  */
 int
 vfs_unixify_accmode(accmode_t *accmode)
 {
 	accmode_t mode;
 
 	mode = *accmode;
 
 	/*
 	 * There is no way to specify explicit "deny" rule using
 	 * file mode or POSIX.1e ACLs.
 	 */
 	if ((mode & VEXPLICIT_DENY) != 0) {
 		*accmode = 0;
 		return (0);
 	}
 
 	/*
 	 * None of these can be translated into usual access bits.
 	 * Also, the common case for NFSv4 ACLs is to not contain
 	 * either of these bits. Caller should check for VWRITE
 	 * on the containing directory instead.
 	 */
 	if ((mode & (VDELETE_CHILD | VDELETE)) != 0)
 		return (EPERM);
 
 	/* Collapse any of the fine-grained admin bits into VADMIN. */
 	if ((mode & VADMIN_PERMS) != 0)
 		mode = (mode & ~VADMIN_PERMS) | VADMIN;
 
 	/*
 	 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
 	 * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
 	 */
 	mode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
 
 	*accmode = mode;
 	return (0);
 }
 
 /*
  * Clear out a doomed vnode (if any) and replace it with a new one as long
  * as the fs is not being unmounted. Return the root vnode to the caller.
  *
  * Slow path of vfs_cache_root().  On success *vpp holds the root vnode,
  * locked with 'flags' when it came from the cache; a nonzero return is
  * the error from VFS_CACHEDROOT().
  */
 static int __noinline
 vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp)
 {
 	struct vnode *vp;
 	int error;
 
 restart:
 	/* Unlocked peek first; the pointer is re-read under the mount lock. */
 	if (mp->mnt_rootvnode != NULL) {
 		MNT_ILOCK(mp);
 		vp = mp->mnt_rootvnode;
 		if (vp != NULL) {
 			if (!VN_IS_DOOMED(vp)) {
 				/* Cached root is usable: reference and lock it. */
 				vrefact(vp);
 				MNT_IUNLOCK(mp);
 				error = vn_lock(vp, flags);
 				if (error == 0) {
 					*vpp = vp;
 					return (0);
 				}
 				/* Locking failed; drop the reference and retry. */
 				vrele(vp);
 				goto restart;
 			}
 			/*
 			 * Clear the old one.
 			 */
 			mp->mnt_rootvnode = NULL;
 		}
 		MNT_IUNLOCK(mp);
 		if (vp != NULL) {
 			/*
 			 * Wait on the vfs_op barrier before releasing the
 			 * reference that the cache held on the doomed vnode.
 			 */
 			vfs_op_barrier_wait(mp);
 			vrele(vp);
 		}
 	}
 	error = VFS_CACHEDROOT(mp, flags, vpp);
 	if (error != 0)
 		return (error);
 	/*
 	 * Cache the new root only while mnt_vfs_ops is zero; the counter is
 	 * checked again once the mount lock is held.
 	 */
 	if (mp->mnt_vfs_ops == 0) {
 		MNT_ILOCK(mp);
 		if (mp->mnt_vfs_ops != 0) {
 			MNT_IUNLOCK(mp);
 			return (0);
 		}
 		if (mp->mnt_rootvnode == NULL) {
 			/* The cache takes its own reference on the vnode. */
 			vrefact(*vpp);
 			mp->mnt_rootvnode = *vpp;
 		} else {
 			/*
 			 * Another vnode was cached in the meantime; that is
 			 * tolerated only if the cached vnode is doomed.
 			 */
 			if (mp->mnt_rootvnode != *vpp) {
 				if (!VN_IS_DOOMED(mp->mnt_rootvnode)) {
 					panic("%s: mismatch between vnode returned "
 					    " by VFS_CACHEDROOT and the one cached "
 					    " (%p != %p)",
 					    __func__, *vpp, mp->mnt_rootvnode);
 				}
 			}
 		}
 		MNT_IUNLOCK(mp);
 	}
 	return (0);
 }
 
 /*
  * Return the root vnode of the mount in *vpp, locked with 'flags'.
  *
  * The fast path uses the cached mnt_rootvnode inside a vfs_op thread
  * section.  If that section cannot be entered, there is no usable cached
  * vnode, or locking it fails, the work is handed to
  * vfs_cache_root_fallback().
  */
 int
 vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp)
 {
 	struct mount_pcpu *mpcpu;
 	struct vnode *vp;
 
 	if (!vfs_op_thread_enter(mp, mpcpu))
 		goto fallback;
 	vp = atomic_load_ptr(&mp->mnt_rootvnode);
 	if (vp == NULL || VN_IS_DOOMED(vp)) {
 		vfs_op_thread_exit(mp, mpcpu);
 		goto fallback;
 	}
 	vrefact(vp);
 	vfs_op_thread_exit(mp, mpcpu);
 	if (vn_lock(vp, flags) != 0) {
 		vrele(vp);
 		goto fallback;
 	}
 	*vpp = vp;
 	return (0);
 
 fallback:
 	return (vfs_cache_root_fallback(mp, flags, vpp));
 }
 
 /*
  * Detach the cached root vnode from the mount and return it (NULL if
  * none was cached).  A seqc write section is begun on the returned vnode
  * and is not ended here.  The cache's reference is not released here
  * either.
  */
 struct vnode *
 vfs_cache_root_clear(struct mount *mp)
 {
 	struct vnode *vp;
 
 	/*
 	 * ops > 0 guarantees there is nobody who can see this vnode
 	 */
 	MPASS(mp->mnt_vfs_ops > 0);
 	vp = mp->mnt_rootvnode;
 	if (vp != NULL)
 		vn_seqc_write_begin(vp);
 	mp->mnt_rootvnode = NULL;
 	return (vp);
 }
 
 /*
  * Install 'vp' as the cached root vnode of the mount, taking a new
  * reference on it.  Requires mnt_vfs_ops > 0, as asserted.
  */
 void
 vfs_cache_root_set(struct mount *mp, struct vnode *vp)
 {
 
 	MPASS(mp->mnt_vfs_ops > 0);
 	vrefact(vp);
 	mp->mnt_rootvnode = vp;
 }
 
 /*
  * These are helper functions for filesystems to traverse all
  * their vnodes.  See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
  *
  * This interface replaces MNT_VNODE_FOREACH.
  */
 
 /*
  * Advance the iteration: return the next vnode after the marker *mvp
  * that is neither a marker nor doomed, with its interlock held, and
  * move the marker to just after it.  At the end of the list the marker
  * is freed and NULL is returned.  The mount lock is not held on return.
  */
 struct vnode *
 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
 {
 	struct vnode *vp;
 
 	maybe_yield();
 	MNT_ILOCK(mp);
 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
 	for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL;
 	    vp = TAILQ_NEXT(vp, v_nmntvnodes)) {
 		/* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */
 		if (vp->v_type == VMARKER || VN_IS_DOOMED(vp))
 			continue;
 		VI_LOCK(vp);
 		/* Re-check under the interlock. */
 		if (VN_IS_DOOMED(vp)) {
 			VI_UNLOCK(vp);
 			continue;
 		}
 		break;
 	}
 	if (vp == NULL) {
 		__mnt_vnode_markerfree_all(mvp, mp);
 		/* MNT_IUNLOCK(mp); -- done in above function */
 		mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
 		return (NULL);
 	}
 	/* Reposition the marker right after the vnode being returned. */
 	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
 	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
 	MNT_IUNLOCK(mp);
 	return (vp);
 }
 
 /*
  * Start an iteration over all vnodes of the mount: allocate a marker,
  * take a mount reference and return the first vnode that is neither a
  * marker nor doomed, with its interlock held and the marker inserted
  * right after it.  If there is no such vnode, the reference and marker
  * are released, *mvp is set to NULL and NULL is returned.
  */
 struct vnode *
 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
 {
 	struct vnode *vp;
 
 	*mvp = vn_alloc_marker(mp);
 	MNT_ILOCK(mp);
 	MNT_REF(mp);
 
 	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 		/* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */
 		if (vp->v_type == VMARKER || VN_IS_DOOMED(vp))
 			continue;
 		VI_LOCK(vp);
 		/* Re-check under the interlock. */
 		if (VN_IS_DOOMED(vp)) {
 			VI_UNLOCK(vp);
 			continue;
 		}
 		break;
 	}
 	if (vp == NULL) {
 		MNT_REL(mp);
 		MNT_IUNLOCK(mp);
 		vn_free_marker(*mvp);
 		*mvp = NULL;
 		return (NULL);
 	}
 	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
 	MNT_IUNLOCK(mp);
 	return (vp);
 }
 
 /*
  * Terminate an all-vnodes iteration.  Entered with the mount lock held
  * and returns with it released.  If a marker is still present it is
  * removed from the list and freed, and the mount reference taken by
  * __mnt_vnode_first_all() is dropped.
  */
 void
 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
 {
 
 	/* No marker left: only the lock needs releasing. */
 	if (*mvp == NULL) {
 		MNT_IUNLOCK(mp);
 		return;
 	}
 
 	mtx_assert(MNT_MTX(mp), MA_OWNED);
 
 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
 	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
 	MNT_REL(mp);
 	MNT_IUNLOCK(mp);
 	vn_free_marker(*mvp);
 	*mvp = NULL;
 }
 
 /*
  * These are helper functions for filesystems to traverse their
  * lazy vnodes.  See MNT_VNODE_FOREACH_LAZY() in sys/mount.h
  */
 /*
  * Drop the mount reference held by a lazy iteration, free its marker and
  * clear *mvp.  The marker is not unlinked from the lazy list here;
  * callers do that before calling this function.
  */
 static void
 mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp)
 {
 
 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
 
 	MNT_ILOCK(mp);
 	MNT_REL(mp);
 	MNT_IUNLOCK(mp);
 	vn_free_marker(*mvp);
 	*mvp = NULL;
 }
 
 /*
  * Relock the mp mount vnode list lock with the vp vnode interlock in the
  * conventional lock order during mnt_vnode_next_lazy iteration.
  *
  * On entry, the mount vnode list lock is held and the vnode interlock is not.
  * The list lock is dropped and reacquired.  On success, both locks are held.
  * On failure, the mount vnode list lock is held but the vnode interlock is
  * not, and the procedure may have yielded.
  */
 static bool
 mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp,
     struct vnode *vp)
 {
 
 	VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER &&
 	    TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp,
 	    ("%s: bad marker", __func__));
 	VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp,
 	    ("%s: inappropriate vnode", __func__));
 	ASSERT_VI_UNLOCKED(vp, __func__);
 	mtx_assert(&mp->mnt_listmtx, MA_OWNED);
 
 	/*
 	 * Park the marker immediately before vp so that the iteration
 	 * resumes at vp once the list lock has been dropped and retaken.
 	 */
 	TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist);
 	TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist);
 
 	/*
 	 * Note we may be racing against vdrop which transitioned the hold
 	 * count to 0 and now waits for the ->mnt_listmtx lock. This is fine,
 	 * if we are the only user after we get the interlock we will just
 	 * vdrop.
 	 */
 	vhold(vp);
 	mtx_unlock(&mp->mnt_listmtx);
 	VI_LOCK(vp);
 	if (VN_IS_DOOMED(vp)) {
 		VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp);
 		goto out_lost;
 	}
 	VNPASS(vp->v_mflag & VMP_LAZYLIST, vp);
 	/*
 	 * There is nothing to do if we are the last user.
 	 */
 	if (!refcount_release_if_not_last(&vp->v_holdcnt))
 		goto out_lost;
 	mtx_lock(&mp->mnt_listmtx);
 	return (true);
 out_lost:
 	/* Drop our hold with the interlock held, then retake the list lock. */
 	vdropl(vp);
 	maybe_yield();
 	mtx_lock(&mp->mnt_listmtx);
 	return (false);
 }
 
 /*
  * Core of the lazy-list iteration.  Entered with the list lock held and
  * the marker *mvp on the lazy list.  Returns the next vnode accepted by
  * the 'cb' filter with its interlock held and the marker placed after
  * it, or NULL at the end of the list after freeing the marker.  The list
  * lock is released before returning in both cases.
  */
 static struct vnode *
 mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
     void *cbarg)
 {
 	struct vnode *vp;
 
 	mtx_assert(&mp->mnt_listmtx, MA_OWNED);
 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
 restart:
 	vp = TAILQ_NEXT(*mvp, v_lazylist);
 	while (vp != NULL) {
 		/* Skip markers belonging to other iterations. */
 		if (vp->v_type == VMARKER) {
 			vp = TAILQ_NEXT(vp, v_lazylist);
 			continue;
 		}
 		/*
 		 * See if we want to process the vnode. Note we may encounter a
 		 * long string of vnodes we don't care about and hog the list
 		 * as a result. Check for it and requeue the marker.
 		 */
 		VNPASS(!VN_IS_DOOMED(vp), vp);
 		if (!cb(vp, cbarg)) {
 			if (!should_yield()) {
 				vp = TAILQ_NEXT(vp, v_lazylist);
 				continue;
 			}
 			/*
 			 * Move the marker past the rejected vnode, yield
 			 * without the list lock, then resume from the marker.
 			 */
 			TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp,
 			    v_lazylist);
 			TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp,
 			    v_lazylist);
 			mtx_unlock(&mp->mnt_listmtx);
 			kern_yield(PRI_USER);
 			mtx_lock(&mp->mnt_listmtx);
 			goto restart;
 		}
 		/*
 		 * Try-lock because this is the wrong lock order.
 		 */
 		if (!VI_TRYLOCK(vp) &&
 		    !mnt_vnode_next_lazy_relock(*mvp, mp, vp))
 			goto restart;
 		KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
 		KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
 		    ("alien vnode on the lazy list %p %p", vp, mp));
 		VNPASS(vp->v_mount == mp, vp);
 		VNPASS(!VN_IS_DOOMED(vp), vp);
 		break;
 	}
 	TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist);
 
 	/* Check if we are done */
 	if (vp == NULL) {
 		mtx_unlock(&mp->mnt_listmtx);
 		mnt_vnode_markerfree_lazy(mvp, mp);
 		return (NULL);
 	}
 	TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist);
 	mtx_unlock(&mp->mnt_listmtx);
 	ASSERT_VI_LOCKED(vp, "lazy iter");
 	return (vp);
 }
 
 /*
  * Continue a lazy-list iteration: possibly yield, take the list lock and
  * let mnt_vnode_next_lazy() find the next vnode.
  */
 struct vnode *
 __mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
     void *cbarg)
 {
 
 	maybe_yield();
 	mtx_lock(&mp->mnt_listmtx);
 	return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg));
 }
 
 /*
  * Start a lazy-list iteration.  Returns NULL without allocating a marker
  * when the lazy list is empty; otherwise allocates a marker, takes a
  * mount reference, inserts the marker at the head of the list and lets
  * mnt_vnode_next_lazy() find the first vnode.
  */
 struct vnode *
 __mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
     void *cbarg)
 {
 	struct vnode *vp;
 
 	/* Unlocked check; emptiness is re-tested below under the list lock. */
 	if (TAILQ_EMPTY(&mp->mnt_lazyvnodelist))
 		return (NULL);
 
 	*mvp = vn_alloc_marker(mp);
 	MNT_ILOCK(mp);
 	MNT_REF(mp);
 	MNT_IUNLOCK(mp);
 
 	mtx_lock(&mp->mnt_listmtx);
 	vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist);
 	if (vp == NULL) {
 		mtx_unlock(&mp->mnt_listmtx);
 		mnt_vnode_markerfree_lazy(mvp, mp);
 		return (NULL);
 	}
 	TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist);
 	return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg));
 }
 
 /*
  * Abort a lazy-list iteration early: unlink the marker from the lazy
  * list and release it together with the mount reference.  Does nothing
  * when the marker has already been freed (*mvp == NULL).
  */
 void
 __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp)
 {
 
 	if (*mvp == NULL)
 		return;
 
 	mtx_lock(&mp->mnt_listmtx);
 	TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist);
 	mtx_unlock(&mp->mnt_listmtx);
 	mnt_vnode_markerfree_lazy(mvp, mp);
 }
 
 /*
  * Check VEXEC access on a directory vnode for the lookup described by
  * 'cnp'.  If NOEXECCHECK is set in cn_flags the check is skipped once:
  * the flag is cleared and 0 is returned.
  */
 int
 vn_dir_check_exec(struct vnode *vp, struct componentname *cnp)
 {
 
 	if ((cnp->cn_flags & NOEXECCHECK) == 0)
 		return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, curthread));
 
 	cnp->cn_flags &= ~NOEXECCHECK;
 	return (0);
 }
 
 /*
  * Do not use this variant unless you have means other than the hold count
  * to prevent the vnode from getting freed.
  *
  * Registers one more seqc writer on the vnode; the first writer begins
  * the seqc write section.  The vnode interlock must be held.
  */
 void
 vn_seqc_write_begin_locked(struct vnode *vp)
 {
 
 	ASSERT_VI_LOCKED(vp, __func__);
 	VNPASS(vp->v_holdcnt > 0, vp);
 	VNPASS(vp->v_seqc_users >= 0, vp);
 	if (++vp->v_seqc_users == 1)
 		seqc_sleepable_write_begin(&vp->v_seqc);
 }
 
 /*
  * Register a seqc writer on the vnode, taking and releasing the vnode
  * interlock around vn_seqc_write_begin_locked().
  */
 void
 vn_seqc_write_begin(struct vnode *vp)
 {
 
 	VI_LOCK(vp);
 	vn_seqc_write_begin_locked(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Unregister one seqc writer from the vnode; the last writer to leave
  * ends the seqc write section.  The vnode interlock must be held.
  */
 void
 vn_seqc_write_end_locked(struct vnode *vp)
 {
 
 	ASSERT_VI_LOCKED(vp, __func__);
 	VNPASS(vp->v_seqc_users > 0, vp);
 	if (--vp->v_seqc_users == 0)
 		seqc_sleepable_write_end(&vp->v_seqc);
 }
 
 /*
  * Unregister a seqc writer from the vnode, taking and releasing the
  * vnode interlock around vn_seqc_write_end_locked().
  */
 void
 vn_seqc_write_end(struct vnode *vp)
 {
 
 	VI_LOCK(vp);
 	vn_seqc_write_end_locked(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Special case handling for allocating and freeing vnodes.
  *
  * The counter remains unchanged on free so that a doomed vnode will
  * keep testing as in modify as long as it is accessible with SMR.
  */
 /* Reset the sequence counter and the writer count of a vnode to zero. */
 static void
 vn_seqc_init(struct vnode *vp)
 {
 
 	vp->v_seqc = 0;
 	vp->v_seqc_users = 0;
 }
 
 /*
  * Free-time check: the vnode must still be inside a seqc write section
  * with exactly one registered writer.  Nothing is modified here, so the
  * counter keeps reading as "in modify".
  */
 static void
 vn_seqc_write_end_free(struct vnode *vp)
 {
 
 	VNPASS(seqc_in_modify(vp->v_seqc), vp);
 	VNPASS(vp->v_seqc_users == 1, vp);
 }
 
 /*
  * Set the bits in 'toset' in the vnode's v_irflag.  The vnode interlock
  * must be held and none of the bits may already be set (asserted).
  */
 void
 vn_irflag_set_locked(struct vnode *vp, short toset)
 {
 	short flags;
 
 	ASSERT_VI_LOCKED(vp, __func__);
 	flags = vn_irflag_read(vp);
 	VNASSERT((flags & toset) == 0, vp,
 	    ("%s: some of the passed flags already set (have %d, passed %d)\n",
 	    __func__, flags, toset));
 	atomic_store_short(&vp->v_irflag, flags | toset);
 }
 
 /* Interlock-taking wrapper around vn_irflag_set_locked(). */
 void
 vn_irflag_set(struct vnode *vp, short toset)
 {
 
 	VI_LOCK(vp);
 	vn_irflag_set_locked(vp, toset);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Set the bits in 'toset' in the vnode's v_irflag.  Unlike
  * vn_irflag_set_locked(), bits that are already set are tolerated.
  * The vnode interlock must be held.
  */
 void
 vn_irflag_set_cond_locked(struct vnode *vp, short toset)
 {
 
 	ASSERT_VI_LOCKED(vp, __func__);
 	atomic_store_short(&vp->v_irflag, vn_irflag_read(vp) | toset);
 }
 
 /* Interlock-taking wrapper around vn_irflag_set_cond_locked(). */
 void
 vn_irflag_set_cond(struct vnode *vp, short toset)
 {
 
 	VI_LOCK(vp);
 	vn_irflag_set_cond_locked(vp, toset);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Clear the bits in 'tounset' from the vnode's v_irflag.  The vnode
  * interlock must be held and all of the bits must currently be set
  * (asserted).
  */
 void
 vn_irflag_unset_locked(struct vnode *vp, short tounset)
 {
 	short flags;
 
 	ASSERT_VI_LOCKED(vp, __func__);
 	flags = vn_irflag_read(vp);
 	VNASSERT((flags & tounset) == tounset, vp,
 	    ("%s: some of the passed flags not set (have %d, passed %d)\n",
 	    __func__, flags, tounset));
 	atomic_store_short(&vp->v_irflag, flags & ~tounset);
 }
 
 /* Interlock-taking wrapper around vn_irflag_unset_locked(). */
 void
 vn_irflag_unset(struct vnode *vp, short tounset)
 {
 
 	VI_LOCK(vp);
 	vn_irflag_unset_locked(vp, tounset);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Fetch the size of a locked vnode through VOP_GETATTR() and store it in
  * *size.
  *
  * Returns the VOP_GETATTR() error, EFBIG when the reported size exceeds
  * OFF_MAX, and 0 on success.  *size is written only on success.
  */
 int
 vn_getsize_locked(struct vnode *vp, off_t *size, struct ucred *cred)
 {
 	struct vattr vattr;
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, __func__);
 	error = VOP_GETATTR(vp, &vattr, cred);
 	if (__predict_false(error != 0))
 		return (error);
 	if (vattr.va_size > OFF_MAX)
 		return (EFBIG);
 	*size = vattr.va_size;
 	return (0);
 }
 
 /*
  * Fetch the size of an unlocked vnode: take the vnode lock shared, query
  * the size with vn_getsize_locked() and unlock again.  The return value
  * of VOP_LOCK() is not examined.
  */
 int
 vn_getsize(struct vnode *vp, off_t *size, struct ucred *cred)
 {
 	int error;
 
 	VOP_LOCK(vp, LK_SHARED);
 	error = vn_getsize_locked(vp, size, cred);
 	VOP_UNLOCK(vp);
 	return (error);
 }
 
 #ifdef INVARIANTS
 /*
  * Validate a vnode state transition from vp->v_state to 'state' and
  * panic on anything outside the permitted set:
  *
  *	UNINITIALIZED -> CONSTRUCTED | DESTROYING
  *	CONSTRUCTED   -> DESTROYING
  *	DESTROYING    -> DEAD
  *	DEAD          -> UNINITIALIZED
  *
  * Leaving CONSTRUCTED or DESTROYING additionally asserts that the vnode
  * is exclusively locked.
  */
 void
 vn_set_state_validate(struct vnode *vp, enum vstate state)
 {
 
 	switch (vp->v_state) {
 	case VSTATE_UNINITIALIZED:
 		if (state == VSTATE_CONSTRUCTED ||
 		    state == VSTATE_DESTROYING)
 			return;
 		break;
 	case VSTATE_CONSTRUCTED:
 		ASSERT_VOP_ELOCKED(vp, __func__);
 		if (state == VSTATE_DESTROYING)
 			return;
 		break;
 	case VSTATE_DESTROYING:
 		ASSERT_VOP_ELOCKED(vp, __func__);
 		if (state == VSTATE_DEAD)
 			return;
 		break;
 	case VSTATE_DEAD:
 		if (state == VSTATE_UNINITIALIZED)
 			return;
 		break;
 	}
 
 	vn_printf(vp, "invalid state transition %d -> %d\n", vp->v_state, state);
 	panic("invalid state transition %d -> %d\n", vp->v_state, state);
 }
 #endif
diff --git a/sys/security/audit/audit_bsm_klib.c b/sys/security/audit/audit_bsm_klib.c
index 31a9bf29265d..9dde8362ce9c 100644
--- a/sys/security/audit/audit_bsm_klib.c
+++ b/sys/security/audit/audit_bsm_klib.c
@@ -1,532 +1,531 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1999-2009 Apple Inc.
  * Copyright (c) 2005, 2016-2017 Robert N. M. Watson
  * All rights reserved.
  *
  * Portions of this software were developed by BAE Systems, the University of
  * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL
  * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent
  * Computing (TC) research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1.  Redistributions of source code must retain the above copyright
  *     notice, this list of conditions and the following disclaimer.
  * 2.  Redistributions in binary form must reproduce the above copyright
  *     notice, this list of conditions and the following disclaimer in the
  *     documentation and/or other materials provided with the distribution.
  * 3.  Neither the name of Apple Inc. ("Apple") nor the names of
  *     its contributors may be used to endorse or promote products derived
  *     from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/fcntl.h>
 #include <sys/filedesc.h>
 #include <sys/libkern.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sem.h>
 #include <sys/sbuf.h>
 #include <sys/sx.h>
 #include <sys/syscall.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/vnode.h>
 
 #include <bsm/audit.h>
 #include <bsm/audit_kevents.h>
 #include <security/audit/audit.h>
 #include <security/audit/audit_private.h>
 
 /*
  * One row of the open-flags-to-audit-event lookup tables below.
  */
 struct aue_open_event {
 	int		aoe_flags;	/* masked open flags to match exactly */
 	au_event_t	aoe_event;	/* audit event reported on a match */
 };
 
 /*
  * Flag combinations recognised for open events.  The table is searched
  * linearly for an exact aoe_flags match by
  * audit_flags_and_error_to_openevent().
  */
 static const struct aue_open_event aue_open[] = {
 	{ O_RDONLY,					AUE_OPEN_R },
 	{ (O_RDONLY | O_CREAT),				AUE_OPEN_RC },
 	{ (O_RDONLY | O_CREAT | O_TRUNC),		AUE_OPEN_RTC },
 	{ (O_RDONLY | O_TRUNC),				AUE_OPEN_RT },
 	{ O_RDWR,					AUE_OPEN_RW },
 	{ (O_RDWR | O_CREAT),				AUE_OPEN_RWC },
 	{ (O_RDWR | O_CREAT | O_TRUNC),			AUE_OPEN_RWTC },
 	{ (O_RDWR | O_TRUNC),				AUE_OPEN_RWT },
 	{ O_WRONLY,					AUE_OPEN_W },
 	{ (O_WRONLY | O_CREAT),				AUE_OPEN_WC },
 	{ (O_WRONLY | O_CREAT | O_TRUNC),		AUE_OPEN_WTC },
 	{ (O_WRONLY | O_TRUNC),				AUE_OPEN_WT },
 };
 
 /*
  * Flag combinations recognised for openat events.  The table is searched
  * linearly for an exact aoe_flags match by
  * audit_flags_and_error_to_openatevent().
  */
 static const struct aue_open_event aue_openat[] = {
 	{ O_RDONLY,					AUE_OPENAT_R },
 	{ (O_RDONLY | O_CREAT),				AUE_OPENAT_RC },
 	{ (O_RDONLY | O_CREAT | O_TRUNC),		AUE_OPENAT_RTC },
 	{ (O_RDONLY | O_TRUNC),				AUE_OPENAT_RT },
 	{ O_RDWR,					AUE_OPENAT_RW },
 	{ (O_RDWR | O_CREAT),				AUE_OPENAT_RWC },
 	{ (O_RDWR | O_CREAT | O_TRUNC),			AUE_OPENAT_RWTC },
 	{ (O_RDWR | O_TRUNC),				AUE_OPENAT_RWT },
 	{ O_WRONLY,					AUE_OPENAT_W },
 	{ (O_WRONLY | O_CREAT),				AUE_OPENAT_WC },
 	{ (O_WRONLY | O_CREAT | O_TRUNC),		AUE_OPENAT_WTC },
 	{ (O_WRONLY | O_TRUNC),				AUE_OPENAT_WT },
 };
 
 /*
  * Audit events for msgsys(2), indexed by the "which" selector handed to
  * audit_msgsys_to_event().
  */
 static const int aue_msgsys[] = {
 	/* 0 */ AUE_MSGCTL,
 	/* 1 */ AUE_MSGGET,
 	/* 2 */ AUE_MSGSND,
 	/* 3 */ AUE_MSGRCV,
 };
 static const int aue_msgsys_count =
     sizeof(aue_msgsys) / sizeof(aue_msgsys[0]);
 
 /*
  * Audit events for semsys(2), indexed by the "which" selector handed to
  * audit_semsys_to_event().
  */
 static const int aue_semsys[] = {
 	/* 0 */ AUE_SEMCTL,
 	/* 1 */ AUE_SEMGET,
 	/* 2 */ AUE_SEMOP,
 };
 static const int aue_semsys_count =
     sizeof(aue_semsys) / sizeof(aue_semsys[0]);
 
 /*
  * Audit events for shmsys(2), indexed by the "which" selector handed to
  * audit_shmsys_to_event().
  */
 static const int aue_shmsys[] = {
 	/* 0 */ AUE_SHMAT,
 	/* 1 */ AUE_SHMDT,
 	/* 2 */ AUE_SHMGET,
 	/* 3 */ AUE_SHMCTL,
 };
 static const int aue_shmsys_count =
     sizeof(aue_shmsys) / sizeof(aue_shmsys[0]);
 
 /*
  * Decide whether an event is auditable: intersect the classes the event
  * belongs to ("class") with the success and/or failure half of the supplied
  * mask, as selected by the AU_PRS_SUCCESS / AU_PRS_FAILURE bits in "sorf".
  *
  * Returns 1 if any class is selected, 0 if none is, and -1 when mask_p is
  * NULL.  The "event" argument is not consulted.
  */
 int
 au_preselect(au_event_t event, au_class_t class, au_mask_t *mask_p, int sorf)
 {
 	au_class_t selected;
 
 	if (mask_p == NULL)
 		return (-1);
 
 	selected = 0;
 	if ((sorf & AU_PRS_SUCCESS) != 0)
 		selected |= mask_p->am_success & class;
 	if ((sorf & AU_PRS_FAILURE) != 0)
 		selected |= mask_p->am_failure & class;
 
 	return (selected != 0 ? 1 : 0);
 }
 
 /*
  * Convert sysctl names and present arguments to events.
  *
  * name is the sysctl MIB name and valid_arg the bitmask of audit arguments
  * that were captured.  Returns AUE_SYSCTL_NONADMIN for read-only lookups and
  * for changeable controls that were accessed without a new value (ARG_VALUE
  * not set); everything else, including names that cannot be classified,
  * maps to AUE_SYSCTL.
  *
  * NOTE(review): only name[0] is examined and it is compared against KERN_*
  * identifiers; confirm that callers pass the kern-level component here.
  */
 au_event_t
 audit_ctlname_to_sysctlevent(int name[], uint64_t valid_arg)
 {
 
 	/* Name or length not captured: cannot classify, assume worst case. */
 	if ((valid_arg & (ARG_CTLNAME | ARG_LEN)) != (ARG_CTLNAME | ARG_LEN))
 		return (AUE_SYSCTL);
 
 	switch (name[0]) {
 	/* Non-admin "lookups": treat them specially. */
 	case KERN_OSTYPE:
 	case KERN_OSRELEASE:
 	case KERN_OSREV:
 	case KERN_VERSION:
 	case KERN_ARGMAX:
 	case KERN_CLOCKRATE:
 	case KERN_BOOTTIME:
 	case KERN_POSIX1:
 	case KERN_NGROUPS:
 	case KERN_JOB_CONTROL:
 	case KERN_SAVED_IDS:
 	case KERN_OSRELDATE:
 	case KERN_DUMMY:
 		return (AUE_SYSCTL_NONADMIN);
 
 	/* Changeable controls are admin events only when a value is set. */
 	case KERN_MAXVNODES:
 	case KERN_MAXPROC:
 	case KERN_MAXFILES:
 	case KERN_MAXPROCPERUID:
 	case KERN_MAXFILESPERPROC:
 	case KERN_HOSTID:
 	case KERN_SECURELVL:
 	case KERN_HOSTNAME:
-	case KERN_VNODE:
 	case KERN_PROC:
 	case KERN_FILE:
 	case KERN_PROF:
 	case KERN_NISDOMAINNAME:
 	case KERN_UPDATEINTERVAL:
 	case KERN_NTP_PLL:
 	case KERN_BOOTFILE:
 	case KERN_DUMPDEV:
 	case KERN_IPC:
 	case KERN_PS_STRINGS:
 	case KERN_USRSTACK:
 	case KERN_LOGSIGEXIT:
 	case KERN_IOV_MAX:
 		return ((valid_arg & ARG_VALUE) ?
 		    AUE_SYSCTL : AUE_SYSCTL_NONADMIN);
 
 	default:
 		return (AUE_SYSCTL);
 	}
 	/* NOTREACHED */
 }
 
 /*
  * Convert an open flags specifier into a specific type of open event for
  * auditing purposes.
  */
 au_event_t
 audit_flags_and_error_to_openevent(int oflags, int error)
 {
 	int i;
 
 	/*
 	 * Need to check only those flags we care about.
 	 */
 	oflags = oflags & (O_RDONLY | O_CREAT | O_TRUNC | O_RDWR | O_WRONLY);
 	for (i = 0; i < nitems(aue_open); i++) {
 		if (aue_open[i].aoe_flags == oflags)
 			return (aue_open[i].aoe_event);
 	}
 	return (AUE_OPEN);
 }
 
 au_event_t
 audit_flags_and_error_to_openatevent(int oflags, int error)
 {
 	int i;
 
 	/*
 	 * Need to check only those flags we care about.
 	 */
 	oflags = oflags & (O_RDONLY | O_CREAT | O_TRUNC | O_RDWR | O_WRONLY);
 	for (i = 0; i < nitems(aue_openat); i++) {
 		if (aue_openat[i].aoe_flags == oflags)
 			return (aue_openat[i].aoe_event);
 	}
 	return (AUE_OPENAT);
 }
 
 /*
  * Translate a msgctl command into its dedicated audit event.  Commands
  * without a dedicated event (including invalid ones) yield AUE_MSGCTL.
  */
 au_event_t
 audit_msgctl_to_event(int cmd)
 {
 	au_event_t event;
 
 	switch (cmd) {
 	case IPC_RMID:
 		event = AUE_MSGCTL_RMID;
 		break;
 	case IPC_SET:
 		event = AUE_MSGCTL_SET;
 		break;
 	case IPC_STAT:
 		event = AUE_MSGCTL_STAT;
 		break;
 	default:
 		/* We will audit a bad command. */
 		event = AUE_MSGCTL;
 		break;
 	}
 	return (event);
 }
 
 /*
  * Translate a semctl command into its dedicated audit event.  Commands
  * without a dedicated event (including invalid ones) yield AUE_SEMCTL.
  */
 au_event_t
 audit_semctl_to_event(int cmd)
 {
 	au_event_t event;
 
 	switch (cmd) {
 	case GETALL:
 		event = AUE_SEMCTL_GETALL;
 		break;
 	case GETNCNT:
 		event = AUE_SEMCTL_GETNCNT;
 		break;
 	case GETPID:
 		event = AUE_SEMCTL_GETPID;
 		break;
 	case GETVAL:
 		event = AUE_SEMCTL_GETVAL;
 		break;
 	case GETZCNT:
 		event = AUE_SEMCTL_GETZCNT;
 		break;
 	case IPC_RMID:
 		event = AUE_SEMCTL_RMID;
 		break;
 	case IPC_SET:
 		event = AUE_SEMCTL_SET;
 		break;
 	case SETALL:
 		event = AUE_SEMCTL_SETALL;
 		break;
 	case SETVAL:
 		event = AUE_SEMCTL_SETVAL;
 		break;
 	case IPC_STAT:
 		event = AUE_SEMCTL_STAT;
 		break;
 	default:
 		/* We will audit a bad command. */
 		event = AUE_SEMCTL;
 		break;
 	}
 	return (event);
 }
 
 /*
  * Convert msgsys(2), semsys(2), and shmsys(2) system-call variations into
  * audit events, if possible.
  *
  * "which" indexes the aue_msgsys table; an out-of-range selector is audited
  * as the generic AUE_MSGSYS event.
  */
 au_event_t
 audit_msgsys_to_event(int which)
 {
 
 	/* Audit a bad command. */
 	if (which < 0 || which >= aue_msgsys_count)
 		return (AUE_MSGSYS);
 
 	return (aue_msgsys[which]);
 }
 
 /*
  * Map a semsys(2) selector to its audit event.  "which" indexes the
  * aue_semsys table; an out-of-range selector is audited as the generic
  * AUE_SEMSYS event.
  */
 au_event_t
 audit_semsys_to_event(int which)
 {
 
 	/* Audit a bad command. */
 	if (which < 0 || which >= aue_semsys_count)
 		return (AUE_SEMSYS);
 
 	return (aue_semsys[which]);
 }
 
 /*
  * Map a shmsys(2) selector to its audit event.  "which" indexes the
  * aue_shmsys table; an out-of-range selector is audited as the generic
  * AUE_SHMSYS event.
  */
 au_event_t
 audit_shmsys_to_event(int which)
 {
 
 	/* Audit a bad command. */
 	if (which < 0 || which >= aue_shmsys_count)
 		return (AUE_SHMSYS);
 
 	return (aue_shmsys[which]);
 }
 
 /*
  * Convert a command for the auditon() system call to an audit event.
  * Commands without a dedicated event, and unknown commands, map to the
  * generic AUE_AUDITON.
  */
 au_event_t
 auditon_command_event(int cmd)
 {
 	au_event_t event;
 
 	switch (cmd) {
 	case A_GETPOLICY:
 		event = AUE_AUDITON_GPOLICY;
 		break;
 	case A_SETPOLICY:
 		event = AUE_AUDITON_SPOLICY;
 		break;
 	case A_GETKMASK:
 		event = AUE_AUDITON_GETKMASK;
 		break;
 	case A_SETKMASK:
 		event = AUE_AUDITON_SETKMASK;
 		break;
 	case A_GETQCTRL:
 		event = AUE_AUDITON_GQCTRL;
 		break;
 	case A_SETQCTRL:
 		event = AUE_AUDITON_SQCTRL;
 		break;
 	case A_GETCWD:
 		event = AUE_AUDITON_GETCWD;
 		break;
 	case A_GETCAR:
 		event = AUE_AUDITON_GETCAR;
 		break;
 	case A_GETSTAT:
 		event = AUE_AUDITON_GETSTAT;
 		break;
 	case A_SETSTAT:
 		event = AUE_AUDITON_SETSTAT;
 		break;
 	case A_SETUMASK:
 		event = AUE_AUDITON_SETUMASK;
 		break;
 	case A_SETSMASK:
 		event = AUE_AUDITON_SETSMASK;
 		break;
 	case A_GETCOND:
 		event = AUE_AUDITON_GETCOND;
 		break;
 	case A_SETCOND:
 		event = AUE_AUDITON_SETCOND;
 		break;
 	case A_GETCLASS:
 		event = AUE_AUDITON_GETCLASS;
 		break;
 	case A_SETCLASS:
 		event = AUE_AUDITON_SETCLASS;
 		break;
 	case A_GETPINFO:
 	case A_SETPMASK:
 	case A_SETFSIZE:
 	case A_GETFSIZE:
 	case A_GETPINFO_ADDR:
 	case A_GETKAUDIT:
 	case A_SETKAUDIT:
 	default:
 		event = AUE_AUDITON;	/* No special record */
 		break;
 	}
 	return (event);
 }
 
 /*
  * Create a canonical path from given path by prefixing either the root
  * directory, or the current working directory.  If the process working
  * directory is NULL, we could use 'rootvnode' to obtain the root directory,
  * but this results in a volfs name written to the audit log. So we will
  * leave the filename starting with '/' in the audit log in this case.
  *
  * rdir is used as the prefix vnode when path starts with '/', cdir
  * otherwise.  The result is written to cpath, which must be at least
  * MAXPATHLEN bytes long.  cpath is set to the empty string when the path is
  * relative and cdir is NULL, when vn_fullpath_global() fails, or when the
  * assembled path does not fit.  The td argument is not referenced here.
  */
 void
 audit_canon_path_vp(struct thread *td, struct vnode *rdir, struct vnode *cdir,
     char *path, char *cpath)
 {
 	struct vnode *vp;
 	char *rbuf, *fbuf, *copy;
 	struct sbuf sbf;
 	int error;
 
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "%s: at %s:%d",
 	    __func__,  __FILE__, __LINE__);
 
 	copy = path;
 	if (*path == '/') {
 		vp = rdir;
 	} else {
 		if (cdir == NULL) {
 			cpath[0] = '\0';
 			return;
 		}
 		vp = cdir;
 	}
 	MPASS(vp != NULL);
 	/*
 	 * NB: We require that the supplied array be at least MAXPATHLEN bytes
 	 * long.  If this is not the case, then we can run into serious trouble.
 	 */
 	(void) sbuf_new(&sbf, cpath, MAXPATHLEN, SBUF_FIXEDLEN);
 	/*
 	 * Strip leading forward slashes.
 	 *
 	 * Note this does nothing to fully canonicalize the path.
 	 */
 	while (*copy == '/')
 		copy++;
 	/*
 	 * Make sure we handle chroot(2) and prepend the global path to these
 	 * environments.
 	 *
 	 * NB: vn_fullpath(9) on FreeBSD is less reliable than vn_getpath(9)
 	 * on Darwin.  As a result, this may need some additional attention
 	 * in the future.
 	 */
 	error = vn_fullpath_global(vp, &rbuf, &fbuf);
 	if (error) {
 		cpath[0] = '\0';
 		return;
 	}
 	(void) sbuf_cat(&sbf, rbuf);
 	/*
 	 * We are going to concatenate the resolved path with the passed path
 	 * with all slashes removed and we want them glued with a single slash.
 	 * However, if the directory is /, the slash is already there.
 	 */
 	if (rbuf[1] != '\0')
 		(void) sbuf_putc(&sbf, '/');
 	/* rbuf is not used past this point; release the lookup buffer. */
 	free(fbuf, M_TEMP);
 	/*
 	 * Now that we have processed any alternate root and relative path
 	 * names, add the supplied pathname.
 	 */
 	(void) sbuf_cat(&sbf, copy);
 	/*
 	 * One or more of the previous sbuf operations could have resulted in
 	 * the supplied buffer being overflowed.  Check to see if this is the
 	 * case.
 	 */
 	if (sbuf_error(&sbf) != 0) {
 		cpath[0] = '\0';
 		return;
 	}
 	sbuf_finish(&sbf);
 }
 
 /*
  * Build the audit-log form of path for thread td and store it in cpath.
  *
  * The root directory comes from the thread's pwd structure.  For a relative
  * path the starting directory is the pwd working directory when dirfd is
  * AT_FDCWD, otherwise the vnode obtained from dirfd with fgetvp(); if that
  * lookup fails cpath is set to the empty string.  The path itself is
  * assembled by audit_canon_path_vp().
  */
 void
 audit_canon_path(struct thread *td, int dirfd, char *path, char *cpath)
 {
 	struct vnode *cdir, *rdir;
 	struct pwd *pwd;
 	cap_rights_t rights;
 	int error;
 	bool vrele_cdir;	/* true when cdir came from fgetvp() */
 
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "%s: at %s:%d",
 	    __func__,  __FILE__, __LINE__);
 
 	pwd = pwd_hold(td);
 	rdir = pwd->pwd_rdir;
 	cdir = NULL;
 	vrele_cdir = false;
 	/* An absolute path needs no starting directory; cdir stays NULL. */
 	if (*path != '/') {
 		if (dirfd == AT_FDCWD) {
 			cdir = pwd->pwd_cdir;
 		} else {
 			error = fgetvp(td, dirfd, cap_rights_init(&rights), &cdir);
 			if (error != 0) {
 				cpath[0] = '\0';
 				pwd_drop(pwd);
 				return;
 			}
 			vrele_cdir = true;
 		}
 	}
 
 	audit_canon_path_vp(td, rdir, cdir, path, cpath);
 
 	pwd_drop(pwd);
 	if (vrele_cdir)
 		vrele(cdir);
 }
diff --git a/sys/sys/sysctl.h b/sys/sys/sysctl.h
index 79da4772615f..9cfd759166bd 100644
--- a/sys/sys/sysctl.h
+++ b/sys/sys/sysctl.h
@@ -1,1214 +1,1214 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Mike Karels at Berkeley Software Design, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)sysctl.h	8.1 (Berkeley) 6/2/93
  * $FreeBSD$
  */
 
 #ifndef _SYS_SYSCTL_H_
 #define	_SYS_SYSCTL_H_
 
 #ifdef _KERNEL
 #include <sys/queue.h>
 #include <sys/tree.h>
 #endif
 
 /*
  * Definitions for sysctl call.  The sysctl call uses a hierarchical name
  * for objects that can be examined or modified.  The name is expressed as
  * a sequence of integers.  Like a file path name, the meaning of each
  * component depends on its place in the hierarchy.  The top-level and kern
  * identifiers are defined here, and other identifiers are defined in the
  * respective subsystem header files.
  *
  * Each subsystem defined by sysctl defines a list of variables for that
  * subsystem. Each name is either a node with further levels defined below it,
  * or it is a leaf of some particular type given below. Each sysctl level
  * defines a set of name/type pairs to be used by sysctl(8) in manipulating the
  * subsystem.
  */
 
 #define	CTL_MAXNAME	24	/* largest number of components supported */
 
 /*
  * Value type codes.  A type code is OR'ed together with CTLFLAG_* bits to
  * form the "kind" of an oid; CTLTYPE is the mask covering the type code.
  */
 #define	CTLTYPE		0xf	/* mask for the type */
 #define	CTLTYPE_NODE	1	/* name is a node */
 #define	CTLTYPE_INT	2	/* name describes an integer */
 #define	CTLTYPE_STRING	3	/* name describes a string */
 #define	CTLTYPE_S64	4	/* name describes a signed 64-bit number */
 #define	CTLTYPE_OPAQUE	5	/* name describes a structure */
 #define	CTLTYPE_STRUCT	CTLTYPE_OPAQUE	/* name describes a structure */
 #define	CTLTYPE_UINT	6	/* name describes an unsigned integer */
 #define	CTLTYPE_LONG	7	/* name describes a long */
 #define	CTLTYPE_ULONG	8	/* name describes an unsigned long */
 #define	CTLTYPE_U64	9	/* name describes an unsigned 64-bit number */
 #define	CTLTYPE_U8	0xa	/* name describes an unsigned 8-bit number */
 #define	CTLTYPE_U16	0xb	/* name describes an unsigned 16-bit number */
 #define	CTLTYPE_S8	0xc	/* name describes a signed 8-bit number */
 #define	CTLTYPE_S16	0xd	/* name describes a signed 16-bit number */
 #define	CTLTYPE_S32	0xe	/* name describes a signed 32-bit number */
 #define	CTLTYPE_U32	0xf	/* name describes an unsigned 32-bit number */
 
 #define	CTLFLAG_RD	0x80000000	/* Allow reads of variable */
 #define	CTLFLAG_WR	0x40000000	/* Allow writes to the variable */
 #define	CTLFLAG_RW	(CTLFLAG_RD|CTLFLAG_WR)
 #define	CTLFLAG_DORMANT	0x20000000	/* This sysctl is not active yet */
 #define	CTLFLAG_ANYBODY	0x10000000	/* All users can set this var */
 #define	CTLFLAG_SECURE	0x08000000	/* Permit set only if securelevel<=0 */
 #define	CTLFLAG_PRISON	0x04000000	/* Prisoned roots can fiddle */
 #define	CTLFLAG_DYN	0x02000000	/* Dynamic oid - can be freed */
 #define	CTLFLAG_SKIP	0x01000000	/* Skip this sysctl when listing */
 #define	CTLMASK_SECURE	0x00F00000	/* Secure level */
 #define	CTLFLAG_TUN	0x00080000	/* Default value is loaded from getenv() */
 #define	CTLFLAG_RDTUN	(CTLFLAG_RD|CTLFLAG_TUN)
 #define	CTLFLAG_RWTUN	(CTLFLAG_RW|CTLFLAG_TUN)
 #define	CTLFLAG_MPSAFE	0x00040000	/* Handler is MP safe */
 #define	CTLFLAG_VNET	0x00020000	/* Prisons with vnet can fiddle */
 #define	CTLFLAG_DYING	0x00010000	/* Oid is being removed */
 #define	CTLFLAG_CAPRD	0x00008000	/* Can be read in capability mode */
 #define	CTLFLAG_CAPWR	0x00004000	/* Can be written in capability mode */
 #define	CTLFLAG_STATS	0x00002000	/* Statistics, not a tunable */
 #define	CTLFLAG_NOFETCH	0x00001000	/* Don't fetch tunable from getenv() */
 #define	CTLFLAG_CAPRW	(CTLFLAG_CAPRD|CTLFLAG_CAPWR)
 /*
  * This is a transient flag to be used until all sysctl handlers are
  * converted to not lock Giant.
  * Exactly one of CTLFLAG_MPSAFE or CTLFLAG_NEEDGIANT is required
  * for SYSCTL_PROC and SYSCTL_NODE.
  */
 #define	CTLFLAG_NEEDGIANT 0x00000800	/* Handler requires Giant */
 
 /*
  * Secure level.  Note that CTLFLAG_SECURE == CTLFLAG_SECURE1.
  *
  * CTLFLAG_SECUREn: secure when the securelevel is raised to at least n.
  * The value n - 1 is stored in the CTLMASK_SECURE bits, which start at bit
  * CTLSHIFT_SECURE.
  */
 #define	CTLSHIFT_SECURE	20
 #define	CTLFLAG_SECURE1	(CTLFLAG_SECURE | (0 << CTLSHIFT_SECURE))
 #define	CTLFLAG_SECURE2	(CTLFLAG_SECURE | (1 << CTLSHIFT_SECURE))
 #define	CTLFLAG_SECURE3	(CTLFLAG_SECURE | (2 << CTLSHIFT_SECURE))
 
 /*
  * USE THIS instead of a hardwired number from the categories below
  * to get dynamically assigned sysctl entries using the linker-set
  * technology. This is the way nearly all new sysctl variables should
  * be implemented.
  * e.g. SYSCTL_INT(_parent, OID_AUTO, name, CTLFLAG_RW, &variable, 0, "");
  */
 #define	OID_AUTO	(-1)
 
 /*
  * The starting number for dynamically-assigned entries.  WARNING!
  * ALL static sysctl entries should have numbers LESS than this!
  */
 #define	CTL_AUTO_START	0x100
 
 #ifdef _KERNEL
 #include <sys/linker_set.h>
 
 /*
  * Mask applied to the "access" argument by the CTASSERTs in the SYSCTL_*()
  * macros below.  Those asserts accept an access value whose CTLTYPE bits are
  * all zero, or whose bits selected by this mask equal the macro's own
  * CTLTYPE_* code.  With a mask of 0 the second alternative can never hold
  * (every CTLTYPE_* code is non-zero), so any type bits in access are
  * rejected; with a mask of CTLTYPE a matching, redundant type is tolerated.
  */
 #ifdef KLD_MODULE
 /* XXX allow overspecification of type in external kernel modules */
 #define	SYSCTL_CT_ASSERT_MASK CTLTYPE
 #else
 #define	SYSCTL_CT_ASSERT_MASK 0
 #endif
 
 #define	SYSCTL_HANDLER_ARGS struct sysctl_oid *oidp, void *arg1,	\
 	intmax_t arg2, struct sysctl_req *req
 
 /* definitions for sysctl_req 'lock' member */
 #define	REQ_UNWIRED	1
 #define	REQ_WIRED	2
 
 /* definitions for sysctl_req 'flags' member */
 #ifdef COMPAT_FREEBSD32
 #define	SCTL_MASK32	1	/* 32 bit emulation */
 #endif
 
 /*
  * This describes the access space for a sysctl request.  This is needed
  * so that we can use the interface from the kernel or from user-space.
  *
  * The old* members describe the buffer for the current value and the new*
  * members the buffer holding a new value.  Handlers move data through the
  * oldfunc and newfunc callbacks, normally via SYSCTL_OUT() and SYSCTL_IN().
  */
 struct thread;
 struct sysctl_req {
 	struct thread	*td;		/* used for access checking */
 	int		 lock;		/* wiring state */
 	void		*oldptr;
 	size_t		 oldlen;
 	size_t		 oldidx;
 	/* Called by SYSCTL_OUT() and SYSCTL_OUT_STR(). */
 	int		(*oldfunc)(struct sysctl_req *, const void *, size_t);
 	const void		*newptr;
 	size_t		 newlen;
 	size_t		 newidx;
 	/* Called by SYSCTL_IN(). */
 	int		(*newfunc)(struct sysctl_req *, void *, size_t);
 	size_t		 validlen;
 	int		 flags;
 };
 
 struct sysctl_oid;
 
 /* RB Tree handling */
 RB_HEAD(sysctl_oid_list, sysctl_oid);
 
 /*
  * This describes one "oid" in the MIB tree.  Potentially more nodes can
  * be hidden behind it, expanded by the handler.
  *
  * Siblings are kept in a red-black tree (struct sysctl_oid_list) linked
  * through oid_link and ordered by oid_number; see cmp_sysctl_oid().
  */
 struct sysctl_oid {
 	/* Tree of this oid's children. */
 	struct sysctl_oid_list	oid_children;
 	/*
 	 * The child tree this oid is a member of: the parent's oid_children,
 	 * or &sysctl__children for a top-level oid (see SYSCTL_PARENT()).
 	 */
 	struct sysctl_oid_list*	oid_parent;
 	RB_ENTRY(sysctl_oid) oid_link;
 	/* Sort key for all siblings, and lookup key for userland */
 	int		 oid_number;
 	/* CTLTYPE_* code OR'ed with CTLFLAG_* bits. */
 	u_int		 oid_kind;
 	void		*oid_arg1;
 	intmax_t	 oid_arg2;
 	/* Must be unique amongst all siblings. */
 	const char	*oid_name;
 	int		(*oid_handler)(SYSCTL_HANDLER_ARGS);
 	const char	*oid_fmt;
 	int		 oid_refcnt;
 	u_int		 oid_running;
 	const char	*oid_descr;
 	const char	*oid_label;
 };
 
 /*
  * Three-way comparison of two oids by oid_number, used as the ordering
  * function of the sysctl_oid_list red-black tree.  Returns 1, -1 or 0 when
  * a sorts after, before, or equal to b.
  */
 static inline int
 cmp_sysctl_oid(struct sysctl_oid *a, struct sysctl_oid *b)
 {
 	const int lhs = a->oid_number, rhs = b->oid_number;
 
 	return ((lhs > rhs) - (lhs < rhs));
 }
 
 RB_PROTOTYPE(sysctl_oid_list, sysctl_oid, oid_link, cmp_sysctl_oid);
 
 /*
  * Move data between a handler and the requester through the request's
  * callbacks: SYSCTL_IN() calls newfunc, SYSCTL_OUT() calls oldfunc, and
  * SYSCTL_OUT_STR() sends the string p including its terminating NUL.
  *
  * r is a pointer to the struct sysctl_req.  It is parenthesized before the
  * member access so that any pointer expression (e.g. "reqs + 1") works, but
  * it is still evaluated twice (as is p in SYSCTL_OUT_STR()), so arguments
  * must be free of side effects.
  */
 #define	SYSCTL_IN(r, p, l)	((r)->newfunc)(r, p, l)
 #define	SYSCTL_OUT(r, p, l)	((r)->oldfunc)(r, p, l)
 #define	SYSCTL_OUT_STR(r, p)	((r)->oldfunc)(r, p, strlen(p) + 1)
 
 int sysctl_handle_bool(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_8(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_16(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_32(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_64(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_int(SYSCTL_HANDLER_ARGS);
 int sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_long(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_string(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_opaque(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_counter_u64(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_counter_u64_array(SYSCTL_HANDLER_ARGS);
 
 int sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS);
 
 int sysctl_msec_to_sbintime(SYSCTL_HANDLER_ARGS);
 int sysctl_usec_to_sbintime(SYSCTL_HANDLER_ARGS);
 int sysctl_sec_to_timeval(SYSCTL_HANDLER_ARGS);
 
 int sysctl_dpcpu_int(SYSCTL_HANDLER_ARGS);
 int sysctl_dpcpu_long(SYSCTL_HANDLER_ARGS);
 int sysctl_dpcpu_quad(SYSCTL_HANDLER_ARGS);
 
 /*
  * These functions are used to add/remove an oid from the mib.
  */
 void sysctl_register_oid(struct sysctl_oid *oidp);
 void sysctl_register_disabled_oid(struct sysctl_oid *oidp);
 void sysctl_enable_oid(struct sysctl_oid *oidp);
 void sysctl_unregister_oid(struct sysctl_oid *oidp);
 
 /* Declare a static oid to allow child oids to be added to it. */
 #define	SYSCTL_DECL(name)			\
 	extern struct sysctl_oid sysctl__##name
 
 /*
  * Hide these in macros.
  *
  * SYSCTL_CHILDREN() yields a pointer to the child tree of *oid_ptr.
  * SYSCTL_PARENT() yields the oid whose oid_children tree oid_ptr is a member
  * of, or NULL when oid_ptr hangs directly off sysctl__children.
  * SYSCTL_STATIC_CHILDREN() yields the child tree of the statically declared
  * oid sysctl__<oid_name>.
  */
 #define	SYSCTL_CHILDREN(oid_ptr)		(&(oid_ptr)->oid_children)
 #define	SYSCTL_PARENT(oid_ptr)					\
     (((oid_ptr)->oid_parent != &sysctl__children) ?		\
 	__containerof((oid_ptr)->oid_parent, struct sysctl_oid,	\
 	oid_children) : (struct sysctl_oid *)NULL)
 #define	SYSCTL_STATIC_CHILDREN(oid_name)	(&sysctl__##oid_name.oid_children)
 
 /* === Structs and macros related to context handling. === */
 
 /* All dynamically created sysctls can be tracked in a context list. */
 struct sysctl_ctx_entry {
 	struct sysctl_oid *entry;
 	TAILQ_ENTRY(sysctl_ctx_entry) link;
 };
 
 TAILQ_HEAD(sysctl_ctx_list, sysctl_ctx_entry);
 
 #define	SYSCTL_NODE_CHILDREN(parent, name) \
 	sysctl__##parent##_##name.oid_children
 
 /*
  * Wrapper for oid description strings.  When NO_SYSCTL_DESCR is defined
  * every description is replaced by the empty string.
  */
 #ifndef NO_SYSCTL_DESCR
 #define	__DESCR(d) d
 #else
 #define	__DESCR(d) ""
 #endif
 
 /*
  * Compile-time check that exactly one of CTLFLAG_MPSAFE and
  * CTLFLAG_NEEDGIANT is present in x.  The check is only compiled in when
  * "notyet" is defined; otherwise the macro expands to nothing.
  */
 #ifdef	notyet
 #define	SYSCTL_ENFORCE_FLAGS(x)						\
     _Static_assert((((x) & CTLFLAG_MPSAFE) != 0) ^ (((x) & CTLFLAG_NEEDGIANT) != 0), \
         "Has to be either CTLFLAG_MPSAFE or CTLFLAG_NEEDGIANT")
 #else
 #define	SYSCTL_ENFORCE_FLAGS(x)
 #endif
 
 /*
  * This macro is only for internal use.
  *
  * Defines the struct sysctl_oid object "id" with the given fields, adds it
  * to the "sysctl_set" linker set with DATA_SET(), and applies
  * SYSCTL_ENFORCE_FLAGS() to "kind".  A storage class such as "static" is
  * supplied by the caller in front of the expansion.
  */
 #define	SYSCTL_OID_RAW(id, parent_child_head, nbr, name, kind, a1, a2, handler, fmt, descr, label) \
 	struct sysctl_oid id = {					\
 		.oid_parent = (parent_child_head),			\
 		.oid_children = RB_INITIALIZER(&id.oid_children), \
 		.oid_number = (nbr),					\
 		.oid_kind = (kind),					\
 		.oid_arg1 = (a1),					\
 		.oid_arg2 = (a2),					\
 		.oid_name = (name),					\
 		.oid_handler = (handler),				\
 		.oid_fmt = (fmt),					\
 		.oid_descr = __DESCR(descr),				\
 		.oid_label = (label),					\
 	};								\
 	DATA_SET(sysctl_set, id);					\
 	SYSCTL_ENFORCE_FLAGS(kind)
 
 /* This constructs a static "raw" MIB oid. */
 #define	SYSCTL_OID(parent, nbr, name, kind, a1, a2, handler, fmt, descr) \
 	SYSCTL_OID_WITH_LABEL(parent, nbr, name, kind, a1, a2,		\
 	    handler, fmt, descr, NULL)
 
 /*
  * Like SYSCTL_OID() but with an explicit label.  Defines the file-local
  * (static) oid sysctl__<parent>_<name> as a child of sysctl__<parent>; the
  * oid's name string is the stringified "name" argument.
  */
 #define	SYSCTL_OID_WITH_LABEL(parent, nbr, name, kind, a1, a2, handler, fmt, descr, label) \
     static SYSCTL_OID_RAW(sysctl__##parent##_##name,			\
 	SYSCTL_CHILDREN(&sysctl__##parent),				\
 	nbr, #name, kind, a1, a2, handler, fmt, descr, label)
 
 /* This constructs a global "raw" MIB oid. */
 #define	SYSCTL_OID_GLOBAL(parent, nbr, name, kind, a1, a2, handler, fmt, descr, label) \
     SYSCTL_OID_RAW(sysctl__##parent##_##name, \
 	SYSCTL_CHILDREN(&sysctl__##parent),	\
 	nbr, #name, kind, a1, a2, handler, fmt, descr, label)
 
 /*
  * Run-time counterpart of SYSCTL_OID(): a statement expression that applies
  * SYSCTL_ENFORCE_FLAGS() to "kind" and evaluates to the result of
  * sysctl_add_oid() called with a NULL label.
  */
 #define	SYSCTL_ADD_OID(ctx, parent, nbr, name, kind, a1, a2, handler, fmt, descr) \
 ({									\
 	SYSCTL_ENFORCE_FLAGS(kind);					\
 	sysctl_add_oid(ctx, parent, nbr, name, kind, a1, a2,handler,	\
 	    fmt, __DESCR(descr), NULL);					\
 })
 
 /*
  * This constructs a root node from which other nodes can hang.
  *
  * The node is named sysctl___<name>, has external linkage and is parented
  * directly on sysctl__children.  The CTASSERT rejects an "access" argument
  * that carries a type code other than CTLTYPE_NODE (see
  * SYSCTL_CT_ASSERT_MASK).
  */
 #define	SYSCTL_ROOT_NODE(nbr, name, access, handler, descr)	\
 	SYSCTL_OID_RAW(sysctl___##name, &sysctl__children,	\
 	    nbr, #name, CTLTYPE_NODE|(access), NULL, 0,		\
 	    handler, "N", descr, NULL);				\
 	CTASSERT(((access) & CTLTYPE) == 0 ||			\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_NODE)
 
 /*
  * This constructs a node from which other oids can hang.
  *
  * SYSCTL_NODE() is SYSCTL_NODE_WITH_LABEL() with a NULL label.  The node is
  * defined through SYSCTL_OID_GLOBAL() with kind CTLTYPE_NODE|(access) and
  * format "N".  The CTASSERT rejects an "access" argument that carries a type
  * code other than CTLTYPE_NODE.
  */
 #define	SYSCTL_NODE(parent, nbr, name, access, handler, descr) \
 	SYSCTL_NODE_WITH_LABEL(parent, nbr, name, access, handler, descr, NULL)
 
 #define	SYSCTL_NODE_WITH_LABEL(parent, nbr, name, access, handler, descr, label) \
 	SYSCTL_OID_GLOBAL(parent, nbr, name, CTLTYPE_NODE|(access),	\
 	    NULL, 0, handler, "N", descr, label);			\
 	SYSCTL_ENFORCE_FLAGS(access);					\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_NODE)
 
 /*
  * Run-time creation of a node.  SYSCTL_ADD_NODE() is
  * SYSCTL_ADD_NODE_WITH_LABEL() with a NULL label; the latter is a statement
  * expression that checks "access" at compile time and evaluates to the
  * result of sysctl_add_oid() for a CTLTYPE_NODE oid with format "N".
  */
 #define	SYSCTL_ADD_NODE(ctx, parent, nbr, name, access, handler, descr)	\
 	SYSCTL_ADD_NODE_WITH_LABEL(ctx, parent, nbr, name, access, \
 	    handler, descr, NULL)
 
 #define	SYSCTL_ADD_NODE_WITH_LABEL(ctx, parent, nbr, name, access, handler, descr, label) \
 ({									\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_NODE);	\
 	SYSCTL_ENFORCE_FLAGS(access);					\
 	sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_NODE|(access),	\
 	    NULL, 0, handler, "N", __DESCR(descr), label);		\
 })
 
 /*
  * Run-time creation of a root node: like SYSCTL_ADD_NODE() but the new oid
  * is always added to &sysctl__children and carries no label.
  */
 #define	SYSCTL_ADD_ROOT_NODE(ctx, nbr, name, access, handler, descr)	\
 ({									\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_NODE);	\
 	SYSCTL_ENFORCE_FLAGS(access);					\
 	sysctl_add_oid(ctx, &sysctl__children, nbr, name,		\
 	    CTLTYPE_NODE|(access),					\
 	    NULL, 0, handler, "N", __DESCR(descr), NULL);		\
 })
 
 /*
  * Oid for a string.  len can be 0 to indicate '\0' termination.
  *
  * The oid is always created with CTLFLAG_MPSAFE, uses sysctl_handle_string
  * as its handler and "A" as its format.  The CTASSERT rejects an "access"
  * argument that carries a type code other than CTLTYPE_STRING.
  */
 #define	SYSCTL_STRING(parent, nbr, name, access, arg, len, descr)	\
 	SYSCTL_OID(parent, nbr, name,					\
 	    CTLTYPE_STRING | CTLFLAG_MPSAFE | (access),			\
 	    arg, len, sysctl_handle_string, "A", descr);		\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_STRING)
 
 /*
  * Run-time counterpart of SYSCTL_STRING().  The local __arg forces "arg" to
  * be assignable to a char *, and the statement expression evaluates to the
  * result of sysctl_add_oid().
  */
 #define	SYSCTL_ADD_STRING(ctx, parent, nbr, name, access, arg, len, descr) \
 ({									\
 	char *__arg = (arg);						\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_STRING);	\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_STRING | CTLFLAG_MPSAFE | (access),			\
 	    __arg, len, sysctl_handle_string, "A", __DESCR(descr),	\
 	    NULL); \
 })
 
 /*
  * Oid for a constant '\0' terminated string.
  *
  * The const qualifier of "arg" is stripped with __DECONST() so that it can
  * be stored in the oid; in exchange the first CTASSERT refuses any "access"
  * value that contains CTLFLAG_WR.  "access" is parenthesized in that check
  * so that flag combinations such as "CTLFLAG_RD | CTLFLAG_CAPRD" are tested
  * as a whole rather than as "CTLFLAG_RD | (CTLFLAG_CAPRD & CTLFLAG_WR)".
  * The second CTASSERT rejects a type code other than CTLTYPE_STRING.
  */
 #define	SYSCTL_CONST_STRING(parent, nbr, name, access, arg, descr)	\
 	SYSCTL_OID(parent, nbr, name, CTLTYPE_STRING | CTLFLAG_MPSAFE | (access),\
 	    __DECONST(char *, arg), 0, sysctl_handle_string, "A", descr); \
 	CTASSERT(!((access) & CTLFLAG_WR));				\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_STRING)
 
 /*
  * Run-time counterpart of SYSCTL_CONST_STRING().
  *
  * The const qualifier of "arg" is stripped with __DECONST(); in exchange the
  * first CTASSERT refuses any "access" value that contains CTLFLAG_WR.
  * "access" is parenthesized in that check so that flag combinations such as
  * "CTLFLAG_RD | CTLFLAG_CAPRD" are tested as a whole.  The statement
  * expression evaluates to the result of sysctl_add_oid().
  */
 #define	SYSCTL_ADD_CONST_STRING(ctx, parent, nbr, name, access, arg, descr) \
 ({									\
 	char *__arg = __DECONST(char *, arg);				\
 	CTASSERT(!((access) & CTLFLAG_WR));				\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_STRING);	\
 	sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_STRING |		\
 	    CTLFLAG_MPSAFE | (access), __arg, 0, sysctl_handle_string, "A",\
 	    __DESCR(descr), NULL);					\
 })
 
 /*
  * Oid for a bool.  If ptr is NULL, val is returned.
  *
  * The oid is created with type CTLTYPE_U8, format "CU" and handler
  * sysctl_handle_bool.  Unlike the integer macros, the CTASSERT accepts no
  * type code in "access" at all; SYSCTL_BOOL() additionally requires *ptr to
  * have the size of a bool.
  */
 #define	SYSCTL_NULL_BOOL_PTR ((bool *)NULL)
 #define	SYSCTL_BOOL(parent, nbr, name, access, ptr, val, descr)	\
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_U8 | CTLFLAG_MPSAFE | (access),		\
 	    ptr, val, sysctl_handle_bool, "CU", descr);		\
 	CTASSERT(((access) & CTLTYPE) == 0 &&			\
 	    sizeof(bool) == sizeof(*(ptr)))
 
 /* Run-time variant; the local __ptr forces ptr to be a bool pointer. */
 #define	SYSCTL_ADD_BOOL(ctx, parent, nbr, name, access, ptr, val, descr) \
 ({									\
 	bool *__ptr = (ptr);						\
 	CTASSERT(((access) & CTLTYPE) == 0);				\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_U8 | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, val, sysctl_handle_bool, "CU", __DESCR(descr),	\
 	    NULL);							\
 })
 
 /*
  * Oid for a signed 8-bit int.  If ptr is NULL, val is returned.
  *
  * Created with type CTLTYPE_S8, format "C" and handler sysctl_handle_8.  The
  * CTASSERT rejects an "access" argument carrying a type code other than
  * CTLTYPE_S8 and, for SYSCTL_S8(), a *ptr that is not int8_t-sized.
  */
 #define	SYSCTL_NULL_S8_PTR ((int8_t *)NULL)
 #define	SYSCTL_S8(parent, nbr, name, access, ptr, val, descr)	\
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_S8 | CTLFLAG_MPSAFE | (access),		\
 	    ptr, val, sysctl_handle_8, "C", descr);		\
 	CTASSERT((((access) & CTLTYPE) == 0 ||			\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S8) && \
 	    sizeof(int8_t) == sizeof(*(ptr)))
 
 /* Run-time variant; the local __ptr forces ptr to be an int8_t pointer. */
 #define	SYSCTL_ADD_S8(ctx, parent, nbr, name, access, ptr, val, descr)	\
 ({									\
 	int8_t *__ptr = (ptr);						\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S8);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_S8 | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, val, sysctl_handle_8, "C", __DESCR(descr), NULL);	\
 })
 
 /*
  * Oid for an unsigned 8-bit int.  If ptr is NULL, val is returned.
  *
  * Created with type CTLTYPE_U8, format "CU" and handler sysctl_handle_8.
  * The CTASSERT rejects an "access" argument carrying a type code other than
  * CTLTYPE_U8 and, for SYSCTL_U8(), a *ptr that is not uint8_t-sized.
  */
 #define	SYSCTL_NULL_U8_PTR ((uint8_t *)NULL)
 #define	SYSCTL_U8(parent, nbr, name, access, ptr, val, descr)	\
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_U8 | CTLFLAG_MPSAFE | (access),		\
 	    ptr, val, sysctl_handle_8, "CU", descr);		\
 	CTASSERT((((access) & CTLTYPE) == 0 ||			\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U8) && \
 	    sizeof(uint8_t) == sizeof(*(ptr)))
 
 /* Run-time variant; the local __ptr forces ptr to be a uint8_t pointer. */
 #define	SYSCTL_ADD_U8(ctx, parent, nbr, name, access, ptr, val, descr)	\
 ({									\
 	uint8_t *__ptr = (ptr);						\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U8);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_U8 | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, val, sysctl_handle_8, "CU", __DESCR(descr), NULL);	\
 })
 
 /*
  * Oid for a signed 16-bit int.  If ptr is NULL, val is returned.
  *
  * SYSCTL_NULL_S16_PTR is a typed NULL, so the sizeof(*(ptr)) test in the
  * CTASSERT still sees an int16_t.  The oid is declared with SYSCTL_OID as
  * CTLTYPE_S16 | CTLFLAG_MPSAFE, handler sysctl_handle_16, format "S".
  */
 #define	SYSCTL_NULL_S16_PTR ((int16_t *)NULL)
 #define	SYSCTL_S16(parent, nbr, name, access, ptr, val, descr)	\
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_S16 | CTLFLAG_MPSAFE | (access),		\
 	    ptr, val, sysctl_handle_16, "S", descr);		\
 	CTASSERT((((access) & CTLTYPE) == 0 ||			\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S16) && \
 	    sizeof(int16_t) == sizeof(*(ptr)))
 
 /*
  * Run-time counterpart of SYSCTL_S16.  ptr is bound to an int16_t * local
  * so the compiler checks its type; the statement expression evaluates to
  * the result of sysctl_add_oid().
  */
 #define	SYSCTL_ADD_S16(ctx, parent, nbr, name, access, ptr, val, descr)	\
 ({									\
 	int16_t *__ptr = (ptr);						\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S16);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_S16 | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, val, sysctl_handle_16, "S", __DESCR(descr), NULL);	\
 })
 
 /*
  * Oid for an unsigned 16-bit int.  If ptr is NULL, val is returned.
  *
  * SYSCTL_NULL_U16_PTR is a typed NULL, so the sizeof(*(ptr)) test in the
  * CTASSERT still sees a uint16_t.  The oid is declared with SYSCTL_OID as
  * CTLTYPE_U16 | CTLFLAG_MPSAFE, handler sysctl_handle_16, format "SU".
  */
 #define	SYSCTL_NULL_U16_PTR ((uint16_t *)NULL)
 #define	SYSCTL_U16(parent, nbr, name, access, ptr, val, descr)	\
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_U16 | CTLFLAG_MPSAFE | (access),		\
 	    ptr, val, sysctl_handle_16, "SU", descr);		\
 	CTASSERT((((access) & CTLTYPE) == 0 ||			\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U16) && \
 	    sizeof(uint16_t) == sizeof(*(ptr)))
 
 /*
  * Run-time counterpart of SYSCTL_U16.  ptr is bound to a uint16_t * local
  * so the compiler checks its type; the statement expression evaluates to
  * the result of sysctl_add_oid().
  */
 #define	SYSCTL_ADD_U16(ctx, parent, nbr, name, access, ptr, val, descr)	\
 ({									\
 	uint16_t *__ptr = (ptr);					\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U16);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_U16 | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, val, sysctl_handle_16, "SU", __DESCR(descr), NULL);	\
 })
 
 /*
  * Oid for a signed 32-bit int.  If ptr is NULL, val is returned.
  *
  * SYSCTL_NULL_S32_PTR is a typed NULL, so the sizeof(*(ptr)) test in the
  * CTASSERT still sees an int32_t.  The oid is declared with SYSCTL_OID as
  * CTLTYPE_S32 | CTLFLAG_MPSAFE, handler sysctl_handle_32, format "I".
  */
 #define	SYSCTL_NULL_S32_PTR ((int32_t *)NULL)
 #define	SYSCTL_S32(parent, nbr, name, access, ptr, val, descr)	\
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_S32 | CTLFLAG_MPSAFE | (access),		\
 	    ptr, val, sysctl_handle_32, "I", descr);		\
 	CTASSERT((((access) & CTLTYPE) == 0 ||			\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S32) && \
 	    sizeof(int32_t) == sizeof(*(ptr)))
 
 /*
  * Run-time counterpart of SYSCTL_S32.  ptr is bound to an int32_t * local
  * so the compiler checks its type; the statement expression evaluates to
  * the result of sysctl_add_oid().
  */
 #define	SYSCTL_ADD_S32(ctx, parent, nbr, name, access, ptr, val, descr)	\
 ({									\
 	int32_t *__ptr = (ptr);						\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S32);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_S32 | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, val, sysctl_handle_32, "I", __DESCR(descr), NULL);	\
 })
 
 /*
  * Oid for an unsigned 32-bit int.  If ptr is NULL, val is returned.
  *
  * SYSCTL_NULL_U32_PTR is a typed NULL, so the sizeof(*(ptr)) test in the
  * CTASSERT still sees a uint32_t.  The oid is declared with SYSCTL_OID as
  * CTLTYPE_U32 | CTLFLAG_MPSAFE, handler sysctl_handle_32, format "IU".
  */
 #define	SYSCTL_NULL_U32_PTR ((uint32_t *)NULL)
 #define	SYSCTL_U32(parent, nbr, name, access, ptr, val, descr)	\
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_U32 | CTLFLAG_MPSAFE | (access),		\
 	    ptr, val, sysctl_handle_32, "IU", descr);		\
 	CTASSERT((((access) & CTLTYPE) == 0 ||			\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U32) && \
 	    sizeof(uint32_t) == sizeof(*(ptr)))
 
 /*
  * Run-time counterpart of SYSCTL_U32.  ptr is bound to a uint32_t * local
  * so the compiler checks its type; the statement expression evaluates to
  * the result of sysctl_add_oid().
  */
 #define	SYSCTL_ADD_U32(ctx, parent, nbr, name, access, ptr, val, descr)	\
 ({									\
 	uint32_t *__ptr = (ptr);					\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U32);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_U32 | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, val, sysctl_handle_32, "IU", __DESCR(descr), NULL);	\
 })
 
 /*
  * Oid for a signed 64-bit int.  If ptr is NULL, val is returned.
  *
  * SYSCTL_NULL_S64_PTR is a typed NULL, so the sizeof(*(ptr)) test in the
  * CTASSERT still sees an int64_t.  The oid is declared with SYSCTL_OID as
  * CTLTYPE_S64 | CTLFLAG_MPSAFE, handler sysctl_handle_64, format "Q".
  */
 #define	SYSCTL_NULL_S64_PTR ((int64_t *)NULL)
 #define	SYSCTL_S64(parent, nbr, name, access, ptr, val, descr)	\
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_S64 | CTLFLAG_MPSAFE | (access),		\
 	    ptr, val, sysctl_handle_64, "Q", descr);		\
 	CTASSERT((((access) & CTLTYPE) == 0 ||			\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64) && \
 	    sizeof(int64_t) == sizeof(*(ptr)))
 
 /*
  * Run-time counterpart of SYSCTL_S64.  ptr is bound to an int64_t * local
  * so the compiler checks its type; the statement expression evaluates to
  * the result of sysctl_add_oid().
  */
 #define	SYSCTL_ADD_S64(ctx, parent, nbr, name, access, ptr, val, descr)	\
 ({									\
 	int64_t *__ptr = (ptr);						\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_S64 | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, val, sysctl_handle_64, "Q", __DESCR(descr), NULL);	\
 })
 
 /*
  * Oid for an unsigned 64-bit int.  If ptr is NULL, val is returned.
  *
  * SYSCTL_NULL_U64_PTR is a typed NULL, so the sizeof(*(ptr)) test in the
  * CTASSERT still sees a uint64_t.  The oid is declared with SYSCTL_OID as
  * CTLTYPE_U64 | CTLFLAG_MPSAFE, handler sysctl_handle_64, format "QU".
  */
 #define	SYSCTL_NULL_U64_PTR ((uint64_t *)NULL)
 #define	SYSCTL_U64(parent, nbr, name, access, ptr, val, descr)	\
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_U64 | CTLFLAG_MPSAFE | (access),		\
 	    ptr, val, sysctl_handle_64, "QU", descr);		\
 	CTASSERT((((access) & CTLTYPE) == 0 ||			\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64) && \
 	    sizeof(uint64_t) == sizeof(*(ptr)))
 
 /*
  * Run-time counterpart of SYSCTL_U64.  ptr is bound to a uint64_t * local
  * so the compiler checks its type; the statement expression evaluates to
  * the result of sysctl_add_oid().
  */
 #define	SYSCTL_ADD_U64(ctx, parent, nbr, name, access, ptr, val, descr)	\
 ({									\
 	uint64_t *__ptr = (ptr);					\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_U64 | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, val, sysctl_handle_64, "QU", __DESCR(descr), NULL);	\
 })
 
 /*
  * Oid for an int.  If ptr is SYSCTL_NULL_INT_PTR, val is returned.
  *
  * SYSCTL_INT is SYSCTL_INT_WITH_LABEL with a NULL label; all other
  * arguments are forwarded unchanged.
  */
 #define	SYSCTL_NULL_INT_PTR ((int *)NULL)
 #define	SYSCTL_INT(parent, nbr, name, access, ptr, val, descr) \
 	SYSCTL_INT_WITH_LABEL(parent, nbr, name, access, ptr, val, descr, NULL)
 
 /*
  * Oid for an int that also carries a label.  Declared through
  * SYSCTL_OID_WITH_LABEL as CTLTYPE_INT | CTLFLAG_MPSAFE with handler
  * sysctl_handle_int and format "I".  The CTASSERT checks the CTLTYPE bits
  * of access and that *(ptr) has the size of an int.
  */
 #define	SYSCTL_INT_WITH_LABEL(parent, nbr, name, access, ptr, val, descr, label) \
 	SYSCTL_OID_WITH_LABEL(parent, nbr, name,			\
 	    CTLTYPE_INT | CTLFLAG_MPSAFE | (access),			\
 	    ptr, val, sysctl_handle_int, "I", descr, label);		\
 	CTASSERT((((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT) && \
 	    sizeof(int) == sizeof(*(ptr)))
 
 /*
  * Run-time counterpart of SYSCTL_INT.  ptr is bound to an int * local so
  * the compiler checks its type; the oid is added with a NULL label and the
  * statement expression evaluates to the result of sysctl_add_oid().
  */
 #define	SYSCTL_ADD_INT(ctx, parent, nbr, name, access, ptr, val, descr)	\
 ({									\
 	int *__ptr = (ptr);						\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_INT | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, val, sysctl_handle_int, "I", __DESCR(descr), NULL);	\
 })
 
 /*
  * Oid for an unsigned int.  If ptr is NULL, val is returned.
  *
  * SYSCTL_NULL_UINT_PTR is a typed NULL, so the sizeof(*(ptr)) test in the
  * CTASSERT still sees an unsigned.  The oid is declared with SYSCTL_OID as
  * CTLTYPE_UINT | CTLFLAG_MPSAFE, handler sysctl_handle_int, format "IU".
  */
 #define	SYSCTL_NULL_UINT_PTR ((unsigned *)NULL)
 #define	SYSCTL_UINT(parent, nbr, name, access, ptr, val, descr)	\
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_UINT | CTLFLAG_MPSAFE | (access),		\
 	    ptr, val, sysctl_handle_int, "IU", descr);		\
 	CTASSERT((((access) & CTLTYPE) == 0 ||			\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_UINT) && \
 	    sizeof(unsigned) == sizeof(*(ptr)))
 
 /*
  * Run-time counterpart of SYSCTL_UINT.  ptr is bound to an unsigned * local
  * so the compiler checks its type; the statement expression evaluates to
  * the result of sysctl_add_oid().
  */
 #define	SYSCTL_ADD_UINT(ctx, parent, nbr, name, access, ptr, val, descr) \
 ({									\
 	unsigned *__ptr = (ptr);					\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_UINT);	\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_UINT | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, val, sysctl_handle_int, "IU", __DESCR(descr), NULL);	\
 })
 
 /*
  * Oid for a long.  The pointer must be non NULL.
  *
  * Declared with SYSCTL_OID as CTLTYPE_LONG | CTLFLAG_MPSAFE, handler
  * sysctl_handle_long, format "L"; val is forwarded as the second argument.
  * The CTASSERT checks the CTLTYPE bits of access and that *(ptr) has the
  * size of a long.  SYSCTL_NULL_LONG_PTR is a NULL typed as long *.
  */
 #define	SYSCTL_NULL_LONG_PTR ((long *)NULL)
 #define	SYSCTL_LONG(parent, nbr, name, access, ptr, val, descr)	\
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_LONG | CTLFLAG_MPSAFE | (access),		\
 	    ptr, val, sysctl_handle_long, "L", descr);		\
 	CTASSERT((((access) & CTLTYPE) == 0 ||			\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_LONG) && \
 	    sizeof(long) == sizeof(*(ptr)))
 
 /*
  * Run-time counterpart of SYSCTL_LONG.  Unlike the static form it takes no
  * val argument: 0 is always passed as the second argument.  ptr is bound to
  * a long * local so the compiler checks its type; the statement expression
  * evaluates to the result of sysctl_add_oid().
  */
 #define	SYSCTL_ADD_LONG(ctx, parent, nbr, name, access, ptr, descr)	\
 ({									\
 	long *__ptr = (ptr);						\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_LONG);	\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_LONG | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, 0, sysctl_handle_long, "L", __DESCR(descr), NULL);	\
 })
 
 /*
  * Oid for an unsigned long.  The pointer must be non NULL.
  *
  * Declared with SYSCTL_OID as CTLTYPE_ULONG | CTLFLAG_MPSAFE, handler
  * sysctl_handle_long, format "LU"; val is forwarded as the second argument.
  * The CTASSERT checks the CTLTYPE bits of access and that *(ptr) has the
  * size of an unsigned long.  SYSCTL_NULL_ULONG_PTR is a NULL typed as
  * unsigned long *.
  */
 #define	SYSCTL_NULL_ULONG_PTR ((unsigned long *)NULL)
 #define	SYSCTL_ULONG(parent, nbr, name, access, ptr, val, descr)	\
 	SYSCTL_OID(parent, nbr, name,					\
 	    CTLTYPE_ULONG | CTLFLAG_MPSAFE | (access),			\
 	    ptr, val, sysctl_handle_long, "LU", descr);			\
 	CTASSERT((((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_ULONG) &&	\
 	    sizeof(unsigned long) == sizeof(*(ptr)))
 
 /*
  * Run-time counterpart of SYSCTL_ULONG.  Takes no val argument: 0 is always
  * passed as the second argument.  ptr is bound to an unsigned long * local
  * so the compiler checks its type; the statement expression evaluates to
  * the result of sysctl_add_oid().
  */
 #define	SYSCTL_ADD_ULONG(ctx, parent, nbr, name, access, ptr, descr)	\
 ({									\
 	unsigned long *__ptr = (ptr);					\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_ULONG);	\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_ULONG | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, 0, sysctl_handle_long, "LU", __DESCR(descr), NULL);	\
 })
 
 /*
  * Oid for a quad.  The pointer must be non NULL.
  *
  * A quad is registered exactly like a signed 64-bit int: CTLTYPE_S64 |
  * CTLFLAG_MPSAFE, handler sysctl_handle_64, format "Q", and the CTASSERT
  * requires *(ptr) to have the size of an int64_t.
  */
 #define	SYSCTL_NULL_QUAD_PTR ((int64_t *)NULL)
 #define	SYSCTL_QUAD(parent, nbr, name, access, ptr, val, descr)	\
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_S64 | CTLFLAG_MPSAFE | (access),		\
 	    ptr, val, sysctl_handle_64, "Q", descr);		\
 	CTASSERT((((access) & CTLTYPE) == 0 ||			\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64) && \
 	    sizeof(int64_t) == sizeof(*(ptr)))
 
 /*
  * Run-time counterpart of SYSCTL_QUAD.  Takes no val argument: 0 is always
  * passed as the second argument.  ptr is bound to an int64_t * local so the
  * compiler checks its type; the statement expression evaluates to the
  * result of sysctl_add_oid().
  */
 #define	SYSCTL_ADD_QUAD(ctx, parent, nbr, name, access, ptr, descr)	\
 ({									\
 	int64_t *__ptr = (ptr);						\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_S64 | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, 0, sysctl_handle_64, "Q", __DESCR(descr), NULL);	\
 })
 
 /*
  * Oid for an unsigned quad.  Registered exactly like an unsigned 64-bit
  * int: CTLTYPE_U64 | CTLFLAG_MPSAFE, handler sysctl_handle_64, format
  * "QU", and the CTASSERT requires *(ptr) to have the size of a uint64_t.
  * SYSCTL_NULL_UQUAD_PTR is a NULL typed as uint64_t *.
  */
 #define	SYSCTL_NULL_UQUAD_PTR ((uint64_t *)NULL)
 #define	SYSCTL_UQUAD(parent, nbr, name, access, ptr, val, descr)	\
 	SYSCTL_OID(parent, nbr, name,					\
 	    CTLTYPE_U64 | CTLFLAG_MPSAFE | (access),			\
 	     ptr, val, sysctl_handle_64, "QU", descr);			\
 	CTASSERT((((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64) &&	\
 	    sizeof(uint64_t) == sizeof(*(ptr)))
 
 /*
  * Run-time counterpart of SYSCTL_UQUAD.  Takes no val argument: 0 is always
  * passed as the second argument.  ptr is bound to a uint64_t * local so the
  * compiler checks its type; the statement expression evaluates to the
  * result of sysctl_add_oid().
  */
 #define	SYSCTL_ADD_UQUAD(ctx, parent, nbr, name, access, ptr, descr)	\
 ({									\
 	uint64_t *__ptr = (ptr);					\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_U64 | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, 0, sysctl_handle_64, "QU", __DESCR(descr), NULL);	\
 })
 
 /*
  * Oid for a CPU dependent variable.
  *
  * The oid type is chosen from sizeof(*(ptr)): a target the size of a
  * uint64_t is added as CTLTYPE_U64 (sysctl_handle_64, "QU"), anything else
  * as CTLTYPE_UINT (sysctl_handle_int, "IU").  The CTASSERT restricts the
  * target to exactly those two sizes and requires access to carry no
  * CTLTYPE bits.  ptr is passed to sysctl_add_oid() as given, without a
  * typed local, and the statement expression evaluates to __ret, the
  * sysctl_add_oid() result of the branch taken.
  */
 #define	SYSCTL_ADD_UAUTO(ctx, parent, nbr, name, access, ptr, descr)	\
 ({									\
 	struct sysctl_oid *__ret;					\
 	CTASSERT((sizeof(uint64_t) == sizeof(*(ptr)) ||			\
 	    sizeof(unsigned) == sizeof(*(ptr))) &&			\
 	    ((access) & CTLTYPE) == 0);					\
 	if (sizeof(uint64_t) == sizeof(*(ptr))) {			\
 		__ret = sysctl_add_oid(ctx, parent, nbr, name,		\
 		    CTLTYPE_U64 | CTLFLAG_MPSAFE | (access),		\
 		    (ptr), 0, sysctl_handle_64, "QU",			\
 		    __DESCR(descr), NULL);				\
 	} else {							\
 		__ret = sysctl_add_oid(ctx, parent, nbr, name,		\
 		    CTLTYPE_UINT | CTLFLAG_MPSAFE | (access),		\
 		    (ptr), 0, sysctl_handle_int, "IU",			\
 		    __DESCR(descr), NULL);				\
 	}								\
 	__ret;								\
 })
 
 /*
  * Oid for a 64-bit unsigned counter(9).  The pointer must be non NULL.
  *
  * Declared with SYSCTL_OID as CTLTYPE_U64 with CTLFLAG_MPSAFE and
  * CTLFLAG_STATS always set, handler sysctl_handle_counter_u64, format
  * "QU", second argument 0.  The CTASSERT checks the CTLTYPE bits of access,
  * that *(ptr) has the size of a counter_u64_t and that **(ptr) has the size
  * of a uint64_t.
  */
 #define	SYSCTL_COUNTER_U64(parent, nbr, name, access, ptr, descr)	\
 	SYSCTL_OID(parent, nbr, name,					\
 	    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_STATS | (access),	\
 	    (ptr), 0, sysctl_handle_counter_u64, "QU", descr);		\
 	CTASSERT((((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64) &&	\
 	    sizeof(counter_u64_t) == sizeof(*(ptr)) &&			\
 	    sizeof(uint64_t) == sizeof(**(ptr)))
 
 /*
  * Run-time counterpart of SYSCTL_COUNTER_U64.  ptr is bound to a
  * counter_u64_t * local so the compiler checks its type; CTLFLAG_STATS is
  * always set and the statement expression evaluates to the result of
  * sysctl_add_oid().
  */
 #define	SYSCTL_ADD_COUNTER_U64(ctx, parent, nbr, name, access, ptr, descr) \
 ({									\
 	counter_u64_t *__ptr = (ptr);					\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_STATS | (access),	\
 	    __ptr, 0, sysctl_handle_counter_u64, "QU", __DESCR(descr),	\
 	    NULL);							\
 })
 
 /*
  * Oid for an array of counter(9)s.  The pointer and length must be non zero.
  *
  * ptr and len are handed to sysctl_handle_counter_u64_array as the two oid
  * arguments.  The oid is declared as CTLTYPE_OPAQUE with format "S", with
  * CTLFLAG_MPSAFE and CTLFLAG_STATS always set.  That is the type the
  * CTASSERT below accepts in access and the type
  * SYSCTL_ADD_COUNTER_U64_ARRAY registers, so OR-ing an explicit
  * CTLTYPE_OPAQUE from access leaves the type code unchanged.  The CTASSERT
  * also requires *(ptr) to have the size of a counter_u64_t and **(ptr) the
  * size of a uint64_t.
  */
 #define	SYSCTL_COUNTER_U64_ARRAY(parent, nbr, name, access, ptr, len, descr) \
 	SYSCTL_OID(parent, nbr, name,					\
 	    CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_STATS | (access),	\
 	    (ptr), (len), sysctl_handle_counter_u64_array, "S", descr);	\
 	CTASSERT((((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_OPAQUE) &&	\
 	    sizeof(counter_u64_t) == sizeof(*(ptr)) &&			\
 	    sizeof(uint64_t) == sizeof(**(ptr)))
 
 /*
  * Run-time registration of an array of counter(9)s.  ptr is bound to a
  * counter_u64_t * local so the compiler checks its type; the oid is added
  * as CTLTYPE_OPAQUE with CTLFLAG_MPSAFE and CTLFLAG_STATS always set,
  * handler sysctl_handle_counter_u64_array, format "S", and len as the
  * second argument.  The statement expression evaluates to the result of
  * sysctl_add_oid().
  */
 #define	SYSCTL_ADD_COUNTER_U64_ARRAY(ctx, parent, nbr, name, access,	\
     ptr, len, descr)							\
 ({									\
 	counter_u64_t *__ptr = (ptr);					\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_OPAQUE);	\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_STATS | (access),	\
 	    __ptr, len, sysctl_handle_counter_u64_array, "S",		\
 	    __DESCR(descr), NULL);					\
 })
 
 /*
  * Oid for an opaque object.  Specified by a pointer and a length.
  *
  * Declared with SYSCTL_OID as CTLTYPE_OPAQUE | CTLFLAG_MPSAFE with handler
  * sysctl_handle_opaque; the caller supplies the format string in fmt.
  * Only the CTLTYPE bits of access are checked; no size check is made on
  * ptr.
  */
 #define	SYSCTL_OPAQUE(parent, nbr, name, access, ptr, len, fmt, descr)	\
 	SYSCTL_OID(parent, nbr, name,					\
 	    CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | (access),			\
 	    ptr, len, sysctl_handle_opaque, fmt, descr);		\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_OPAQUE)
 
 /*
  * Run-time counterpart of SYSCTL_OPAQUE.  ptr is passed to sysctl_add_oid()
  * as given (there is no typed local); the statement expression evaluates
  * to the result of sysctl_add_oid().
  */
 #define	SYSCTL_ADD_OPAQUE(ctx, parent, nbr, name, access, ptr, len, fmt, descr)	\
 ({									\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_OPAQUE);	\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | (access),			\
 	    ptr, len, sysctl_handle_opaque, fmt, __DESCR(descr), NULL);	\
 })
 
 /*
  * Oid for a struct.  Specified by a pointer and a type.
  *
  * type is the struct tag without the "struct" keyword: the length passed
  * is sizeof(struct type) and the format string is "S," followed by the
  * stringified tag.  The oid is CTLTYPE_OPAQUE | CTLFLAG_MPSAFE and is
  * served by sysctl_handle_opaque.
  */
 #define	SYSCTL_STRUCT(parent, nbr, name, access, ptr, type, descr)	\
 	SYSCTL_OID(parent, nbr, name,					\
 	    CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | (access),			\
 	    ptr, sizeof(struct type), sysctl_handle_opaque,		\
 	    "S," #type, descr);						\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_OPAQUE)
 
 /*
  * Run-time counterpart of SYSCTL_STRUCT.  The length is sizeof(struct type)
  * and the format is "S," followed by the stringified tag; the statement
  * expression evaluates to the result of sysctl_add_oid().
  */
 #define	SYSCTL_ADD_STRUCT(ctx, parent, nbr, name, access, ptr, type, descr) \
 ({									\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_OPAQUE);	\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | (access),			\
 	    (ptr), sizeof(struct type),					\
 	    sysctl_handle_opaque, "S," #type, __DESCR(descr), NULL);	\
 })
 
 /*
  * Oid for a procedure.  Specified by a pointer and an arg.
  *
  * Unlike the typed macros, this one ORs nothing into access: the caller
  * must supply the oid type itself, and the CTASSERT rejects an access word
  * with no CTLTYPE bits.  handler and fmt are forwarded unchanged.
  */
 #define	SYSCTL_PROC(parent, nbr, name, access, ptr, arg, handler, fmt, descr) \
 	SYSCTL_OID(parent, nbr, name, (access),				\
 	    ptr, arg, handler, fmt, descr);				\
 	CTASSERT(((access) & CTLTYPE) != 0)
 
 /*
  * Run-time counterpart of SYSCTL_PROC.  access must carry CTLTYPE bits and
  * is additionally passed through SYSCTL_ENFORCE_FLAGS before being handed,
  * unmodified, to sysctl_add_oid(); the statement expression evaluates to
  * the result of that call.
  */
 #define	SYSCTL_ADD_PROC(ctx, parent, nbr, name, access, ptr, arg, handler, fmt, descr) \
 ({									\
 	CTASSERT(((access) & CTLTYPE) != 0);				\
 	SYSCTL_ENFORCE_FLAGS(access);					\
 	sysctl_add_oid(ctx, parent, nbr, name, (access),		\
 	    (ptr), (arg), (handler), (fmt), __DESCR(descr), NULL);	\
 })
 
 /*
  * Oid to handle limits on uma(9) zone specified by pointer.
  *
  * Declared with SYSCTL_OID as CTLTYPE_INT | CTLFLAG_MPSAFE with handler
  * sysctl_handle_uma_zone_max, format "I" and second argument 0.  Only the
  * CTLTYPE bits of access are checked.
  */
 #define	SYSCTL_UMA_MAX(parent, nbr, name, access, ptr, descr)	\
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_INT | CTLFLAG_MPSAFE | (access),		\
 	    (ptr), 0, sysctl_handle_uma_zone_max, "I", descr);	\
 	CTASSERT(((access) & CTLTYPE) == 0 ||			\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT)
 
 /*
  * Run-time counterpart of SYSCTL_UMA_MAX.  ptr is bound to a uma_zone_t
  * local so the compiler checks its type; the statement expression
  * evaluates to the result of sysctl_add_oid().
  */
 #define	SYSCTL_ADD_UMA_MAX(ctx, parent, nbr, name, access, ptr, descr)	\
 ({									\
 	uma_zone_t __ptr = (ptr);					\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_INT | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, 0, sysctl_handle_uma_zone_max, "I", __DESCR(descr),	\
 	    NULL);							\
 })
 
 /*
  * Oid to obtain current use of uma(9) zone specified by pointer.
  *
  * Declared with SYSCTL_OID as CTLTYPE_INT with CTLFLAG_MPSAFE and
  * CTLFLAG_RD always set, handler sysctl_handle_uma_zone_cur, format "I"
  * and second argument 0.  Only the CTLTYPE bits of access are checked.
  */
 #define	SYSCTL_UMA_CUR(parent, nbr, name, access, ptr, descr)		\
 	SYSCTL_OID(parent, nbr, name,					\
 	    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access),	\
 	    (ptr), 0, sysctl_handle_uma_zone_cur, "I", descr);		\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT)
 
 /*
  * Run-time counterpart of SYSCTL_UMA_CUR.  ptr is bound to a uma_zone_t
  * local so the compiler checks its type; CTLFLAG_RD is always set and the
  * statement expression evaluates to the result of sysctl_add_oid().
  */
 #define	SYSCTL_ADD_UMA_CUR(ctx, parent, nbr, name, access, ptr, descr)	\
 ({									\
 	uma_zone_t __ptr = (ptr);					\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access),	\
 	    __ptr, 0, sysctl_handle_uma_zone_cur, "I", __DESCR(descr),	\
 	    NULL);							\
 })
 
 /*
  * OID expressing a sbintime_t as microseconds.
  *
  * Declared with SYSCTL_OID with CTLFLAG_MPSAFE and CTLFLAG_RD always set,
  * handler sysctl_usec_to_sbintime, format "Q" and second argument 0.
  *
  * NOTE(review): the oid is registered as CTLTYPE_INT, while the format is
  * "Q" and the CTASSERT accepts CTLTYPE_S64 in access -- confirm which type
  * is intended before relying on the reported kind.
  */
 #define	SYSCTL_SBINTIME_USEC(parent, nbr, name, access, ptr, descr)	\
 	SYSCTL_OID(parent, nbr, name,					\
 	    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access),	\
 	    (ptr), 0, sysctl_usec_to_sbintime, "Q", descr);		\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64)
 /*
  * Run-time counterpart of SYSCTL_SBINTIME_USEC.  ptr is bound to a
  * sbintime_t * local so the compiler checks its type; the statement
  * expression evaluates to the result of sysctl_add_oid().
  *
  * NOTE(review): registered as CTLTYPE_INT although the format is "Q" and
  * the CTASSERT accepts CTLTYPE_S64 -- confirm the intended type.
  */
 #define	SYSCTL_ADD_SBINTIME_USEC(ctx, parent, nbr, name, access, ptr, descr) \
 ({									\
 	sbintime_t *__ptr = (ptr);					\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access),	\
 	    __ptr, 0, sysctl_usec_to_sbintime, "Q", __DESCR(descr),	\
 	    NULL);							\
 })
 
 /*
  * OID expressing a sbintime_t as milliseconds.
  *
  * Declared with SYSCTL_OID with CTLFLAG_MPSAFE and CTLFLAG_RD always set,
  * handler sysctl_msec_to_sbintime, format "Q" and second argument 0.
  *
  * NOTE(review): the oid is registered as CTLTYPE_INT, while the format is
  * "Q" and the CTASSERT accepts CTLTYPE_S64 in access -- confirm which type
  * is intended before relying on the reported kind.
  */
 #define	SYSCTL_SBINTIME_MSEC(parent, nbr, name, access, ptr, descr)	\
 	SYSCTL_OID(parent, nbr, name,					\
 	    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access),	\
 	    (ptr), 0, sysctl_msec_to_sbintime, "Q", descr);		\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64)
 /*
  * Run-time counterpart of SYSCTL_SBINTIME_MSEC.  ptr is bound to a
  * sbintime_t * local so the compiler checks its type; the statement
  * expression evaluates to the result of sysctl_add_oid().
  *
  * NOTE(review): registered as CTLTYPE_INT although the format is "Q" and
  * the CTASSERT accepts CTLTYPE_S64 -- confirm the intended type.
  */
 #define	SYSCTL_ADD_SBINTIME_MSEC(ctx, parent, nbr, name, access, ptr, descr) \
 ({									\
 	sbintime_t *__ptr = (ptr);					\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access),	\
 	    __ptr, 0, sysctl_msec_to_sbintime, "Q", __DESCR(descr),	\
 	    NULL);							\
 })
 
 /*
  * OID expressing a struct timeval as seconds.
  *
  * Declared with SYSCTL_OID as CTLTYPE_INT with CTLFLAG_MPSAFE and
  * CTLFLAG_RD always set, handler sysctl_sec_to_timeval, format "I" and
  * second argument 0.  Only the CTLTYPE bits of access are checked.
  */
 #define	SYSCTL_TIMEVAL_SEC(parent, nbr, name, access, ptr, descr)	\
 	SYSCTL_OID(parent, nbr, name,					\
 	    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access),	\
 	    (ptr), 0, sysctl_sec_to_timeval, "I", descr);		\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT)
 /*
  * Run-time counterpart of SYSCTL_TIMEVAL_SEC.  ptr is bound to a
  * struct timeval * local so the compiler checks its type; CTLFLAG_RD is
  * always set and the statement expression evaluates to the result of
  * sysctl_add_oid().
  */
 #define	SYSCTL_ADD_TIMEVAL_SEC(ctx, parent, nbr, name, access, ptr, descr) \
 ({									\
 	struct timeval *__ptr = (ptr);					\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access),	\
 	    __ptr, 0, sysctl_sec_to_timeval, "I", __DESCR(descr),	\
 	    NULL);							\
 })
 
 /*
  * Iterate oidp over every entry of the oid list `list'; expands to
  * RB_FOREACH over the sysctl_oid_list tree type.
  */
 #define	SYSCTL_FOREACH(oidp, list) \
 	RB_FOREACH(oidp, sysctl_oid_list, list)
 
 /*
  * A macro to generate a read-only sysctl to indicate the presence of optional
  * kernel features.
  *
  * The oid is created under _kern_features with an automatically assigned
  * number (OID_AUTO), flags CTLFLAG_RD | CTLFLAG_CAPRD, no backing variable
  * (SYSCTL_NULL_INT_PTR) and the constant value 1; it carries the label
  * "feature".
  */
 #define	FEATURE(name, desc)						\
 	SYSCTL_INT_WITH_LABEL(_kern_features, OID_AUTO, name,		\
 	    CTLFLAG_RD | CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, 1, desc, "feature")
 
 #endif /* _KERNEL */
 
 /*
  * Top-level identifiers
  */
 #define	CTL_SYSCTL	0		/* "magic" numbers */
 #define	CTL_KERN	1		/* "high kernel": proc, limits */
 #define	CTL_VM		2		/* virtual memory */
 #define	CTL_VFS		3		/* filesystem, mount type is next */
 #define	CTL_NET		4		/* network, see socket.h */
 #define	CTL_DEBUG	5		/* debugging parameters */
 #define	CTL_HW		6		/* generic cpu/io */
 #define	CTL_MACHDEP	7		/* machine dependent */
 #define	CTL_USER	8		/* user-level */
 #define	CTL_P1003_1B	9		/* POSIX 1003.1B */
 
 /*
  * CTL_SYSCTL identifiers
  */
 #define	CTL_SYSCTL_DEBUG	0	/* printf all nodes */
 #define	CTL_SYSCTL_NAME		1	/* string name of OID */
 #define	CTL_SYSCTL_NEXT		2	/* next OID, honoring CTLFLAG_SKIP */
 #define	CTL_SYSCTL_NAME2OID	3	/* int array of name */
 #define	CTL_SYSCTL_OIDFMT	4	/* OID's kind and format */
 #define	CTL_SYSCTL_OIDDESCR	5	/* OID's description */
 #define	CTL_SYSCTL_OIDLABEL	6	/* aggregation label */
 #define	CTL_SYSCTL_NEXTNOSKIP	7	/* next OID, ignoring CTLFLAG_SKIP */
 
 /*
  * CTL_KERN identifiers
  */
 #define	KERN_OSTYPE		 1	/* string: system version */
 #define	KERN_OSRELEASE		 2	/* string: system release */
 #define	KERN_OSREV		 3	/* int: system revision */
 #define	KERN_VERSION		 4	/* string: compile time info */
 #define	KERN_MAXVNODES		 5	/* int: max vnodes */
 #define	KERN_MAXPROC		 6	/* int: max processes */
 #define	KERN_MAXFILES		 7	/* int: max open files */
 #define	KERN_ARGMAX		 8	/* int: max arguments to exec */
 #define	KERN_SECURELVL		 9	/* int: system security level */
 #define	KERN_HOSTNAME		10	/* string: hostname */
 #define	KERN_HOSTID		11	/* int: host identifier */
 #define	KERN_CLOCKRATE		12	/* struct: struct clockrate */
-#define	KERN_VNODE		13	/* struct: vnode structures */
+/* was: #define	KERN_VNODE	13	; disabled in 2003 and removed in 2023 */
 #define	KERN_PROC		14	/* struct: process entries */
 #define	KERN_FILE		15	/* struct: file entries */
 #define	KERN_PROF		16	/* node: kernel profiling info */
 #define	KERN_POSIX1		17	/* int: POSIX.1 version */
 #define	KERN_NGROUPS		18	/* int: # of supplemental group ids */
 #define	KERN_JOB_CONTROL	19	/* int: is job control available */
 #define	KERN_SAVED_IDS		20	/* int: saved set-user/group-ID */
 #define	KERN_BOOTTIME		21	/* struct: time kernel was booted */
 #define	KERN_NISDOMAINNAME	22	/* string: YP domain name */
 #define	KERN_UPDATEINTERVAL	23	/* int: update process sleep time */
 #define	KERN_OSRELDATE		24	/* int: kernel release date */
 #define	KERN_NTP_PLL		25	/* node: NTP PLL control */
 #define	KERN_BOOTFILE		26	/* string: name of booted kernel */
 #define	KERN_MAXFILESPERPROC	27	/* int: max open files per proc */
 #define	KERN_MAXPROCPERUID	28	/* int: max processes per uid */
 #define	KERN_DUMPDEV		29	/* struct cdev *: device to dump on */
 #define	KERN_IPC		30	/* node: anything related to IPC */
 #define	KERN_DUMMY		31	/* unused */
 #define	KERN_PS_STRINGS		32	/* int: address of PS_STRINGS */
 #define	KERN_USRSTACK		33	/* int: address of USRSTACK */
 #define	KERN_LOGSIGEXIT		34	/* int: do we log sigexit procs? */
 #define	KERN_IOV_MAX		35	/* int: value of UIO_MAXIOV */
 #define	KERN_HOSTUUID		36	/* string: host UUID identifier */
 #define	KERN_ARND		37	/* int: from arc4rand() */
 #define	KERN_MAXPHYS		38	/* int: MAXPHYS value */
 #define	KERN_LOCKF		39	/* struct: lockf reports */
 /*
  * KERN_PROC subtypes
  */
 #define	KERN_PROC_ALL		0	/* everything */
 #define	KERN_PROC_PID		1	/* by process id */
 #define	KERN_PROC_PGRP		2	/* by process group id */
 #define	KERN_PROC_SESSION	3	/* by session of pid */
 #define	KERN_PROC_TTY		4	/* by controlling tty */
 #define	KERN_PROC_UID		5	/* by effective uid */
 #define	KERN_PROC_RUID		6	/* by real uid */
 #define	KERN_PROC_ARGS		7	/* get/set arguments/proctitle */
 #define	KERN_PROC_PROC		8	/* only return procs */
 #define	KERN_PROC_SV_NAME	9	/* get syscall vector name */
 #define	KERN_PROC_RGID		10	/* by real group id */
 #define	KERN_PROC_GID		11	/* by effective group id */
 #define	KERN_PROC_PATHNAME	12	/* path to executable */
 #define	KERN_PROC_OVMMAP	13	/* Old VM map entries for process */
 #define	KERN_PROC_OFILEDESC	14	/* Old file descriptors for process */
 #define	KERN_PROC_KSTACK	15	/* Kernel stacks for process */
 #define	KERN_PROC_INC_THREAD	0x10	/*
 					 * modifier for pid, pgrp, tty,
 					 * uid, ruid, gid, rgid and proc
 					 * This effectively uses 16-31
 					 */
 #define	KERN_PROC_VMMAP		32	/* VM map entries for process */
 #define	KERN_PROC_FILEDESC	33	/* File descriptors for process */
 #define	KERN_PROC_GROUPS	34	/* process groups */
 #define	KERN_PROC_ENV		35	/* get environment */
 #define	KERN_PROC_AUXV		36	/* get ELF auxiliary vector */
 #define	KERN_PROC_RLIMIT	37	/* process resource limits */
 #define	KERN_PROC_PS_STRINGS	38	/* get ps_strings location */
 #define	KERN_PROC_UMASK		39	/* process umask */
 #define	KERN_PROC_OSREL		40	/* osreldate for process binary */
 #define	KERN_PROC_SIGTRAMP	41	/* signal trampoline location */
 #define	KERN_PROC_CWD		42	/* process current working directory */
 #define	KERN_PROC_NFDS		43	/* number of open file descriptors */
 #define	KERN_PROC_SIGFASTBLK	44	/* address of fastsigblk magic word */
 #define	KERN_PROC_VM_LAYOUT	45	/* virtual address space layout info */
 
 /*
  * KERN_IPC identifiers
  */
 #define	KIPC_MAXSOCKBUF		1	/* int: max size of a socket buffer */
 #define	KIPC_SOCKBUF_WASTE	2	/* int: wastage factor in sockbuf */
 #define	KIPC_SOMAXCONN		3	/* int: max length of connection q */
 #define	KIPC_MAX_LINKHDR	4	/* int: max length of link header */
 #define	KIPC_MAX_PROTOHDR	5	/* int: max length of network header */
 #define	KIPC_MAX_HDR		6	/* int: max total length of headers */
 #define	KIPC_MAX_DATALEN	7	/* int: max length of data? */
 
 /*
  * CTL_HW identifiers
  */
 #define	HW_MACHINE	 1		/* string: machine class */
 #define	HW_MODEL	 2		/* string: specific machine model */
 #define	HW_NCPU		 3		/* int: number of cpus */
 #define	HW_BYTEORDER	 4		/* int: machine byte order */
 #define	HW_PHYSMEM	 5		/* int: total memory */
 #define	HW_USERMEM	 6		/* int: non-kernel memory */
 #define	HW_PAGESIZE	 7		/* int: software page size */
 #define	HW_DISKNAMES	 8		/* strings: disk drive names */
 #define	HW_DISKSTATS	 9		/* struct: diskstats[] */
 #define	HW_FLOATINGPT	10		/* int: has HW floating point? */
 #define	HW_MACHINE_ARCH	11		/* string: machine architecture */
 #define	HW_REALMEM	12		/* int: 'real' memory */
 
 /*
  * CTL_USER definitions
  */
 #define	USER_CS_PATH		 1	/* string: _CS_PATH */
 #define	USER_BC_BASE_MAX	 2	/* int: BC_BASE_MAX */
 #define	USER_BC_DIM_MAX		 3	/* int: BC_DIM_MAX */
 #define	USER_BC_SCALE_MAX	 4	/* int: BC_SCALE_MAX */
 #define	USER_BC_STRING_MAX	 5	/* int: BC_STRING_MAX */
 #define	USER_COLL_WEIGHTS_MAX	 6	/* int: COLL_WEIGHTS_MAX */
 #define	USER_EXPR_NEST_MAX	 7	/* int: EXPR_NEST_MAX */
 #define	USER_LINE_MAX		 8	/* int: LINE_MAX */
 #define	USER_RE_DUP_MAX		 9	/* int: RE_DUP_MAX */
 #define	USER_POSIX2_VERSION	10	/* int: POSIX2_VERSION */
 #define	USER_POSIX2_C_BIND	11	/* int: POSIX2_C_BIND */
 #define	USER_POSIX2_C_DEV	12	/* int: POSIX2_C_DEV */
 #define	USER_POSIX2_CHAR_TERM	13	/* int: POSIX2_CHAR_TERM */
 #define	USER_POSIX2_FORT_DEV	14	/* int: POSIX2_FORT_DEV */
 #define	USER_POSIX2_FORT_RUN	15	/* int: POSIX2_FORT_RUN */
 #define	USER_POSIX2_LOCALEDEF	16	/* int: POSIX2_LOCALEDEF */
 #define	USER_POSIX2_SW_DEV	17	/* int: POSIX2_SW_DEV */
 #define	USER_POSIX2_UPE		18	/* int: POSIX2_UPE */
 #define	USER_STREAM_MAX		19	/* int: POSIX2_STREAM_MAX */
 #define	USER_TZNAME_MAX		20	/* int: POSIX2_TZNAME_MAX */
 #define	USER_LOCALBASE		21	/* string: _PATH_LOCALBASE */
 
 #define	CTL_P1003_1B_ASYNCHRONOUS_IO		1	/* boolean */
 #define	CTL_P1003_1B_MAPPED_FILES		2	/* boolean */
 #define	CTL_P1003_1B_MEMLOCK			3	/* boolean */
 #define	CTL_P1003_1B_MEMLOCK_RANGE		4	/* boolean */
 #define	CTL_P1003_1B_MEMORY_PROTECTION		5	/* boolean */
 #define	CTL_P1003_1B_MESSAGE_PASSING		6	/* boolean */
 #define	CTL_P1003_1B_PRIORITIZED_IO		7	/* boolean */
 #define	CTL_P1003_1B_PRIORITY_SCHEDULING	8	/* boolean */
 #define	CTL_P1003_1B_REALTIME_SIGNALS		9	/* boolean */
 #define	CTL_P1003_1B_SEMAPHORES			10	/* boolean */
 #define	CTL_P1003_1B_FSYNC			11	/* boolean */
 #define	CTL_P1003_1B_SHARED_MEMORY_OBJECTS	12	/* boolean */
 #define	CTL_P1003_1B_SYNCHRONIZED_IO		13	/* boolean */
 #define	CTL_P1003_1B_TIMERS			14	/* boolean */
 #define	CTL_P1003_1B_AIO_LISTIO_MAX		15	/* int */
 #define	CTL_P1003_1B_AIO_MAX			16	/* int */
 #define	CTL_P1003_1B_AIO_PRIO_DELTA_MAX		17	/* int */
 #define	CTL_P1003_1B_DELAYTIMER_MAX		18	/* int */
 #define	CTL_P1003_1B_MQ_OPEN_MAX		19	/* int */
 #define	CTL_P1003_1B_PAGESIZE			20	/* int */
 #define	CTL_P1003_1B_RTSIG_MAX			21	/* int */
 #define	CTL_P1003_1B_SEM_NSEMS_MAX		22	/* int */
 #define	CTL_P1003_1B_SEM_VALUE_MAX		23	/* int */
 #define	CTL_P1003_1B_SIGQUEUE_MAX		24	/* int */
 #define	CTL_P1003_1B_TIMER_MAX			25	/* int */
 
 #ifdef _KERNEL
 
 #define	CTL_P1003_1B_MAXID		26
 
 /*
  * Declare some common oids.
  */
 extern struct sysctl_oid_list sysctl__children;
 SYSCTL_DECL(_kern);
 SYSCTL_DECL(_kern_features);
 SYSCTL_DECL(_kern_ipc);
 SYSCTL_DECL(_kern_proc);
 SYSCTL_DECL(_kern_sched);
 SYSCTL_DECL(_kern_sched_stats);
 SYSCTL_DECL(_sysctl);
 SYSCTL_DECL(_vm);
 SYSCTL_DECL(_vm_stats);
 SYSCTL_DECL(_vm_stats_misc);
 SYSCTL_DECL(_vfs);
 SYSCTL_DECL(_net);
 SYSCTL_DECL(_debug);
 SYSCTL_DECL(_debug_sizeof);
 SYSCTL_DECL(_dev);
 SYSCTL_DECL(_hw);
 SYSCTL_DECL(_hw_bus);
 SYSCTL_DECL(_hw_bus_devices);
 SYSCTL_DECL(_machdep);
 SYSCTL_DECL(_machdep_mitigations);
 SYSCTL_DECL(_user);
 SYSCTL_DECL(_compat);
 SYSCTL_DECL(_regression);
 SYSCTL_DECL(_security);
 SYSCTL_DECL(_security_bsd);
 
 extern char	machine[];
 extern char	osrelease[];
 extern char	ostype[];
 extern char	kern_ident[];
 
 /* Dynamic oid handling */
 struct sysctl_oid *sysctl_add_oid(struct sysctl_ctx_list *clist,
 	    struct sysctl_oid_list *parent, int nbr, const char *name, int kind,
 	    void *arg1, intmax_t arg2, int (*handler)(SYSCTL_HANDLER_ARGS),
 	    const char *fmt, const char *descr, const char *label);
 int	sysctl_remove_name(struct sysctl_oid *parent, const char *name, int del,
 	    int recurse);
 void	sysctl_rename_oid(struct sysctl_oid *oidp, const char *name);
 int	sysctl_move_oid(struct sysctl_oid *oidp,
 	    struct sysctl_oid_list *parent);
 int	sysctl_remove_oid(struct sysctl_oid *oidp, int del, int recurse);
 int	sysctl_ctx_init(struct sysctl_ctx_list *clist);
 int	sysctl_ctx_free(struct sysctl_ctx_list *clist);
 struct	sysctl_ctx_entry *sysctl_ctx_entry_add(struct sysctl_ctx_list *clist,
 	    struct sysctl_oid *oidp);
 struct	sysctl_ctx_entry *sysctl_ctx_entry_find(struct sysctl_ctx_list *clist,
 	    struct sysctl_oid *oidp);
 int	sysctl_ctx_entry_del(struct sysctl_ctx_list *clist,
 	    struct sysctl_oid *oidp);
 
 int	kernel_sysctl(struct thread *td, int *name, u_int namelen, void *old,
 	    size_t *oldlenp, void *new, size_t newlen, size_t *retval,
 	    int flags);
 int	kernel_sysctlbyname(struct thread *td, char *name, void *old,
 	    size_t *oldlenp, void *new, size_t newlen, size_t *retval,
 	    int flags);
 int	userland_sysctl(struct thread *td, int *name, u_int namelen, void *old,
 	    size_t *oldlenp, int inkernel, const void *new, size_t newlen,
 	    size_t *retval, int flags);
 int	sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid,
 	    int *nindx, struct sysctl_req *req);
 void	sysctl_wlock(void);
 void	sysctl_wunlock(void);
 int	sysctl_wire_old_buffer(struct sysctl_req *req, size_t len);
 int	kern___sysctlbyname(struct thread *td, const char *name,
 	    size_t namelen, void *old, size_t *oldlenp, void *new,
 	    size_t newlen, size_t *retval, int flags, bool inkernel);
 
 struct sbuf;
 struct sbuf *sbuf_new_for_sysctl(struct sbuf *, char *, int,
 	    struct sysctl_req *);
 #else	/* !_KERNEL */
 #include <sys/cdefs.h>
 #include <sys/_types.h>
 #ifndef _SIZE_T_DECLARED
 typedef	__size_t	size_t;
 #define	_SIZE_T_DECLARED
 #endif
 
 __BEGIN_DECLS
 int	sysctl(const int *, unsigned int, void *, size_t *, const void *, size_t);
 int	sysctlbyname(const char *, void *, size_t *, const void *, size_t);
 int	sysctlnametomib(const char *, int *, size_t *);
 __END_DECLS
 #endif	/* _KERNEL */
 
 #endif	/* !_SYS_SYSCTL_H_ */
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 04596a7184ee..a3eb00f0fe7c 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -1,1205 +1,1175 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vnode.h	8.7 (Berkeley) 2/4/94
  * $FreeBSD$
  */
 
 #ifndef _SYS_VNODE_H_
 #define	_SYS_VNODE_H_
 
 #include <sys/bufobj.h>
 #include <sys/queue.h>
 #include <sys/lock.h>
 #include <sys/lockmgr.h>
 #include <sys/mutex.h>
 #include <sys/rangelock.h>
 #include <sys/selinfo.h>
 #include <sys/uio.h>
 #include <sys/acl.h>
 #include <sys/ktr.h>
 #include <sys/_seqc.h>
 
 /*
  * The vnode is the focus of all file activity in UNIX.  There is a
  * unique vnode allocated for each active file, each current directory,
  * each mounted-on file, text file, and the root.
  */
 
 /*
  * Vnode types.  VNON means no type.
  */
 enum vtype	{ VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD,
 		  VMARKER };
 #define VLASTTYPE VMARKER
 
 enum vstate	{ VSTATE_UNINITIALIZED, VSTATE_CONSTRUCTED, VSTATE_DESTROYING,
 		  VSTATE_DEAD };
 #define VLASTSTATE VSTATE_DEAD
 
 enum vgetstate	{ VGET_NONE, VGET_HOLDCNT, VGET_USECOUNT };
 /*
  * Each underlying filesystem allocates its own private area and hangs
  * it from v_data.  If non-null, this area is freed in getnewvnode().
  */
 
 struct namecache;
 struct cache_fpl;
 
 /*
  * Poll/select bookkeeping for a vnode; struct vnode refers to one of
  * these through its v_pollinfo pointer.  vpi_lock protects the fields
  * that follow it.
  */
 struct vpollinfo {
 	struct	mtx vpi_lock;		/* lock to protect below */
 	struct	selinfo vpi_selinfo;	/* identity of poller(s) */
 	short	vpi_events;		/* what they are looking for */
 	short	vpi_revents;		/* what has happened */
 };
 
 /*
  * Reading or writing any of these items requires holding the appropriate lock.
  *
  * Lock reference:
  *	c - namecache mutex
  *	i - interlock
  *	l - mp mnt_listmtx or freelist mutex
  *	I - updated with atomics, 0->1 and 1->0 transitions with interlock held
  *	m - mount point interlock
  *	p - pollinfo lock
  *	u - Only a reference to the vnode is needed to read.
  *	v - vnode lock
  *
  * Vnodes may be found on many lists.  The general way to deal with operating
  * on a vnode that is on a list is:
  *	1) Lock the list and find the vnode.
  *	2) Lock interlock so that the vnode does not go away.
  *	3) Unlock the list to avoid lock order reversals.
  *	4) vget with LK_INTERLOCK and check for ENOENT, or
  *	5) Check for DOOMED if the vnode lock is not required.
  *	6) Perform your operation, then vput().
  */
 
 #if defined(_KERNEL) || defined(_KVM_VNODE)
 
 /*
  * The vnode itself.  The single letter that starts most field comments
  * names the lock covering that field, using the "Lock reference" key
  * given earlier in this file.
  */
 struct vnode {
 	/*
 	 * Fields which define the identity of the vnode.  These fields are
 	 * owned by the filesystem (XXX: and vgone() ?)
 	 */
 	enum	vtype v_type:8;			/* u vnode type */
 	enum	vstate v_state:8;		/* u vnode state */
 	short	v_irflag;			/* i frequently read flags */
 	seqc_t	v_seqc;				/* i modification count */
 	uint32_t v_nchash;			/* u namecache hash */
 	u_int	v_hash;				/* vfs_hash value (see v_hashlist) */
 	struct	vop_vector *v_op;		/* u vnode operations vector */
 	void	*v_data;			/* u private data for fs */
 
 	/*
 	 * Filesystem instance stuff
 	 */
 	struct	mount *v_mount;			/* u ptr to vfs we are in */
 	TAILQ_ENTRY(vnode) v_nmntvnodes;	/* m vnodes for mount point */
 
 	/*
 	 * Type specific fields, only one applies to any given vnode.
 	 */
 	union {
 		struct mount	*v_mountedhere;	/* v ptr to mountpoint (VDIR) */
 		struct unpcb	*v_unpcb;	/* v unix domain net (VSOCK) */
 		struct cdev	*v_rdev; 	/* v device (VCHR, VBLK) */
 		struct fifoinfo	*v_fifoinfo;	/* v fifo (VFIFO) */
 	};
 
 	/*
 	 * vfs_hash: (mount + inode) -> vnode hash.  The hash value
 	 * itself is grouped with other int fields, to avoid padding.
 	 */
 	LIST_ENTRY(vnode)	v_hashlist;
 
 	/*
 	 * VFS_namecache stuff
 	 */
 	LIST_HEAD(, namecache) v_cache_src;	/* c Cache entries from us */
 	TAILQ_HEAD(, namecache) v_cache_dst;	/* c Cache entries to us */
 	struct namecache *v_cache_dd;		/* c Cache entry for .. vnode */
 
 	/*
 	 * Locking
 	 */
 	struct	lock v_lock;			/* u (if fs don't have one) */
 	struct	mtx v_interlock;		/* lock for "i" things */
 	struct	lock *v_vnlock;			/* u pointer to vnode lock */
 
 	/*
 	 * The machinery of being a vnode
 	 */
 	TAILQ_ENTRY(vnode) v_vnodelist;		/* l vnode lists */
 	TAILQ_ENTRY(vnode) v_lazylist;		/* l vnode lazy list */
 	struct bufobj	v_bufobj;		/* * Buffer cache object */
 
 	/*
 	 * Hooks for various subsystems and features.
 	 */
 	struct vpollinfo *v_pollinfo;		/* i Poll events, p for *v_pi */
 	struct label *v_label;			/* MAC label for vnode */
 	struct lockf *v_lockf;		/* Byte-level advisory lock list */
 	struct rangelock v_rl;			/* Byte-range lock */
 
 	u_int	v_holdcnt;			/* I prevents recycling. */
 	u_int	v_usecount;			/* I ref count of users */
 	u_short	v_iflag;			/* i vnode flags (see below) */
 	u_short	v_vflag;			/* v vnode flags */
 	u_short	v_mflag;			/* l mnt-specific vnode flags */
 	short	v_dbatchcpu;			/* i LRU requeue deferral batch */
 	int	v_writecount;			/* I ref count of writers or
 						   (negative) text users */
 	int	v_seqc_users;			/* i modifications pending */
 };
 
 #ifndef DEBUG_LOCKS
 #ifdef _LP64
 /*
  * Not crossing 448 bytes fits 9 vnodes per page. If you have to add fields
  * to the structure and there is nothing which can be done to prevent growth
  * then so be it. But don't grow it without a good reason.
  */
 _Static_assert(sizeof(struct vnode) <= 448, "vnode size crosses 448 bytes");
 #endif
 #endif
 
 #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */
 
 #define	bo2vnode(bo)	__containerof((bo), struct vnode, v_bufobj)
 
 /* XXX: These are temporary to avoid a source sweep at this time */
 #define v_object	v_bufobj.bo_object
 
-/*
- * Userland version of struct vnode, for sysctl.
- */
-struct xvnode {
-	size_t	xv_size;			/* sizeof(struct xvnode) */
-	void	*xv_vnode;			/* address of real vnode */
-	u_long	xv_flag;			/* vnode vflags */
-	int	xv_usecount;			/* reference count of users */
-	int	xv_writecount;			/* reference count of writers */
-	int	xv_holdcnt;			/* page & buffer references */
-	u_long	xv_id;				/* capability identifier */
-	void	*xv_mount;			/* address of parent mount */
-	long	xv_numoutput;			/* num of writes in progress */
-	enum	vtype xv_type;			/* vnode type */
-	union {
-		void	*xvu_socket;		/* unpcb, if VSOCK */
-		void	*xvu_fifo;		/* fifo, if VFIFO */
-		dev_t	xvu_rdev;		/* maj/min, if VBLK/VCHR */
-		struct {
-			dev_t	xvu_dev;	/* device, if VDIR/VREG/VLNK */
-			ino_t	xvu_ino;	/* id, if VDIR/VREG/VLNK */
-		} xv_uns;
-	} xv_un;
-};
-#define xv_socket	xv_un.xvu_socket
-#define xv_fifo		xv_un.xvu_fifo
-#define xv_rdev		xv_un.xvu_rdev
-#define xv_dev		xv_un.xv_uns.xvu_dev
-#define xv_ino		xv_un.xv_uns.xvu_ino
-
 /*
  * Knote helpers for vnodes.
  *
  * VN_KNLIST_EMPTY(vp) is true when the vnode has no pollinfo or when the
  * knote list inside its pollinfo is empty.  We don't need to lock the
  * knlist for this check.
  *
  * VN_KNOTE(vp, b, a) calls KNOTE() on the vnode's knote list with hint
  * (b) and flags (a) | KNF_NOKQLOCK, but only if the list is not empty.
  * Every use of the vp argument is parenthesized so that any pointer
  * expression may be passed; note that vp is evaluated more than once.
  *
  * VN_KNOTE_LOCKED() and VN_KNOTE_UNLOCKED() are VN_KNOTE() with the flags
  * fixed to KNF_LISTLOCKED and 0 respectively.
  */
 #define	VN_KNLIST_EMPTY(vp) ((vp)->v_pollinfo == NULL ||	\
 	    KNLIST_EMPTY(&(vp)->v_pollinfo->vpi_selinfo.si_note))
 
 #define VN_KNOTE(vp, b, a)					\
 	do {							\
 		if (!VN_KNLIST_EMPTY(vp))			\
 			KNOTE(&(vp)->v_pollinfo->vpi_selinfo.si_note, (b), \
 			    (a) | KNF_NOKQLOCK);		\
 	} while (0)
 #define	VN_KNOTE_LOCKED(vp, b)		VN_KNOTE(vp, b, KNF_LISTLOCKED)
 #define	VN_KNOTE_UNLOCKED(vp, b)	VN_KNOTE(vp, b, 0)
 
 /*
  * Vnode flags.
  *	VI flags are protected by interlock and live in v_iflag
  *	VV flags are protected by the vnode lock and live in v_vflag
  *
  *	VIRF_DOOMED is doubly protected by the interlock and vnode lock.  Both
  *	are required for writing but the status may be checked with either.
  */
 #define	VHOLD_NO_SMR	(1<<29)	/* Disable vhold_smr */
 #define VHOLD_ALL_FLAGS (VHOLD_NO_SMR)
 
 #define	VIRF_DOOMED	0x0001	/* This vnode is being recycled */
 #define	VIRF_PGREAD	0x0002	/* Direct reads from the page cache are permitted,
 				   never cleared once set */
 #define	VIRF_MOUNTPOINT	0x0004	/* This vnode is mounted on */
 #define	VIRF_TEXT_REF	0x0008	/* Executable mappings ref the vnode */
 
 #define	VI_UNUSED0	0x0001	/* unused */
 #define	VI_MOUNT	0x0002	/* Mount in progress */
 #define	VI_DOINGINACT	0x0004	/* VOP_INACTIVE is in progress */
 #define	VI_OWEINACT	0x0008	/* Need to call inactive */
 #define	VI_DEFINACT	0x0010	/* deferred inactive */
 #define	VI_FOPENING	0x0020	/* In open, with opening process having the
 				   first right to advlock file */
 
 #define	VV_ROOT		0x0001	/* root of its filesystem */
 #define	VV_ISTTY	0x0002	/* vnode represents a tty */
 #define	VV_NOSYNC	0x0004	/* unlinked, stop syncing */
 #define	VV_ETERNALDEV	0x0008	/* device that is never destroyed */
 #define	VV_CACHEDLABEL	0x0010	/* Vnode has valid cached MAC label */
 #define	VV_VMSIZEVNLOCK	0x0020	/* object size check requires vnode lock */
 #define	VV_COPYONWRITE	0x0040	/* vnode is doing copy-on-write */
 #define	VV_SYSTEM	0x0080	/* vnode being used by kernel */
 #define	VV_PROCDEP	0x0100	/* vnode is process dependent */
 #define	VV_UNLINKED	0x0200	/* unlinked but still open directory */
 #define	VV_DELETED	0x0400	/* should be removed */
 #define	VV_MD		0x0800	/* vnode backs the md device */
 #define	VV_FORCEINSMQ	0x1000	/* force the insmntque to succeed */
 #define	VV_READLINK	0x2000	/* fdescfs linux vnode */
 #define	VV_UNREF	0x4000	/* vunref, do not drop lock in inactive() */
 #define	VV_CROSSLOCK	0x8000	/* vnode lock is shared w/ root mounted here */
 
 #define	VMP_LAZYLIST	0x0001	/* Vnode is on mnt's lazy list */
 
 /*
  * Vnode attributes.  A field value of VNOVAL represents a field whose value
  * is unavailable (getattr) or which is not to be changed (setattr).
  */
 struct vattr {
 	enum vtype	va_type;	/* vnode type (for create) */
 	u_short		va_mode;	/* files access mode and type */
 	u_short		va_padding0;
 	uid_t		va_uid;		/* owner user id */
 	gid_t		va_gid;		/* owner group id */
 	nlink_t		va_nlink;	/* number of references to file */
 	dev_t		va_fsid;	/* filesystem id */
 	ino_t		va_fileid;	/* file id */
 	u_quad_t	va_size;	/* file size in bytes */
 	long		va_blocksize;	/* blocksize preferred for i/o */
 	struct timespec	va_atime;	/* time of last access */
 	struct timespec	va_mtime;	/* time of last modification */
 	struct timespec	va_ctime;	/* time file changed */
 	struct timespec	va_birthtime;	/* time file created */
 	u_long		va_gen;		/* generation number of file */
 	u_long		va_flags;	/* flags defined for file */
 	dev_t		va_rdev;	/* device the special file represents */
 	u_quad_t	va_bytes;	/* bytes of disk space held by file */
 	u_quad_t	va_filerev;	/* file modification number */
 	u_int		va_vaflags;	/* operations flags, see below */
 	long		va_spare;	/* remain quad aligned */
 };
 
 /*
  * Flags for va_vaflags.
  */
 #define	VA_UTIMES_NULL	0x01		/* utimes argument was NULL */
 #define	VA_EXCLUSIVE	0x02		/* exclusive create request */
 #define	VA_SYNC		0x04		/* O_SYNC truncation */
 
 /*
  * Flags for ioflag. (high 16 bits used to ask for read-ahead and
  * help with write clustering)
  * NB: IO_NDELAY and IO_DIRECT are linked to fcntl.h
  */
 #define	IO_UNIT		0x0001		/* do I/O as atomic unit */
 #define	IO_APPEND	0x0002		/* append write to end */
 #define	IO_NDELAY	0x0004		/* FNDELAY flag set in file table */
 #define	IO_NODELOCKED	0x0008		/* underlying node already locked */
 #define	IO_ASYNC	0x0010		/* bawrite rather than bdwrite */
 #define	IO_VMIO		0x0020		/* data already in VMIO space */
 #define	IO_INVAL	0x0040		/* invalidate after I/O */
 #define	IO_SYNC		0x0080		/* do I/O synchronously */
 #define	IO_DIRECT	0x0100		/* attempt to bypass buffer cache */
 #define	IO_NOREUSE	0x0200		/* VMIO data won't be reused */
 #define	IO_EXT		0x0400		/* operate on external attributes */
 #define	IO_NORMAL	0x0800		/* operate on regular data */
 #define	IO_NOMACCHECK	0x1000		/* MAC checks unnecessary */
 #define	IO_BUFLOCKED	0x2000		/* ffs flag; indir buf is locked */
 #define	IO_RANGELOCKED	0x4000		/* range locked */
 #define	IO_DATASYNC	0x8000		/* do only data I/O synchronously */
 
 #define IO_SEQMAX	0x7F		/* seq heuristic max value */
 #define IO_SEQSHIFT	16		/* seq heuristic in upper 16 bits */
 
 /*
  * Flags for accmode_t.
  */
 #define	VEXEC			000000000100 /* execute/search permission */
 #define	VWRITE			000000000200 /* write permission */
 #define	VREAD			000000000400 /* read permission */
 #define	VADMIN			000000010000 /* being the file owner */
 #define	VAPPEND			000000040000 /* permission to write/append */
 /*
  * VEXPLICIT_DENY makes VOP_ACCESSX(9) return EPERM or EACCES only
  * if permission was denied explicitly, by a "deny" rule in NFSv4 ACL,
  * and 0 otherwise.  This never happens with ordinary unix access rights
  * or POSIX.1e ACLs.  Obviously, VEXPLICIT_DENY must be OR-ed with
  * some other V* constant.
  */
 #define	VEXPLICIT_DENY		000000100000
 #define	VREAD_NAMED_ATTRS 	000000200000 /* not used */
 #define	VWRITE_NAMED_ATTRS 	000000400000 /* not used */
 #define	VDELETE_CHILD	 	000001000000
 #define	VREAD_ATTRIBUTES 	000002000000 /* permission to stat(2) */
 #define	VWRITE_ATTRIBUTES 	000004000000 /* change {m,c,a}time */
 #define	VDELETE		 	000010000000
 #define	VREAD_ACL	 	000020000000 /* read ACL and file mode */
 #define	VWRITE_ACL	 	000040000000 /* change ACL and/or file mode */
 #define	VWRITE_OWNER	 	000100000000 /* change file owner */
 #define	VSYNCHRONIZE	 	000200000000 /* not used */
 #define	VCREAT			000400000000 /* creating new file */
 #define	VVERIFY			001000000000 /* verification required */
 
 /*
  * Permissions that were traditionally granted only to the file owner.
  */
 #define VADMIN_PERMS	(VADMIN | VWRITE_ATTRIBUTES | VWRITE_ACL | \
     VWRITE_OWNER)
 
 /*
  * Permissions that were traditionally granted to everyone.
  */
 #define VSTAT_PERMS	(VREAD_ATTRIBUTES | VREAD_ACL)
 
 /*
  * Permissions that allow changing the state of the file in any way.
  */
 #define VMODIFY_PERMS	(VWRITE | VAPPEND | VADMIN_PERMS | VDELETE_CHILD | \
     VDELETE)
 
 /*
  * Token indicating no attribute value yet assigned.
  */
 #define	VNOVAL	(-1)
 
 /*
  * LK_TIMELOCK timeout for vnode locks (used mainly by the pageout daemon)
  */
 #define VLKTIMEOUT	(hz / 20 + 1)
 
 #ifdef _KERNEL
 
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_VNODE);
 #endif
 
 extern u_int ncsizefactor;
 extern const u_int io_hold_cnt;
 
 /*
  * Convert between vnode types and inode formats (since POSIX.1
  * defines mode word of stat structure in terms of inode formats).
  */
 extern enum vtype	iftovt_tab[];
 extern int		vttoif_tab[];
 #define	IFTOVT(mode)	(iftovt_tab[((mode) & S_IFMT) >> 12])
 #define	VTTOIF(indx)	(vttoif_tab[(int)(indx)])
 #define	MAKEIMODE(indx, mode)	(int)(VTTOIF(indx) | (mode))
 
 /*
  * Flags to various vnode functions.
  */
 #define	SKIPSYSTEM	0x0001	/* vflush: skip vnodes marked VSYSTEM */
 #define	FORCECLOSE	0x0002	/* vflush: force file closure */
 #define	WRITECLOSE	0x0004	/* vflush: only close writable files */
 #define	EARLYFLUSH	0x0008	/* vflush: early call for ffs_flushfiles */
 #define	V_SAVE		0x0001	/* vinvalbuf: sync file first */
 #define	V_ALT		0x0002	/* vinvalbuf: invalidate only alternate bufs */
 #define	V_NORMAL	0x0004	/* vinvalbuf: invalidate only regular bufs */
 #define	V_CLEANONLY	0x0008	/* vinvalbuf: invalidate only clean bufs */
 #define	V_VMIO		0x0010	/* vinvalbuf: called during pageout */
 #define	V_ALLOWCLEAN	0x0020	/* vinvalbuf: allow clean buffers after flush */
 #define	REVOKEALL	0x0001	/* vop_revoke: revoke all aliases */
 #define	V_WAIT		0x0001	/* vn_start_write: sleep for suspend */
 #define	V_NOWAIT	0x0002	/* vn_start_write: don't sleep for suspend */
 #define	V_XSLEEP	0x0004	/* vn_start_write: just return after sleep */
 #define	V_PCATCH	0x0008	/* vn_start_write: make the sleep interruptible */
 #define	V_VALID_FLAGS (V_WAIT | V_NOWAIT | V_XSLEEP | V_PCATCH)
 
 #define	VR_START_WRITE	0x0001	/* vfs_write_resume: start write atomically */
 #define	VR_NO_SUSPCLR	0x0002	/* vfs_write_resume: do not clear suspension */
 
 #define	VS_SKIP_UNMOUNT	0x0001	/* vfs_write_suspend: fail if the
 				   filesystem is being unmounted */
 
 #define	VREF(vp)	vref(vp)
 
 #ifdef DIAGNOSTIC
 #define	VATTR_NULL(vap)	vattr_null(vap)
 #else
 #define	VATTR_NULL(vap)	(*(vap) = va_null)	/* initialize a vattr */
 #endif /* DIAGNOSTIC */
 
 #define	NULLVP	((struct vnode *)NULL)
 
 /*
  * Global vnode data.
  */
 extern	struct vnode *rootvnode;	/* root (i.e. "/") vnode */
 extern	struct mount *rootdevmp;	/* "/dev" mount */
 extern	u_long desiredvnodes;		/* number of vnodes desired */
 extern	struct uma_zone *namei_zone;
 extern	struct vattr va_null;		/* predefined null vattr structure */
 
 extern	u_int vn_lock_pair_pause_max;
 
 #define	VI_LOCK(vp)	mtx_lock(&(vp)->v_interlock)
 #define	VI_LOCK_FLAGS(vp, flags) mtx_lock_flags(&(vp)->v_interlock, (flags))
 #define	VI_TRYLOCK(vp)	mtx_trylock(&(vp)->v_interlock)
 #define	VI_UNLOCK(vp)	mtx_unlock(&(vp)->v_interlock)
 #define	VI_MTX(vp)	(&(vp)->v_interlock)
 
 #define	VN_LOCK_AREC(vp)	lockallowrecurse((vp)->v_vnlock)
 #define	VN_LOCK_ASHARE(vp)	lockallowshare((vp)->v_vnlock)
 #define	VN_LOCK_DSHARE(vp)	lockdisableshare((vp)->v_vnlock)
 
 #endif /* _KERNEL */
 
 /*
  * Mods for extensibility.
  */
 
 /*
  * Flags for vdesc_flags:
  */
 #define	VDESC_MAX_VPS		16
 /* Low order 16 flag bits are reserved for willrele flags for vp arguments. */
 #define	VDESC_VP0_WILLRELE	0x0001
 #define	VDESC_VP1_WILLRELE	0x0002
 #define	VDESC_VP2_WILLRELE	0x0004
 #define	VDESC_VP3_WILLRELE	0x0008
 
 /*
  * A generic structure.
  * This can be used by bypass routines to identify generic arguments.
  */
 struct vop_generic_args {
 	struct vnodeop_desc *a_desc;
 	/* other random data follows, presumably */
 };
 
 typedef int vop_bypass_t(struct vop_generic_args *);
 
 /*
  * VDESC_NO_OFFSET is used to identify the end of the offset list
  * and in places where no such field exists.  The value is parenthesized
  * so the macro expands to a single primary expression in any context.
  */
 #define VDESC_NO_OFFSET (-1)
 
 /*
  * This structure describes the vnode operation taking place.
  */
 struct vnodeop_desc {
 	char	*vdesc_name;		/* a readable name for debugging */
 	int	 vdesc_flags;		/* VDESC_* flags */
 	int	vdesc_vop_offset;
 	vop_bypass_t	*vdesc_call;	/* Function to call */
 
 	/*
 	 * These ops are used by bypass routines to map and locate arguments.
 	 * Creds and procs are not needed in bypass routines, but sometimes
 	 * they are useful to (for example) transport layers.
 	 * Nameidata is useful because it has a cred in it.
 	 */
 	int	*vdesc_vp_offsets;	/* list ended by VDESC_NO_OFFSET */
 	int	vdesc_vpp_offset;	/* return vpp location */
 	int	vdesc_cred_offset;	/* cred location, if any */
 	int	vdesc_thread_offset;	/* thread location, if any */
 	int	vdesc_componentname_offset; /* if any */
 };
 
 #ifdef _KERNEL
 /*
  * A list of all the operation descs.
  */
 extern struct vnodeop_desc *vnodeop_descs[];
 
 #define	VOPARG_OFFSETOF(s_type, field)	__offsetof(s_type, field)
 #define	VOPARG_OFFSETTO(s_type, s_offset, struct_p) \
     ((s_type)(((char*)(struct_p)) + (s_offset)))
 
 #ifdef DEBUG_VFS_LOCKS
 /*
  * Support code to aid in debugging VFS locking problems.  Not totally
  * reliable since if the thread sleeps between changing the lock
  * state and checking it with the assert, some other thread could
  * change the state.  They are good enough for debugging a single
  * filesystem using a single-threaded test.  Note that the unreliability is
  * limited to false negatives; efforts were made to ensure that false
  * positives cannot occur.
  */
 void	assert_vi_locked(struct vnode *vp, const char *str);
 void	assert_vi_unlocked(struct vnode *vp, const char *str);
 void	assert_vop_elocked(struct vnode *vp, const char *str);
 void	assert_vop_locked(struct vnode *vp, const char *str);
 void	assert_vop_unlocked(struct vnode *vp, const char *str);
 
 #define	ASSERT_VI_LOCKED(vp, str)	assert_vi_locked((vp), (str))
 #define	ASSERT_VI_UNLOCKED(vp, str)	assert_vi_unlocked((vp), (str))
 #define	ASSERT_VOP_ELOCKED(vp, str)	assert_vop_elocked((vp), (str))
 #define	ASSERT_VOP_LOCKED(vp, str)	assert_vop_locked((vp), (str))
 #define	ASSERT_VOP_UNLOCKED(vp, str)	assert_vop_unlocked((vp), (str))
 
 #define ASSERT_VOP_IN_SEQC(vp)	do {				\
 	struct vnode *_vp = (vp);				\
 								\
 	VNPASS(seqc_in_modify(_vp->v_seqc), _vp);		\
 } while (0)
 
 #define ASSERT_VOP_NOT_IN_SEQC(vp)	do {			\
 	struct vnode *_vp = (vp);				\
 								\
 	VNPASS(!seqc_in_modify(_vp->v_seqc), _vp);		\
 } while (0)
 
 #else /* !DEBUG_VFS_LOCKS */
 
 #define	ASSERT_VI_LOCKED(vp, str)	((void)0)
 #define	ASSERT_VI_UNLOCKED(vp, str)	((void)0)
 #define	ASSERT_VOP_ELOCKED(vp, str)	((void)0)
 #define	ASSERT_VOP_LOCKED(vp, str)	((void)0)
 #define	ASSERT_VOP_UNLOCKED(vp, str)	((void)0)
 
 #define ASSERT_VOP_IN_SEQC(vp)		((void)0)
 #define ASSERT_VOP_NOT_IN_SEQC(vp)	((void)0)
 
 #endif /* DEBUG_VFS_LOCKS */
 
 /*
  * This call works for vnodes in the kernel.
  */
 #define VCALL(c) ((c)->a_desc->vdesc_call(c))
 
 #define DOINGASYNC(vp)	   					\
 	(((vp)->v_mount->mnt_kern_flag & MNTK_ASYNC) != 0 &&	\
 	 ((curthread->td_pflags & TDP_SYNCIO) == 0))
 
 /*
  * VMIO support inline
  */
 
 extern int vmiodirenable;
 
 static __inline int
 vn_canvmio(struct vnode *vp)
 {
       if (vp && (vp->v_type == VREG || (vmiodirenable && vp->v_type == VDIR)))
 		return(TRUE);
 	return(FALSE);
 }
 
 /*
  * Finally, include the default set of vnode operations.
  */
 typedef void vop_getpages_iodone_t(void *, vm_page_t *, int, int);
 #include "vnode_if.h"
 
 /* vn_open_flags */
 #define	VN_OPEN_NOAUDIT		0x00000001
 #define	VN_OPEN_NOCAPCHECK	0x00000002
 #define	VN_OPEN_NAMECACHE	0x00000004
 #define	VN_OPEN_INVFS		0x00000008
 #define	VN_OPEN_WANTIOCTLCAPS	0x00000010
 
 /* copy_file_range kernel flags */
 #define	COPY_FILE_RANGE_KFLAGS		0xff000000
 #define	COPY_FILE_RANGE_TIMEO1SEC	0x01000000	/* Return after 1sec. */
 
 /*
  * Public vnode manipulation functions.
  */
 struct componentname;
 struct file;
 struct mount;
 struct nameidata;
 struct ostat;
 struct freebsd11_stat;
 struct thread;
 struct proc;
 struct stat;
 struct nstat;
 struct ucred;
 struct uio;
 struct vattr;
 struct vfsops;
 struct vnode;
 
 typedef int (*vn_get_ino_t)(struct mount *, void *, int, struct vnode **);
 
 int	bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn,
 	    daddr_t endn);
 /* cache_* may belong in namei.h. */
 void	cache_changesize(u_long newhashsize);
 
 #define	VFS_CACHE_DROPOLD	0x1
 
 void	cache_enter_time_flags(struct vnode *dvp, struct vnode *vp,
 	    struct componentname *cnp, struct timespec *tsp,
 	    struct timespec *dtsp, int flags);
 #define	cache_enter(dvp, vp, cnp)					\
 	cache_enter_time(dvp, vp, cnp, NULL, NULL)
 void	cache_enter_time(struct vnode *dvp, struct vnode *vp,
 	    struct componentname *cnp, struct timespec *tsp,
 	    struct timespec *dtsp);
 int	cache_lookup(struct vnode *dvp, struct vnode **vpp,
 	    struct componentname *cnp, struct timespec *tsp, int *ticksp);
 void	cache_vnode_init(struct vnode *vp);
 void	cache_purge(struct vnode *vp);
 void	cache_purge_vgone(struct vnode *vp);
 void	cache_purge_negative(struct vnode *vp);
 void	cache_purgevfs(struct mount *mp);
 char	*cache_symlink_alloc(size_t size, int flags);
 void	cache_symlink_free(char *string, size_t size);
 int	cache_symlink_resolve(struct cache_fpl *fpl, const char *string,
 	    size_t len);
 void	cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
     struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp);
 void	cache_vop_rmdir(struct vnode *dvp, struct vnode *vp);
 #ifdef INVARIANTS
 void	cache_validate(struct vnode *dvp, struct vnode *vp,
 	    struct componentname *cnp);
 void	cache_assert_no_entries(struct vnode *vp);
 #else
 /*
  * Stub selected when INVARIANTS is not defined: accepts the same
  * arguments as the real cache_validate() and does nothing.
  */
 static inline void
 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
 {
 }
 
 /*
  * Stub selected when INVARIANTS is not defined: accepts the same
  * argument as the real cache_assert_no_entries() and does nothing.
  */
 static inline void
 cache_assert_no_entries(struct vnode *vp)
 {
 }
 #endif
 void	cache_fast_lookup_enabled_recalc(void);
 int	change_dir(struct vnode *vp, struct thread *td);
 void	cvtstat(struct stat *st, struct ostat *ost);
 int	freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb);
 int	freebsd11_cvtstat(struct stat *st, struct freebsd11_stat *ost);
 int	getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
 	    struct vnode **vpp);
 void	getnewvnode_reserve(void);
 void	getnewvnode_drop_reserve(void);
 int	insmntque(struct vnode *vp, struct mount *mp);
 int	insmntque1(struct vnode *vp, struct mount *mp);
 u_quad_t init_va_filerev(void);
 int	speedup_syncer(void);
 int	vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen);
 int	vn_getcwd(char *buf, char **retbuf, size_t *buflen);
 int	vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf);
 int	vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf);
 int	vn_fullpath_hardlink(struct vnode *vp, struct vnode *dvp,
 	    const char *hdrl_name, size_t hrdl_name_length, char **retbuf,
 	    char **freebuf, size_t *buflen);
 struct vnode *
 	vn_dir_dd_ino(struct vnode *vp);
 int	vn_commname(struct vnode *vn, char *buf, u_int buflen);
 int	vn_path_to_global_path(struct thread *td, struct vnode *vp,
 	    char *path, u_int pathlen);
 int	vn_path_to_global_path_hardlink(struct thread *td, struct vnode *vp,
 	    struct vnode *dvp, char *path, u_int pathlen, const char *leaf_name,
 	    size_t leaf_length);
 int	vaccess(enum vtype type, mode_t file_mode, uid_t file_uid,
 	    gid_t file_gid, accmode_t accmode, struct ucred *cred);
 int	vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid,
 	    struct ucred *cred);
 int	vaccess_acl_nfs4(enum vtype type, uid_t file_uid, gid_t file_gid,
 	    struct acl *aclp, accmode_t accmode, struct ucred *cred);
 int	vaccess_acl_posix1e(enum vtype type, uid_t file_uid,
 	    gid_t file_gid, struct acl *acl, accmode_t accmode,
 	    struct ucred *cred);
 void	vattr_null(struct vattr *vap);
 void	vlazy(struct vnode *);
 void	vdrop(struct vnode *);
 void	vdropl(struct vnode *);
 int	vflush(struct mount *mp, int rootrefs, int flags, struct thread *td);
 int	vget(struct vnode *vp, int flags);
 enum vgetstate	vget_prep_smr(struct vnode *vp);
 enum vgetstate	vget_prep(struct vnode *vp);
 int	vget_finish(struct vnode *vp, int flags, enum vgetstate vs);
 void	vget_finish_ref(struct vnode *vp, enum vgetstate vs);
 void	vget_abort(struct vnode *vp, enum vgetstate vs);
 void	vgone(struct vnode *vp);
 void	vhold(struct vnode *);
 void	vholdnz(struct vnode *);
 bool	vhold_smr(struct vnode *);
 int	vinactive(struct vnode *vp);
 int	vinvalbuf(struct vnode *vp, int save, int slpflag, int slptimeo);
 int	vtruncbuf(struct vnode *vp, off_t length, int blksize);
 void	v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn,
 	    int blksize);
 void	vunref(struct vnode *);
 void	vn_printf(struct vnode *vp, const char *fmt, ...) __printflike(2,3);
 int	vrecycle(struct vnode *vp);
 int	vrecyclel(struct vnode *vp);
 int	vn_bmap_seekhole_locked(struct vnode *vp, u_long cmd, off_t *off,
 	    struct ucred *cred);
 int	vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off,
 	    struct ucred *cred);
 int	vn_close(struct vnode *vp,
 	    int flags, struct ucred *file_cred, struct thread *td);
 int	vn_copy_file_range(struct vnode *invp, off_t *inoffp,
 	    struct vnode *outvp, off_t *outoffp, size_t *lenp,
 	    unsigned int flags, struct ucred *incred, struct ucred *outcred,
 	    struct thread *fsize_td);
 int	vn_deallocate(struct vnode *vp, off_t *offset, off_t *length, int flags,
 	    int ioflg, struct ucred *active_cred, struct ucred *file_cred);
 void	vn_finished_write(struct mount *mp);
 void	vn_finished_secondary_write(struct mount *mp);
 int	vn_fsync_buf(struct vnode *vp, int waitfor);
 int	vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp,
 	    struct vnode *outvp, off_t *outoffp, size_t *lenp,
 	    unsigned int flags, struct ucred *incred, struct ucred *outcred,
 	    struct thread *fsize_td);
 int	vn_need_pageq_flush(struct vnode *vp);
 bool	vn_isdisk_error(struct vnode *vp, int *errp);
 bool	vn_isdisk(struct vnode *vp);
 int	_vn_lock(struct vnode *vp, int flags, const char *file, int line);
 #define vn_lock(vp, flags) _vn_lock(vp, flags, __FILE__, __LINE__)
 void	vn_lock_pair(struct vnode *vp1, bool vp1_locked, struct vnode *vp2,
 	    bool vp2_locked);
 int	vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp);
 int	vn_open_cred(struct nameidata *ndp, int *flagp, int cmode,
 	    u_int vn_open_flags, struct ucred *cred, struct file *fp);
 int	vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
 	    struct thread *td, struct file *fp);
 void	vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end);
 void	vn_pages_remove_valid(struct vnode *vp, vm_pindex_t start,
 	    vm_pindex_t end);
 int	vn_pollrecord(struct vnode *vp, struct thread *p, int events);
 int	vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base,
 	    int len, off_t offset, enum uio_seg segflg, int ioflg,
 	    struct ucred *active_cred, struct ucred *file_cred, ssize_t *aresid,
 	    struct thread *td);
 int	vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base,
 	    size_t len, off_t offset, enum uio_seg segflg, int ioflg,
 	    struct ucred *active_cred, struct ucred *file_cred, size_t *aresid,
 	    struct thread *td);
 int	vn_read_from_obj(struct vnode *vp, struct uio *uio);
 int	vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
 	    struct thread *td);
 int	vn_rlimit_fsizex(const struct vnode *vp, struct uio *uio,
 	    off_t maxfsz, ssize_t *resid_adj, struct thread *td);
 void	vn_rlimit_fsizex_res(struct uio *uio, ssize_t resid_adj);
 int	vn_rlimit_trunc(u_quad_t size, struct thread *td);
 int	vn_start_write(struct vnode *vp, struct mount **mpp, int flags);
 int	vn_start_secondary_write(struct vnode *vp, struct mount **mpp,
 	    int flags);
 int	vn_truncate_locked(struct vnode *vp, off_t length, bool sync,
 	    struct ucred *cred);
 int	vn_writechk(struct vnode *vp);
 int	vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
 	    const char *attrname, int *buflen, char *buf, struct thread *td);
 int	vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
 	    const char *attrname, int buflen, char *buf, struct thread *td);
 int	vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
 	    const char *attrname, struct thread *td);
 int	vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags,
 	    struct vnode **rvp);
 int	vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc,
 	    void *alloc_arg, int lkflags, struct vnode **rvp);
 int	vn_utimes_perm(struct vnode *vp, struct vattr *vap,
 	    struct ucred *cred, struct thread *td);
 
 /*
  * uio transfer helpers: one moves a flat buffer (data, xfersize), the other
  * an array of VM pages starting at the given offset.
  * NOTE(review): presumably fault-aware wrappers around the plain uiomove
  * routines -- confirm in the implementation.
  */
 int	vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio);
 int	vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
 	    struct uio *uio);
 
 /*
  * Per-vnode sequence counter interface.  The write begin/end functions are
  * implemented elsewhere; the read-side macros below simply forward to the
  * seqc_*() primitives on the vnode's v_seqc field.
  */
 void	vn_seqc_write_begin_locked(struct vnode *vp);
 void	vn_seqc_write_begin(struct vnode *vp);
 void	vn_seqc_write_end_locked(struct vnode *vp);
 void	vn_seqc_write_end(struct vnode *vp);
 #define	vn_seqc_read_any(vp)		seqc_read_any(&(vp)->v_seqc)
 #define	vn_seqc_read_notmodify(vp)	seqc_read_notmodify(&(vp)->v_seqc)
 #define	vn_seqc_consistent(vp, seq)	seqc_consistent(&(vp)->v_seqc, seq)
 
 /*
  * Range-lock wrappers.  Each macro forwards to the matching rangelock_*()
  * routine, passing the vnode's v_rl range lock and VI_MTX(vp) as the
  * associated mutex.  Note that vp is expanded twice in every macro, so the
  * argument must not have side effects.
  */
 #define	vn_rangelock_unlock(vp, cookie)					\
 	rangelock_unlock(&(vp)->v_rl, (cookie), VI_MTX(vp))
 #define	vn_rangelock_unlock_range(vp, cookie, start, end)		\
 	rangelock_unlock_range(&(vp)->v_rl, (cookie), (start), (end), 	\
 	    VI_MTX(vp))
 #define	vn_rangelock_rlock(vp, start, end)				\
 	rangelock_rlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
 #define	vn_rangelock_tryrlock(vp, start, end)				\
 	rangelock_tryrlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
 #define	vn_rangelock_wlock(vp, start, end)				\
 	rangelock_wlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
 #define	vn_rangelock_trywlock(vp, start, end)				\
 	rangelock_trywlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
 
 /*
  * Accessors for the vnode's v_irflag field.  Reads go through
  * atomic_load_short(); the modifying functions take the bits to set or
  * clear as a short and are implemented elsewhere.
  * NOTE(review): the _locked variants presumably expect the caller to
  * already hold the relevant lock -- confirm in the implementation.
  */
 #define	vn_irflag_read(vp)	atomic_load_short(&(vp)->v_irflag)
 void	vn_irflag_set_locked(struct vnode *vp, short toset);
 void	vn_irflag_set(struct vnode *vp, short toset);
 void	vn_irflag_set_cond_locked(struct vnode *vp, short toset);
 void	vn_irflag_set_cond(struct vnode *vp, short toset);
 void	vn_irflag_unset_locked(struct vnode *vp, short tounset);
 void	vn_irflag_unset(struct vnode *vp, short tounset);
 
 /* Miscellaneous VFS-level helpers (implemented elsewhere). */
 int	vfs_cache_lookup(struct vop_lookup_args *ap);
 int	vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp);
 void	vfs_timestamp(struct timespec *);
 void	vfs_write_resume(struct mount *mp, int flags);
 int	vfs_write_suspend(struct mount *mp, int flags);
 int	vfs_write_suspend_umnt(struct mount *mp);
 struct vnode *vnlru_alloc_marker(void);
 void	vnlru_free_marker(struct vnode *);
 void	vnlru_free_vfsops(int, struct vfsops *, struct vnode *);
 /*
  * vop_std*() and related routines: each takes the argument structure of the
  * VOP named in its parameter type.
  * NOTE(review): presumably these are the default implementations a
  * filesystem can plug into its vop_vector -- confirm against the users.
  */
 int	vop_stdbmap(struct vop_bmap_args *);
 int	vop_stdfdatasync_buf(struct vop_fdatasync_args *);
 int	vop_stdfsync(struct vop_fsync_args *);
 int	vop_stdgetwritemount(struct vop_getwritemount_args *);
 int	vop_stdgetpages(struct vop_getpages_args *);
 int	vop_stdinactive(struct vop_inactive_args *);
 int	vop_stdioctl(struct vop_ioctl_args *);
 int	vop_stdneed_inactive(struct vop_need_inactive_args *);
 int	vop_stdkqfilter(struct vop_kqfilter_args *);
 int	vop_stdlock(struct vop_lock1_args *);
 int	vop_stdunlock(struct vop_unlock_args *);
 int	vop_stdislocked(struct vop_islocked_args *);
 int	vop_lock(struct vop_lock1_args *);
 int	vop_unlock(struct vop_unlock_args *);
 int	vop_islocked(struct vop_islocked_args *);
 int	vop_stdputpages(struct vop_putpages_args *);
 int	vop_nopoll(struct vop_poll_args *);
 int	vop_stdaccess(struct vop_access_args *ap);
 int	vop_stdaccessx(struct vop_accessx_args *ap);
 int	vop_stdadvise(struct vop_advise_args *ap);
 int	vop_stdadvlock(struct vop_advlock_args *ap);
 int	vop_stdadvlockasync(struct vop_advlockasync_args *ap);
 int	vop_stdadvlockpurge(struct vop_advlockpurge_args *ap);
 int	vop_stdallocate(struct vop_allocate_args *ap);
 int	vop_stddeallocate(struct vop_deallocate_args *ap);
 int	vop_stdset_text(struct vop_set_text_args *ap);
 int	vop_stdpathconf(struct vop_pathconf_args *);
 int	vop_stdpoll(struct vop_poll_args *);
 int	vop_stdvptocnp(struct vop_vptocnp_args *ap);
 int	vop_stdvptofh(struct vop_vptofh_args *ap);
 int	vop_stdunp_bind(struct vop_unp_bind_args *ap);
 int	vop_stdunp_connect(struct vop_unp_connect_args *ap);
 int	vop_stdunp_detach(struct vop_unp_detach_args *ap);
 int	vop_stdadd_writecount_nomsync(struct vop_add_writecount_args *ap);
 /*
  * Generic handlers operating on struct vop_generic_args; these are the
  * functions wrapped by the VOP_EOPNOTSUPP/VOP_EBADF/... macros in this
  * header.
  * NOTE(review): each presumably just returns the errno embedded in its
  * name (vop_null returning 0, vop_panic panicking) -- confirm in the
  * implementation.
  */
 int	vop_eopnotsupp(struct vop_generic_args *ap);
 int	vop_ebadf(struct vop_generic_args *ap);
 int	vop_einval(struct vop_generic_args *ap);
 int	vop_enoent(struct vop_generic_args *ap);
 int	vop_enotty(struct vop_generic_args *ap);
 int	vop_eagain(struct vop_generic_args *ap);
 int	vop_null(struct vop_generic_args *ap);
 int	vop_panic(struct vop_generic_args *ap);
 /* NOTE(review): presumably the poll/read/write entries of dead_vnodeops. */
 int	dead_poll(struct vop_poll_args *ap);
 int	dead_read(struct vop_read_args *ap);
 int	dead_write(struct vop_write_args *ap);
 
 /*
  * These are called from within the actual VOPS.
  *
  * The *_pre hooks receive the VOP argument structure as an untyped pointer;
  * the *_post hooks additionally receive an int rc (presumably the return
  * value of the VOP that just ran -- confirm at the call sites).  Not every
  * VOP has both hooks: e.g. close, open, read, readdir and reclaim only
  * declare a post hook here.
  */
 void	vop_close_post(void *a, int rc);
 void	vop_create_pre(void *a);
 void	vop_create_post(void *a, int rc);
 void	vop_whiteout_pre(void *a);
 void	vop_whiteout_post(void *a, int rc);
 void	vop_deleteextattr_pre(void *a);
 void	vop_deleteextattr_post(void *a, int rc);
 void	vop_link_pre(void *a);
 void	vop_link_post(void *a, int rc);
 void	vop_lookup_post(void *a, int rc);
 void	vop_lookup_pre(void *a);
 void	vop_mkdir_pre(void *a);
 void	vop_mkdir_post(void *a, int rc);
 void	vop_mknod_pre(void *a);
 void	vop_mknod_post(void *a, int rc);
 void	vop_open_post(void *a, int rc);
 void	vop_read_post(void *a, int rc);
 void	vop_read_pgcache_post(void *ap, int rc);
 void	vop_readdir_post(void *a, int rc);
 void	vop_reclaim_post(void *a, int rc);
 void	vop_remove_pre(void *a);
 void	vop_remove_post(void *a, int rc);
 void	vop_rename_post(void *a, int rc);
 void	vop_rename_pre(void *a);
 void	vop_rmdir_pre(void *a);
 void	vop_rmdir_post(void *a, int rc);
 void	vop_setattr_pre(void *a);
 void	vop_setattr_post(void *a, int rc);
 void	vop_setacl_pre(void *a);
 void	vop_setacl_post(void *a, int rc);
 void	vop_setextattr_pre(void *a);
 void	vop_setextattr_post(void *a, int rc);
 void	vop_symlink_pre(void *a);
 void	vop_symlink_post(void *a, int rc);
 /* Takes a vop_vector and generic VOP arguments; implemented elsewhere. */
 int	vop_sigdefer(struct vop_vector *vop, struct vop_generic_args *a);
 
 /*
  * Debug pre/post hooks.  With DEBUG_VFS_LOCKS defined they are real
  * functions implemented elsewhere; otherwise each one is a macro expanding
  * to an empty do { } while (0) statement, so call sites compile unchanged
  * and the arguments are not evaluated.
  */
 #ifdef DEBUG_VFS_LOCKS
 void	vop_fdatasync_debugpre(void *a);
 void	vop_fdatasync_debugpost(void *a, int rc);
 void	vop_fplookup_vexec_debugpre(void *a);
 void	vop_fplookup_vexec_debugpost(void *a, int rc);
 void	vop_fplookup_symlink_debugpre(void *a);
 void	vop_fplookup_symlink_debugpost(void *a, int rc);
 void	vop_fsync_debugpre(void *a);
 void	vop_fsync_debugpost(void *a, int rc);
 void	vop_strategy_debugpre(void *a);
 void	vop_lock_debugpre(void *a);
 void	vop_lock_debugpost(void *a, int rc);
 void	vop_unlock_debugpre(void *a);
 void	vop_need_inactive_debugpre(void *a);
 void	vop_need_inactive_debugpost(void *a, int rc);
 void	vop_mkdir_debugpost(void *a, int rc);
 #else
 #define	vop_fdatasync_debugpre(x)		do { } while (0)
 #define	vop_fdatasync_debugpost(x, y)		do { } while (0)
 #define	vop_fplookup_vexec_debugpre(x)		do { } while (0)
 #define	vop_fplookup_vexec_debugpost(x, y)	do { } while (0)
 #define	vop_fplookup_symlink_debugpre(x)	do { } while (0)
 #define	vop_fplookup_symlink_debugpost(x, y)	do { } while (0)
 #define	vop_fsync_debugpre(x)			do { } while (0)
 #define	vop_fsync_debugpost(x, y)		do { } while (0)
 #define	vop_strategy_debugpre(x)		do { } while (0)
 #define	vop_lock_debugpre(x)			do { } while (0)
 #define	vop_lock_debugpost(x, y)		do { } while (0)
 #define	vop_unlock_debugpre(x)			do { } while (0)
 #define	vop_need_inactive_debugpre(x)		do { } while (0)
 #define	vop_need_inactive_debugpost(x, y)	do { } while (0)
 #define	vop_mkdir_debugpost(x, y)		do { } while (0)
 #endif
 
 /*
  * Takes the VOP_RENAME argument structure.
  * NOTE(review): presumably the common cleanup for a rename that fails
  * before reaching the filesystem -- confirm in the implementation.
  */
 void	vop_rename_fail(struct vop_rename_args *ap);
 
 /*
  * Common prologue for VOP_STAT.
  *
  * Passes the vnode to AUDIT_ARG_VNODE1(), runs mac_vnode_check_stat() with
  * the active and file credentials and, when that check returns 0, zeroes
  * the st_padding0, st_padding1 and st_spare fields of the caller's stat
  * buffer.  The statement expression evaluates to the MAC check's result.
  *
  * The macro argument is captured once in _ap and only _ap is dereferenced
  * afterwards, so an argument with side effects or containing
  * low-precedence operators is expanded exactly once and safely.
  */
 #define	vop_stat_helper_pre(ap)	({						\
 	struct vop_stat_args *_ap = (ap);					\
 	int _error;								\
 	AUDIT_ARG_VNODE1(_ap->a_vp);						\
 	_error = mac_vnode_check_stat(_ap->a_active_cred, _ap->a_file_cred, _ap->a_vp);\
 	if (__predict_true(_error == 0)) {					\
 		_ap->a_sb->st_padding0 = 0;					\
 		_ap->a_sb->st_padding1 = 0;					\
 		bzero(_ap->a_sb->st_spare, sizeof(_ap->a_sb->st_spare));	\
 	}									\
 	_error;									\
 })
 
 /*
  * Common epilogue for VOP_STAT.
  *
  * If priv_check_cred_vfs_generation() returns non-zero for the active
  * credential, the st_gen field of the stat buffer is cleared.  The check is
  * made regardless of the error value passed in, and the statement
  * expression evaluates to that same error value unchanged.
  */
 #define	vop_stat_helper_post(ap, error)	({					\
 	struct vop_stat_args *_sargs = (ap);					\
 	int _rc = (error);							\
 										\
 	if (priv_check_cred_vfs_generation(_sargs->a_active_cred)) {		\
 		_sargs->a_sb->st_gen = 0;					\
 	}									\
 	_rc;									\
 })
 
 /*
  * Prologue for a write VOP.
  *
  * This is NOT a self-contained statement: it declares the locals va, error,
  * osize, ooffset and noffset directly in the enclosing scope (they are
  * consumed later by VOP_WRITE_POST) and may execute `return (error)' from
  * the enclosing function.  It therefore has to be used at a point where
  * declarations are allowed, in a function that can return an int error.
  *
  * The attribute lookup is skipped entirely when the vnode's knote list is
  * empty (VN_KNLIST_EMPTY), in which case the offsets and size stay 0.
  */
 #define	VOP_WRITE_PRE(ap)						\
 	struct vattr va;						\
 	int error;							\
 	off_t osize, ooffset, noffset;					\
 									\
 	osize = ooffset = noffset = 0;					\
 	if (!VN_KNLIST_EMPTY((ap)->a_vp)) {				\
 		error = VOP_GETATTR((ap)->a_vp, &va, (ap)->a_cred);	\
 		if (error)						\
 			return (error);					\
 		ooffset = (ap)->a_uio->uio_offset;			\
 		osize = (off_t)va.va_size;				\
 	}
 
 /*
  * Epilogue for a write VOP; relies on the locals noffset, ooffset and osize
  * declared by VOP_WRITE_PRE in the same scope.
  *
  * If the uio offset advanced past the offset recorded before the write and
  * the vnode has knotes attached, posts NOTE_WRITE, adding NOTE_EXTEND when
  * the new offset is beyond the pre-write file size.  The ret argument is
  * not referenced by the expansion.
  */
 #define VOP_WRITE_POST(ap, ret)						\
 	noffset = (ap)->a_uio->uio_offset;				\
 	if (noffset > ooffset && !VN_KNLIST_EMPTY((ap)->a_vp)) {	\
 		VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_WRITE			\
 		    | (noffset > osize ? NOTE_EXTEND : 0));		\
 	}
 
 /* Calls VOP_LOCK1() with the call site's __FILE__ and __LINE__ appended. */
 #define VOP_LOCK(vp, flags) VOP_LOCK1(vp, flags, __FILE__, __LINE__)
 
 /*
  * "Checked" variants of VOP_ADD_WRITECOUNT, VOP_SET_TEXT and
  * VOP_UNSET_TEXT for callers that expect the operation to succeed.
  *
  * With INVARIANTS the return value is asserted to be 0 through VNASSERT();
  * without it the macros reduce to the plain VOP call and the result is left
  * to the caller.  Note the two forms differ syntactically: the INVARIANTS
  * versions are do { } while (0) statements and yield no value, so portable
  * callers must use these macros as statements only.
  */
 #ifdef INVARIANTS
 #define	VOP_ADD_WRITECOUNT_CHECKED(vp, cnt)				\
 do {									\
 	int error_;							\
 									\
 	error_ = VOP_ADD_WRITECOUNT((vp), (cnt));			\
 	VNASSERT(error_ == 0, (vp), ("VOP_ADD_WRITECOUNT returned %d",	\
 	    error_));							\
 } while (0)
 #define	VOP_SET_TEXT_CHECKED(vp)					\
 do {									\
 	int error_;							\
 									\
 	error_ = VOP_SET_TEXT((vp));					\
 	VNASSERT(error_ == 0, (vp), ("VOP_SET_TEXT returned %d",	\
 	    error_));							\
 } while (0)
 #define	VOP_UNSET_TEXT_CHECKED(vp)					\
 do {									\
 	int error_;							\
 									\
 	error_ = VOP_UNSET_TEXT((vp));					\
 	VNASSERT(error_ == 0, (vp), ("VOP_UNSET_TEXT returned %d",	\
 	    error_));							\
 } while (0)
 #else
 #define	VOP_ADD_WRITECOUNT_CHECKED(vp, cnt)	VOP_ADD_WRITECOUNT((vp), (cnt))
 #define	VOP_SET_TEXT_CHECKED(vp)		VOP_SET_TEXT((vp))
 #define	VOP_UNSET_TEXT_CHECKED(vp)		VOP_UNSET_TEXT((vp))
 #endif
 
 /*
  * True when the VIRF_DOOMED bit is set in the vnode's v_irflag (read via
  * vn_irflag_read()); wrapped in __predict_false as a branch hint that the
  * doomed case is the unlikely one.
  */
 #define	VN_IS_DOOMED(vp)	__predict_false((vn_irflag_read(vp) & VIRF_DOOMED) != 0)
 
 /*
  * Vnode reference management entry points (implemented elsewhere).
  * NOTE(review): presumably vref/vrefact acquire a use reference and
  * vrele/vput release one -- confirm against the implementation for the
  * locking requirements of each.
  */
 void	vput(struct vnode *vp);
 void	vrele(struct vnode *vp);
 void	vref(struct vnode *vp);
 void	vrefact(struct vnode *vp);
 void 	v_addpollinfo(struct vnode *vp);
 /*
  * Return the vnode's use count, i.e. the current value of v_usecount.
  * This is a single plain read of the field; no lock is taken here.
  */
 static __inline int
 vrefcnt(struct vnode *vp)
 {
 	int usecount;
 
 	usecount = vp->v_usecount;
 	return (usecount);
 }
 
 /*
  * vhold() for callers that hold the vnode interlock: asserts the interlock
  * via ASSERT_VI_LOCKED() and then calls vhold().  vp is expanded twice, so
  * the argument must not have side effects.
  */
 #define	vholdl(vp)	do {						\
 	ASSERT_VI_LOCKED(vp, __func__);					\
 	vhold(vp);							\
 } while (0)
 
 /*
  * vref() for callers that hold the vnode interlock: asserts the interlock
  * via ASSERT_VI_LOCKED() and then calls vref().  vp is expanded twice, so
  * the argument must not have side effects.
  */
 #define	vrefl(vp)	do {						\
 	ASSERT_VI_LOCKED(vp, __func__);					\
 	vref(vp);							\
 } while (0)
 
 /*
  * Create/destroy pair for a vnode's "vobject" (implemented elsewhere).
  * NOTE(review): presumably this is the VM object backing the vnode, with
  * size being its initial size -- confirm in the implementation.
  */
 int vnode_create_vobject(struct vnode *vp, off_t size, struct thread *td);
 void vnode_destroy_vobject(struct vnode *vp);
 
 /* VOP vectors defined elsewhere in the kernel and shared across VFS code. */
 extern struct vop_vector fifo_specops;
 extern struct vop_vector dead_vnodeops;
 extern struct vop_vector default_vnodeops;
 
 /*
  * The generic vop_*() handlers converted to void * so they can be stored
  * in slots of differing function-pointer type.  The conversion goes
  * through uintptr_t rather than casting the function pointer to void *
  * directly.
  */
 #define VOP_PANIC	((void*)(uintptr_t)vop_panic)
 #define VOP_NULL	((void*)(uintptr_t)vop_null)
 #define VOP_EBADF	((void*)(uintptr_t)vop_ebadf)
 #define VOP_ENOTTY	((void*)(uintptr_t)vop_enotty)
 #define VOP_EINVAL	((void*)(uintptr_t)vop_einval)
 #define VOP_ENOENT	((void*)(uintptr_t)vop_enoent)
 #define VOP_EOPNOTSUPP	((void*)(uintptr_t)vop_eopnotsupp)
 #define VOP_EAGAIN	((void*)(uintptr_t)vop_eagain)
 
 /* fifo_vnops.c */
 /* NOTE(review): presumably prints FIFO state for the given vnode; confirm. */
 int	fifo_printinfo(struct vnode *);
 
 /* vfs_hash.c */
 /*
  * Comparison callback type.  vfs_hash_get(), vfs_hash_insert() and
  * vfs_hash_ref() each accept one of these as fn together with an opaque
  * arg pointer, and return the resulting vnode through vpp.
  * NOTE(review): the callback's return convention (0 == match?) is not
  * visible here -- confirm in vfs_hash.c.
  */
 typedef int vfs_hash_cmp_t(struct vnode *vp, void *arg);
 
 void vfs_hash_changesize(u_long newhashsize);
 int vfs_hash_get(const struct mount *mp, u_int hash, int flags,
     struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg);
 u_int vfs_hash_index(struct vnode *vp);
 int vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td,
     struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg);
 void vfs_hash_ref(const struct mount *mp, u_int hash, struct thread *td,
     struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg);
 void vfs_hash_rehash(struct vnode *vp, u_int hash);
 void vfs_hash_remove(struct vnode *vp);
 
 /* Miscellaneous VFS helpers (implemented elsewhere). */
 int vfs_kqfilter(struct vop_kqfilter_args *);
 /* Forward declaration so the prototype below needs no dirent header. */
 struct dirent;
 int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off);
 int vfs_emptydir(struct vnode *vp);
 
 /* Takes the access mode by pointer, i.e. it may be rewritten in place. */
 int vfs_unixify_accmode(accmode_t *accmode);
 
 void vfs_unp_reclaim(struct vnode *vp);
 
 /*
  * Mode/ownership helpers: setfmode()/setfown() operate on a vnode, while
  * vn_chmod()/vn_chown() take a struct file.
  */
 int setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode);
 int setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid,
     gid_t gid);
 int vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td);
 int vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td);
 /*
  * Both return the size through *size.
  * NOTE(review): the _locked variant presumably requires the vnode lock to
  * be held by the caller -- confirm in the implementation.
  */
 int vn_getsize_locked(struct vnode *vp, off_t *size, struct ucred *active_cred);
 int vn_getsize(struct vnode *vp, off_t *size, struct ucred *active_cred);
 
 void vn_fsid(struct vnode *vp, struct vattr *va);
 
 int vn_dir_check_exec(struct vnode *vp, struct componentname *cnp);
 int vn_lktype_write(struct mount *mp, struct vnode *vp);
 
 /* Validation hook; only declared (and only called) in INVARIANTS kernels. */
 #ifdef INVARIANTS
 void vn_set_state_validate(struct vnode *vp, enum vstate state);
 #endif
 
 /*
  * Store a new state in vp->v_state.  In INVARIANTS kernels the transition
  * is first handed to vn_set_state_validate(), which runs before the store
  * and therefore still sees the old state in the vnode.
  */
 static inline void
 vn_set_state(struct vnode *vp, enum vstate state)
 {
 #ifdef INVARIANTS
 	vn_set_state_validate(vp, state);
 #endif
 	vp->v_state = state;
 }
 
 static inline enum vstate
 vn_get_state(struct vnode *vp)
 {
 	return (vp->v_state);
 }
 
 /*
  * Unlock a vnode while honouring lock-style flags.
  *
  * Only LK_INTERLOCK and LK_RELEASE may be set in flags; any other bit
  * panics.  VOP_UNLOCK() is called first and, when LK_INTERLOCK was passed,
  * the vnode interlock is then dropped with VI_UNLOCK().  The statement
  * expression evaluates to the VOP_UNLOCK() result.
  *
  * Both macro arguments are evaluated exactly once: the panic message
  * reports the captured _flags value rather than re-expanding the argument,
  * so the value printed is always the one that was tested.
  */
 #define VOP_UNLOCK_FLAGS(vp, flags)	({				\
 	struct vnode *_vp = (vp);					\
 	int _flags = (flags);						\
 	int _error;							\
 									\
 	if ((_flags & ~(LK_INTERLOCK | LK_RELEASE)) != 0)		\
 		panic("%s: unsupported flags %x\n", __func__, _flags);	\
 	_error = VOP_UNLOCK(_vp);					\
 	if (_flags & LK_INTERLOCK)					\
 		VI_UNLOCK(_vp);						\
 	_error;								\
 })
 
 #include <sys/kernel.h>
 
 /*
  * Emit a SYSINIT (subsystem SI_SUB_VFS, order SI_ORDER_ANY) that calls
  * vfs_vector_op_register() with the address of the given vop_vector.  The
  * SYSINIT's name is built by token-pasting the vector's identifier, so the
  * argument must be a plain identifier.
  */
 #define VFS_VOP_VECTOR_REGISTER(vnodeops) \
 	SYSINIT(vfs_vector_##vnodeops##_f, SI_SUB_VFS, SI_ORDER_ANY, \
 	    vfs_vector_op_register, &vnodeops)
 
 /*
  * VFS-wide SMR state.  VFS_SMR_DECLARE expands to the extern declaration of
  * vfs_smr (without a trailing semicolon -- the user supplies it); the
  * remaining macros are thin wrappers that pass VFS_SMR() to the
  * corresponding smr_*(), SMR_ASSERT_*() or uma_zone_set_smr() primitive.
  */
 #define VFS_SMR_DECLARE				\
 	extern smr_t vfs_smr
 
 #define VFS_SMR()	vfs_smr
 #define vfs_smr_enter()	smr_enter(VFS_SMR())
 #define vfs_smr_exit()	smr_exit(VFS_SMR())
 #define vfs_smr_synchronize()	smr_synchronize(VFS_SMR())
 #define vfs_smr_entered_load(ptr)	smr_entered_load((ptr), VFS_SMR())
 #define VFS_SMR_ASSERT_ENTERED()	SMR_ASSERT_ENTERED(VFS_SMR())
 #define VFS_SMR_ASSERT_NOT_ENTERED()	SMR_ASSERT_NOT_ENTERED(VFS_SMR())
 #define VFS_SMR_ZONE_SET(zone)	uma_zone_set_smr((zone), VFS_SMR())
 
 /*
  * Load the vnode's v_data pointer from within a VFS SMR section.
  *
  * Asserts that the section has been entered (VFS_SMR_ASSERT_ENTERED) and
  * then reads v_data with atomic_load_consume_ptr(); the statement
  * expression evaluates to the loaded value.  The argument is evaluated
  * exactly once.
  */
 #define vn_load_v_data_smr(vp)	({			\
 	struct vnode *_vnode = (vp);			\
 							\
 	VFS_SMR_ASSERT_ENTERED();			\
 	atomic_load_consume_ptr(&_vnode->v_data);	\
 })
 
 #endif /* _KERNEL */
 
 #endif /* !_SYS_VNODE_H_ */